xref: /freebsd/sys/geom/journal/g_journal.c (revision d3d381b2b194b4d24853e92eecef55f262688d1a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/bio.h>
40 #include <sys/sysctl.h>
41 #include <sys/malloc.h>
42 #include <sys/mount.h>
43 #include <sys/eventhandler.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <sys/sched.h>
47 #include <sys/taskqueue.h>
48 #include <sys/vnode.h>
49 #include <sys/sbuf.h>
50 #ifdef GJ_MEMDEBUG
51 #include <sys/stack.h>
52 #include <sys/kdb.h>
53 #endif
54 #include <vm/vm.h>
55 #include <vm/vm_kern.h>
56 #include <geom/geom.h>
57 
58 #include <geom/journal/g_journal.h>
59 
60 FEATURE(geom_journal, "GEOM journaling support");
61 
62 /*
63  * On-disk journal format:
64  *
65  * JH - Journal header
66  * RH - Record header
67  *
68  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
69  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
70  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
71  *
72  */
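/*
 * A note on how the pieces above chain together (as used by the code in
 * this file): each journal header stores the current journal ID and the ID
 * expected in the next journal header, while every record header repeats
 * the journal ID.  This is what g_journal_sync() below relies on to walk
 * the records and to recognize a properly terminated journal.
 */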
73 
74 CTASSERT(sizeof(struct g_journal_header) <= 512);
75 CTASSERT(sizeof(struct g_journal_record_header) <= 512);
76 
77 static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
78 static struct mtx g_journal_cache_mtx;
79 MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
80 
81 const struct g_journal_desc *g_journal_filesystems[] = {
82 	&g_journal_ufs,
83 	NULL
84 };
85 
86 SYSCTL_DECL(_kern_geom);
87 
88 int g_journal_debug = 0;
89 static u_int g_journal_switch_time = 10;
90 static u_int g_journal_force_switch = 70;
91 static u_int g_journal_parallel_flushes = 16;
92 static u_int g_journal_parallel_copies = 16;
93 static u_int g_journal_accept_immediately = 64;
94 static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
95 static u_int g_journal_do_optimize = 1;
96 
97 static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0,
98     "GEOM_JOURNAL stuff");
99 SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
100     "Debug level");
101 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
102     &g_journal_switch_time, 0, "Switch journals every N seconds");
103 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
104     &g_journal_force_switch, 0, "Force switch when journal is N% full");
105 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
106     &g_journal_parallel_flushes, 0,
107     "Number of flush I/O requests to send in parallel");
108 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
109     &g_journal_accept_immediately, 0,
110     "Number of I/O requests accepted immediately");
111 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
112     &g_journal_parallel_copies, 0,
113     "Number of copy I/O requests to send in parallel");
114 static int
115 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
116 {
117 	u_int entries;
118 	int error;
119 
120 	entries = g_journal_record_entries;
121 	error = sysctl_handle_int(oidp, &entries, 0, req);
122 	if (error != 0 || req->newptr == NULL)
123 		return (error);
124 	if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
125 		return (EINVAL);
126 	g_journal_record_entries = entries;
127 	return (0);
128 }
129 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
130     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
131     "Maximum number of entries in one journal record");
132 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
133     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
134 
135 static u_long g_journal_cache_used = 0;
136 static u_long g_journal_cache_limit = 64 * 1024 * 1024;
137 static u_int g_journal_cache_divisor = 2;
138 static u_int g_journal_cache_switch = 90;
139 static u_int g_journal_cache_misses = 0;
140 static u_int g_journal_cache_alloc_failures = 0;
141 static u_long g_journal_cache_low = 0;
142 
143 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
144     "GEOM_JOURNAL cache");
145 SYSCTL_ULONG(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
146     &g_journal_cache_used, 0, "Number of allocated bytes");
147 static int
148 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
149 {
150 	u_long limit;
151 	int error;
152 
153 	limit = g_journal_cache_limit;
154 	error = sysctl_handle_long(oidp, &limit, 0, req);
155 	if (error != 0 || req->newptr == NULL)
156 		return (error);
157 	g_journal_cache_limit = limit;
158 	g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
159 	return (0);
160 }
161 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
162     CTLTYPE_ULONG | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I",
163     "Maximum number of allocated bytes");
164 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
165     &g_journal_cache_divisor, 0,
166     "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
167 static int
168 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
169 {
170 	u_int cswitch;
171 	int error;
172 
173 	cswitch = g_journal_cache_switch;
174 	error = sysctl_handle_int(oidp, &cswitch, 0, req);
175 	if (error != 0 || req->newptr == NULL)
176 		return (error);
177 	if (cswitch > 100)
178 		return (EINVAL);
179 	g_journal_cache_switch = cswitch;
180 	g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
181 	return (0);
182 }
183 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
184     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
185     "Force switch when we hit this percent of cache use");
186 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
187     &g_journal_cache_misses, 0, "Number of cache misses");
188 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
189     &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
190 
191 static u_long g_journal_stats_bytes_skipped = 0;
192 static u_long g_journal_stats_combined_ios = 0;
193 static u_long g_journal_stats_switches = 0;
194 static u_long g_journal_stats_wait_for_copy = 0;
195 static u_long g_journal_stats_journal_full = 0;
196 static u_long g_journal_stats_low_mem = 0;
197 
198 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
199     "GEOM_JOURNAL statistics");
200 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
201     &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
202 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
203     &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
204 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
205     &g_journal_stats_switches, 0, "Number of journal switches");
206 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
207     &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
208 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
209     &g_journal_stats_journal_full, 0,
210     "Number of times journal was almost full.");
211 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
212     &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
213 
214 static g_taste_t g_journal_taste;
215 static g_ctl_req_t g_journal_config;
216 static g_dumpconf_t g_journal_dumpconf;
217 static g_init_t g_journal_init;
218 static g_fini_t g_journal_fini;
219 
220 struct g_class g_journal_class = {
221 	.name = G_JOURNAL_CLASS_NAME,
222 	.version = G_VERSION,
223 	.taste = g_journal_taste,
224 	.ctlreq = g_journal_config,
225 	.dumpconf = g_journal_dumpconf,
226 	.init = g_journal_init,
227 	.fini = g_journal_fini
228 };
229 
230 static int g_journal_destroy(struct g_journal_softc *sc);
231 static void g_journal_metadata_update(struct g_journal_softc *sc);
232 static void g_journal_start_switcher(struct g_class *mp);
233 static void g_journal_stop_switcher(void);
234 static void g_journal_switch_wait(struct g_journal_softc *sc);
235 
236 #define	GJ_SWITCHER_WORKING	0
237 #define	GJ_SWITCHER_DIE		1
238 #define	GJ_SWITCHER_DIED	2
239 static struct proc *g_journal_switcher_proc = NULL;
240 static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
241 static int g_journal_switcher_wokenup = 0;
242 static int g_journal_sync_requested = 0;
243 
244 #ifdef GJ_MEMDEBUG
245 struct meminfo {
246 	size_t		mi_size;
247 	struct stack	mi_stack;
248 };
249 #endif
250 
251 /*
252  * We use our own malloc/realloc/free functions, so we can collect statistics
253  * and force a journal switch when we're running out of cache.
254  */
255 static void *
256 gj_malloc(size_t size, int flags)
257 {
258 	void *p;
259 #ifdef GJ_MEMDEBUG
260 	struct meminfo *mi;
261 #endif
262 
263 	mtx_lock(&g_journal_cache_mtx);
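	/*
	 * Two thresholds are checked under the cache mutex: crossing
	 * g_journal_cache_low wakes up the switcher thread so journaled
	 * data starts being copied out, and crossing g_journal_cache_limit
	 * makes M_NOWAIT allocations fail outright (sleeping allocations
	 * proceed regardless, as M_NOWAIT is stripped below).
	 */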
264 	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
265 	    g_journal_cache_used + size > g_journal_cache_low) {
266 		GJ_DEBUG(1, "No cache, waking up the switcher.");
267 		g_journal_switcher_wokenup = 1;
268 		wakeup(&g_journal_switcher_state);
269 	}
270 	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
271 	    g_journal_cache_used + size > g_journal_cache_limit) {
272 		mtx_unlock(&g_journal_cache_mtx);
273 		g_journal_cache_alloc_failures++;
274 		return (NULL);
275 	}
276 	g_journal_cache_used += size;
277 	mtx_unlock(&g_journal_cache_mtx);
278 	flags &= ~M_NOWAIT;
279 #ifndef GJ_MEMDEBUG
280 	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
281 #else
282 	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
283 	p = (u_char *)mi + sizeof(*mi);
284 	mi->mi_size = size;
285 	stack_save(&mi->mi_stack);
286 #endif
287 	return (p);
288 }
289 
290 static void
291 gj_free(void *p, size_t size)
292 {
293 #ifdef GJ_MEMDEBUG
294 	struct meminfo *mi;
295 #endif
296 
297 	KASSERT(p != NULL, ("p=NULL"));
298 	KASSERT(size > 0, ("size=0"));
299 	mtx_lock(&g_journal_cache_mtx);
300 	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
301 	g_journal_cache_used -= size;
302 	mtx_unlock(&g_journal_cache_mtx);
303 #ifdef GJ_MEMDEBUG
304 	mi = p = (void *)((u_char *)p - sizeof(*mi));
305 	if (mi->mi_size != size) {
306 		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
307 		    mi->mi_size);
308 		printf("GJOURNAL: Alloc backtrace:\n");
309 		stack_print(&mi->mi_stack);
310 		printf("GJOURNAL: Free backtrace:\n");
311 		kdb_backtrace();
312 	}
313 #endif
314 	free(p, M_JOURNAL);
315 }
316 
317 static void *
318 gj_realloc(void *p, size_t size, size_t oldsize)
319 {
320 	void *np;
321 
322 #ifndef GJ_MEMDEBUG
323 	mtx_lock(&g_journal_cache_mtx);
324 	g_journal_cache_used -= oldsize;
325 	g_journal_cache_used += size;
326 	mtx_unlock(&g_journal_cache_mtx);
327 	np = realloc(p, size, M_JOURNAL, M_WAITOK);
328 #else
329 	np = gj_malloc(size, M_WAITOK);
330 	bcopy(p, np, MIN(oldsize, size));
331 	gj_free(p, oldsize);
332 #endif
333 	return (np);
334 }
335 
336 static void
337 g_journal_check_overflow(struct g_journal_softc *sc)
338 {
339 	off_t length, used;
340 
341 	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
342 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
343 	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
344 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
345 	     sc->sc_journal_offset < sc->sc_active.jj_offset)) {
346 		panic("Journal overflow "
347 		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
348 		    (unsigned)sc->sc_id,
349 		    (intmax_t)sc->sc_journal_offset,
350 		    (intmax_t)sc->sc_active.jj_offset,
351 		    (intmax_t)sc->sc_inactive.jj_offset);
352 	}
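	/*
	 * Compute the size of the active journal area and how much of it is
	 * used.  When the active journal starts below the inactive one, the
	 * area is the linear range [active, inactive); otherwise it wraps at
	 * sc_jend back to sc_jstart:
	 *
	 *	[active ... jend) + [jstart ... inactive)
	 */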
353 	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
354 		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
355 		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
356 	} else {
357 		length = sc->sc_jend - sc->sc_active.jj_offset;
358 		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
359 		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
360 			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
361 		else {
362 			used = sc->sc_jend - sc->sc_active.jj_offset;
363 			used += sc->sc_journal_offset - sc->sc_jstart;
364 		}
365 	}
366 	/* Already woken up? */
367 	if (g_journal_switcher_wokenup)
368 		return;
369 	/*
370 	 * If the active journal takes more than g_journal_force_switch percent
371 	 * of the free journal space, we force a journal switch.
372 	 */
373 	KASSERT(length > 0,
374 	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
375 	    (intmax_t)length, (intmax_t)used,
376 	    (intmax_t)sc->sc_active.jj_offset,
377 	    (intmax_t)sc->sc_inactive.jj_offset,
378 	    (intmax_t)sc->sc_journal_offset));
379 	if ((used * 100) / length > g_journal_force_switch) {
380 		g_journal_stats_journal_full++;
381 		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
382 		    sc->sc_name, (used * 100) / length);
383 		mtx_lock(&g_journal_cache_mtx);
384 		g_journal_switcher_wokenup = 1;
385 		wakeup(&g_journal_switcher_state);
386 		mtx_unlock(&g_journal_cache_mtx);
387 	}
388 }
389 
390 static void
391 g_journal_orphan(struct g_consumer *cp)
392 {
393 	struct g_journal_softc *sc;
394 	char name[256];
395 	int error;
396 
397 	g_topology_assert();
398 	sc = cp->geom->softc;
399 	strlcpy(name, cp->provider->name, sizeof(name));
400 	GJ_DEBUG(0, "Lost provider %s.", name);
401 	if (sc == NULL)
402 		return;
403 	error = g_journal_destroy(sc);
404 	if (error == 0)
405 		GJ_DEBUG(0, "Journal %s destroyed.", name);
406 	else {
407 		GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
408 		    "Destroy it manually after last close.", sc->sc_name,
409 		    error);
410 	}
411 }
412 
413 static int
414 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
415 {
416 	struct g_journal_softc *sc;
417 	int dcr, dcw, dce;
418 
419 	g_topology_assert();
420 	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
421 	    acr, acw, ace);
422 
423 	dcr = pp->acr + acr;
424 	dcw = pp->acw + acw;
425 	dce = pp->ace + ace;
426 
427 	sc = pp->geom->softc;
428 	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
429 		if (acr <= 0 && acw <= 0 && ace <= 0)
430 			return (0);
431 		else
432 			return (ENXIO);
433 	}
434 	if (pp->acw == 0 && dcw > 0) {
435 		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
436 		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
437 		g_topology_unlock();
438 		g_journal_metadata_update(sc);
439 		g_topology_lock();
440 	} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
441 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
442 		sc->sc_flags |= GJF_DEVICE_CLEAN;
443 		g_topology_unlock();
444 		g_journal_metadata_update(sc);
445 		g_topology_lock();
446 	} */
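	/*
	 * A note on the disabled branch above: its condition repeats the
	 * live one, so presumably it was meant to fire on last write close
	 * (pp->acw > 0 && dcw == 0) before marking the device clean again.
	 */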
447 	return (0);
448 }
449 
450 static void
451 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
452 {
453 
454 	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
455 	data += sizeof(GJ_HEADER_MAGIC);
456 	le32enc(data, hdr->jh_journal_id);
457 	data += 4;
458 	le32enc(data, hdr->jh_journal_next_id);
459 }
460 
461 static int
462 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
463 {
464 
465 	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
466 	data += sizeof(hdr->jh_magic);
467 	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
468 		return (EINVAL);
469 	hdr->jh_journal_id = le32dec(data);
470 	data += 4;
471 	hdr->jh_journal_next_id = le32dec(data);
472 	return (0);
473 }
474 
475 static void
476 g_journal_flush_cache(struct g_journal_softc *sc)
477 {
478 	struct bintime bt;
479 	int error;
480 
481 	if (sc->sc_bio_flush == 0)
482 		return;
483 	GJ_TIMER_START(1, &bt);
484 	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
485 		error = g_io_flush(sc->sc_jconsumer);
486 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
487 		    sc->sc_jconsumer->provider->name, error);
488 	}
489 	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
490 		/*
491 		 * TODO: This could be called in parallel with the
492 		 *       previous call.
493 		 */
494 		error = g_io_flush(sc->sc_dconsumer);
495 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
496 		    sc->sc_dconsumer->provider->name, error);
497 	}
498 	GJ_TIMER_STOP(1, &bt, "Cache flush time");
499 }
500 
501 static int
502 g_journal_write_header(struct g_journal_softc *sc)
503 {
504 	struct g_journal_header hdr;
505 	struct g_consumer *cp;
506 	u_char *buf;
507 	int error;
508 
509 	cp = sc->sc_jconsumer;
510 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
511 
512 	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
513 	hdr.jh_journal_id = sc->sc_journal_id;
514 	hdr.jh_journal_next_id = sc->sc_journal_next_id;
515 	g_journal_header_encode(&hdr, buf);
516 	error = g_write_data(cp, sc->sc_journal_offset, buf,
517 	    cp->provider->sectorsize);
518 	/* if (error == 0) */
519 	sc->sc_journal_offset += cp->provider->sectorsize;
520 
521 	gj_free(buf, cp->provider->sectorsize);
522 	return (error);
523 }
524 
525 /*
526  * Every journal record has a header and data following it.
527  * The functions below are used to encode the header to little endian before
528  * storing it, and to decode it back to system endianness after reading it.
529  */
530 static void
531 g_journal_record_header_encode(struct g_journal_record_header *hdr,
532     u_char *data)
533 {
534 	struct g_journal_entry *ent;
535 	u_int i;
536 
537 	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
538 	data += sizeof(GJ_RECORD_HEADER_MAGIC);
539 	le32enc(data, hdr->jrh_journal_id);
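	/*
	 * Only 4 bytes are encoded here, yet the cursor advances by 8;
	 * the remaining 4 bytes appear to be reserved padding (the decoder
	 * below skips the same 4 bytes).
	 */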
540 	data += 8;
541 	le16enc(data, hdr->jrh_nentries);
542 	data += 2;
543 	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
544 	data += 8;
545 	for (i = 0; i < hdr->jrh_nentries; i++) {
546 		ent = &hdr->jrh_entries[i];
547 		le64enc(data, ent->je_joffset);
548 		data += 8;
549 		le64enc(data, ent->je_offset);
550 		data += 8;
551 		le64enc(data, ent->je_length);
552 		data += 8;
553 	}
554 }
555 
556 static int
557 g_journal_record_header_decode(const u_char *data,
558     struct g_journal_record_header *hdr)
559 {
560 	struct g_journal_entry *ent;
561 	u_int i;
562 
563 	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
564 	data += sizeof(hdr->jrh_magic);
565 	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
566 		return (EINVAL);
567 	hdr->jrh_journal_id = le32dec(data);
568 	data += 8;
569 	hdr->jrh_nentries = le16dec(data);
570 	data += 2;
571 	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
572 		return (EINVAL);
573 	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
574 	data += 8;
575 	for (i = 0; i < hdr->jrh_nentries; i++) {
576 		ent = &hdr->jrh_entries[i];
577 		ent->je_joffset = le64dec(data);
578 		data += 8;
579 		ent->je_offset = le64dec(data);
580 		data += 8;
581 		ent->je_length = le64dec(data);
582 		data += 8;
583 	}
584 	return (0);
585 }
586 
587 /*
588  * Function reads metadata from a provider (via the given consumer), decodes
589  * it to system endianness and verifies its correctness.
590  */
591 static int
592 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
593 {
594 	struct g_provider *pp;
595 	u_char *buf;
596 	int error;
597 
598 	g_topology_assert();
599 
600 	error = g_access(cp, 1, 0, 0);
601 	if (error != 0)
602 		return (error);
603 	pp = cp->provider;
604 	g_topology_unlock();
605 	/* Metadata is stored in the last sector. */
606 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
607 	    &error);
608 	g_topology_lock();
609 	g_access(cp, -1, 0, 0);
610 	if (buf == NULL) {
611 		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
612 		    cp->provider->name, error);
613 		return (error);
614 	}
615 
616 	/* Decode metadata. */
617 	error = journal_metadata_decode(buf, md);
618 	g_free(buf);
619 	/* Is this a gjournal provider at all? */
620 	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
621 		return (EINVAL);
622 	/*
623 	 * Are we able to handle this version of metadata?
624 	 * We only maintain backward compatibility.
625 	 */
626 	if (md->md_version > G_JOURNAL_VERSION) {
627 		GJ_DEBUG(0,
628 		    "Kernel module is too old to handle metadata from %s.",
629 		    cp->provider->name);
630 		return (EINVAL);
631 	}
632 	/* Is checksum correct? */
633 	if (error != 0) {
634 		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
635 		    cp->provider->name);
636 		return (error);
637 	}
638 	return (0);
639 }
640 
641 /*
642  * The two functions below are responsible for updating the metadata.
643  * Only metadata on the data provider is updated (we need to update
644  * information about the active journal in there).
645  */
646 static void
647 g_journal_metadata_done(struct bio *bp)
648 {
649 
650 	/*
651 	 * There is not much we can do on error except report it.
652 	 */
653 	if (bp->bio_error != 0) {
654 		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
655 		    bp->bio_error);
656 	} else {
657 		GJ_LOGREQ(2, bp, "Metadata updated.");
658 	}
659 	gj_free(bp->bio_data, bp->bio_length);
660 	g_destroy_bio(bp);
661 }
662 
663 static void
664 g_journal_metadata_update(struct g_journal_softc *sc)
665 {
666 	struct g_journal_metadata md;
667 	struct g_consumer *cp;
668 	struct bio *bp;
669 	u_char *sector;
670 
671 	cp = sc->sc_dconsumer;
672 	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
673 	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
674 	md.md_version = G_JOURNAL_VERSION;
675 	md.md_id = sc->sc_id;
676 	md.md_type = sc->sc_orig_type;
677 	md.md_jstart = sc->sc_jstart;
678 	md.md_jend = sc->sc_jend;
679 	md.md_joffset = sc->sc_inactive.jj_offset;
680 	md.md_jid = sc->sc_journal_previous_id;
681 	md.md_flags = 0;
682 	if (sc->sc_flags & GJF_DEVICE_CLEAN)
683 		md.md_flags |= GJ_FLAG_CLEAN;
684 
685 	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
686 		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
687 	else
688 		bzero(md.md_provider, sizeof(md.md_provider));
689 	md.md_provsize = cp->provider->mediasize;
690 	journal_metadata_encode(&md, sector);
691 
692 	/*
693 	 * Flush the cache, so we know all data are on disk.
694 	 * We write information here like "journal is consistent", so we need
695 	 * to be sure it is. Without BIO_FLUSH here, we could end up in a
696 	 * situation where the metadata is stored on disk, but not all the data.
697 	 */
698 	g_journal_flush_cache(sc);
699 
700 	bp = g_alloc_bio();
701 	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
702 	bp->bio_length = cp->provider->sectorsize;
703 	bp->bio_data = sector;
704 	bp->bio_cmd = BIO_WRITE;
705 	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
706 		bp->bio_done = g_journal_metadata_done;
707 		g_io_request(bp, cp);
708 	} else {
709 		bp->bio_done = NULL;
710 		g_io_request(bp, cp);
711 		biowait(bp, "gjmdu");
712 		g_journal_metadata_done(bp);
713 	}
714 
715 	/*
716 	 * Be sure metadata reached the disk.
717 	 */
718 	g_journal_flush_cache(sc);
719 }
720 
721 /*
722  * This is where I/O requests come in from GEOM.
723  */
724 static void
725 g_journal_start(struct bio *bp)
726 {
727 	struct g_journal_softc *sc;
728 
729 	sc = bp->bio_to->geom->softc;
730 	GJ_LOGREQ(3, bp, "Request received.");
731 
732 	switch (bp->bio_cmd) {
733 	case BIO_READ:
734 	case BIO_WRITE:
735 		mtx_lock(&sc->sc_mtx);
736 		bioq_insert_tail(&sc->sc_regular_queue, bp);
737 		wakeup(sc);
738 		mtx_unlock(&sc->sc_mtx);
739 		return;
740 	case BIO_GETATTR:
741 		if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
742 			strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
743 			bp->bio_completed = strlen(bp->bio_to->name) + 1;
744 			g_io_deliver(bp, 0);
745 			return;
746 		}
747 		/* FALLTHROUGH */
748 	case BIO_DELETE:
749 	default:
750 		g_io_deliver(bp, EOPNOTSUPP);
751 		return;
752 	}
753 }
754 
755 static void
756 g_journal_std_done(struct bio *bp)
757 {
758 	struct g_journal_softc *sc;
759 
760 	sc = bp->bio_from->geom->softc;
761 	mtx_lock(&sc->sc_mtx);
762 	bioq_insert_tail(&sc->sc_back_queue, bp);
763 	wakeup(sc);
764 	mtx_unlock(&sc->sc_mtx);
765 }
766 
767 static struct bio *
768 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
769     int flags)
770 {
771 	struct bio *bp;
772 
773 	bp = g_alloc_bio();
774 	bp->bio_offset = start;
775 	bp->bio_joffset = joffset;
776 	bp->bio_length = end - start;
777 	bp->bio_cmd = BIO_WRITE;
778 	bp->bio_done = g_journal_std_done;
779 	if (data == NULL)
780 		bp->bio_data = NULL;
781 	else {
782 		bp->bio_data = gj_malloc(bp->bio_length, flags);
783 		if (bp->bio_data != NULL)
784 			bcopy(data, bp->bio_data, bp->bio_length);
785 	}
786 	return (bp);
787 }
788 
789 #define	g_journal_insert_bio(head, bp, flags)				\
790 	g_journal_insert((head), (bp)->bio_offset,			\
791 		(bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset,	\
792 		(bp)->bio_data, flags)
793 /*
794  * The function below does a lot more than just insert a bio into the queue.
795  * It keeps the queue sorted by offset and ensures that no data is duplicated
796  * (it combines bios whose ranges overlap).
797  *
798  * The function returns the number of bios inserted (a bio can be split).
799  */
800 static int
801 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
802     u_char *data, int flags)
803 {
804 	struct bio *nbp, *cbp, *pbp;
805 	off_t cstart, cend;
806 	u_char *tmpdata;
807 	int n;
808 
809 	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
810 	    joffset);
811 	n = 0;
812 	pbp = NULL;
813 	GJQ_FOREACH(*head, cbp) {
814 		cstart = cbp->bio_offset;
815 		cend = cbp->bio_offset + cbp->bio_length;
816 
817 		if (nstart >= cend) {
818 			/*
819 			 *  +-------------+
820 			 *  |             |
821 			 *  |   current   |  +-------------+
822 			 *  |     bio     |  |             |
823 			 *  |             |  |     new     |
824 			 *  +-------------+  |     bio     |
825 			 *                   |             |
826 			 *                   +-------------+
827 			 */
828 			GJ_DEBUG(3, "INSERT(%p): 1", *head);
829 		} else if (nend <= cstart) {
830 			/*
831 			 *                   +-------------+
832 			 *                   |             |
833 			 *  +-------------+  |   current   |
834 			 *  |             |  |     bio     |
835 			 *  |     new     |  |             |
836 			 *  |     bio     |  +-------------+
837 			 *  |             |
838 			 *  +-------------+
839 			 */
840 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
841 			    flags);
842 			if (pbp == NULL)
843 				*head = nbp;
844 			else
845 				pbp->bio_next = nbp;
846 			nbp->bio_next = cbp;
847 			n++;
848 			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
849 			    pbp);
850 			goto end;
851 		} else if (nstart <= cstart && nend >= cend) {
852 			/*
853 			 *      +-------------+      +-------------+
854 			 *      | current bio |      | current bio |
855 			 *  +---+-------------+---+  +-------------+---+
856 			 *  |   |             |   |  |             |   |
857 			 *  |   |             |   |  |             |   |
858 			 *  |   +-------------+   |  +-------------+   |
859 			 *  |       new bio       |  |     new bio     |
860 			 *  +---------------------+  +-----------------+
861 			 *
862 			 *      +-------------+  +-------------+
863 			 *      | current bio |  | current bio |
864 			 *  +---+-------------+  +-------------+
865 			 *  |   |             |  |             |
866 			 *  |   |             |  |             |
867 			 *  |   +-------------+  +-------------+
868 			 *  |     new bio     |  |   new bio   |
869 			 *  +-----------------+  +-------------+
870 			 */
871 			g_journal_stats_bytes_skipped += cbp->bio_length;
872 			cbp->bio_offset = nstart;
873 			cbp->bio_joffset = joffset;
874 			cbp->bio_length = cend - nstart;
875 			if (cbp->bio_data != NULL) {
876 				gj_free(cbp->bio_data, cend - cstart);
877 				cbp->bio_data = NULL;
878 			}
879 			if (data != NULL) {
880 				cbp->bio_data = gj_malloc(cbp->bio_length,
881 				    flags);
882 				if (cbp->bio_data != NULL) {
883 					bcopy(data, cbp->bio_data,
884 					    cbp->bio_length);
885 				}
886 				data += cend - nstart;
887 			}
888 			joffset += cend - nstart;
889 			nstart = cend;
890 			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
891 		} else if (nstart > cstart && nend >= cend) {
892 			/*
893 			 *  +-----------------+  +-------------+
894 			 *  |   current bio   |  | current bio |
895 			 *  |   +-------------+  |   +---------+---+
896 			 *  |   |             |  |   |         |   |
897 			 *  |   |             |  |   |         |   |
898 			 *  +---+-------------+  +---+---------+   |
899 			 *      |   new bio   |      |   new bio   |
900 			 *      +-------------+      +-------------+
901 			 */
902 			g_journal_stats_bytes_skipped += cend - nstart;
903 			nbp = g_journal_new_bio(nstart, cend, joffset, data,
904 			    flags);
905 			nbp->bio_next = cbp->bio_next;
906 			cbp->bio_next = nbp;
907 			cbp->bio_length = nstart - cstart;
908 			if (cbp->bio_data != NULL) {
909 				cbp->bio_data = gj_realloc(cbp->bio_data,
910 				    cbp->bio_length, cend - cstart);
911 			}
912 			if (data != NULL)
913 				data += cend - nstart;
914 			joffset += cend - nstart;
915 			nstart = cend;
916 			n++;
917 			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
918 		} else if (nstart > cstart && nend < cend) {
919 			/*
920 			 *  +---------------------+
921 			 *  |     current bio     |
922 			 *  |   +-------------+   |
923 			 *  |   |             |   |
924 			 *  |   |             |   |
925 			 *  +---+-------------+---+
926 			 *      |   new bio   |
927 			 *      +-------------+
928 			 */
929 			g_journal_stats_bytes_skipped += nend - nstart;
930 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
931 			    flags);
932 			nbp->bio_next = cbp->bio_next;
933 			cbp->bio_next = nbp;
934 			if (cbp->bio_data == NULL)
935 				tmpdata = NULL;
936 			else
937 				tmpdata = cbp->bio_data + nend - cstart;
938 			nbp = g_journal_new_bio(nend, cend,
939 			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
940 			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
941 			((struct bio *)cbp->bio_next)->bio_next = nbp;
942 			cbp->bio_length = nstart - cstart;
943 			if (cbp->bio_data != NULL) {
944 				cbp->bio_data = gj_realloc(cbp->bio_data,
945 				    cbp->bio_length, cend - cstart);
946 			}
947 			n += 2;
948 			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
949 			goto end;
950 		} else if (nstart <= cstart && nend < cend) {
951 			/*
952 			 *  +-----------------+      +-------------+
953 			 *  |   current bio   |      | current bio |
954 			 *  +-------------+   |  +---+---------+   |
955 			 *  |             |   |  |   |         |   |
956 			 *  |             |   |  |   |         |   |
957 			 *  +-------------+---+  |   +---------+---+
958 			 *  |   new bio   |      |   new bio   |
959 			 *  +-------------+      +-------------+
960 			 */
961 			g_journal_stats_bytes_skipped += nend - nstart;
962 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
963 			    flags);
964 			if (pbp == NULL)
965 				*head = nbp;
966 			else
967 				pbp->bio_next = nbp;
968 			nbp->bio_next = cbp;
969 			cbp->bio_offset = nend;
970 			cbp->bio_length = cend - nend;
971 			cbp->bio_joffset += nend - cstart;
972 			tmpdata = cbp->bio_data;
973 			if (tmpdata != NULL) {
974 				cbp->bio_data = gj_malloc(cbp->bio_length,
975 				    flags);
976 				if (cbp->bio_data != NULL) {
977 					bcopy(tmpdata + nend - cstart,
978 					    cbp->bio_data, cbp->bio_length);
979 				}
980 				gj_free(tmpdata, cend - cstart);
981 			}
982 			n++;
983 			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
984 			goto end;
985 		}
986 		if (nstart == nend)
987 			goto end;
988 		pbp = cbp;
989 	}
990 	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
991 	if (pbp == NULL)
992 		*head = nbp;
993 	else
994 		pbp->bio_next = nbp;
995 	nbp->bio_next = NULL;
996 	n++;
997 	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
998 end:
999 	if (g_journal_debug >= 3) {
1000 		GJQ_FOREACH(*head, cbp) {
1001 			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
1002 			    (intmax_t)cbp->bio_offset,
1003 			    (intmax_t)cbp->bio_length,
1004 			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
1005 		}
1006 		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
1007 	}
1008 	return (n);
1009 }
1010 
1011 /*
1012  * The function combines neighbouring bios, trying to squeeze as much data as
1013  * possible into one bio.
1014  *
1015  * The function returns the number of bios combined (negative value).
1016  */
1017 static int
1018 g_journal_optimize(struct bio *head)
1019 {
1020 	struct bio *cbp, *pbp;
1021 	int n;
1022 
1023 	n = 0;
1024 	pbp = NULL;
1025 	GJQ_FOREACH(head, cbp) {
1026 		/* Skip bios which have to be read first. */
1027 		if (cbp->bio_data == NULL) {
1028 			pbp = NULL;
1029 			continue;
1030 		}
1031 		/* There is no previous bio yet. */
1032 		if (pbp == NULL) {
1033 			pbp = cbp;
1034 			continue;
1035 		}
1036 		/* Is this a neighbour bio? */
1037 		if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
1038 			/* Be sure that bios queue is sorted. */
1039 			KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
1040 			    ("poffset=%jd plength=%jd coffset=%jd",
1041 			    (intmax_t)pbp->bio_offset,
1042 			    (intmax_t)pbp->bio_length,
1043 			    (intmax_t)cbp->bio_offset));
1044 			pbp = cbp;
1045 			continue;
1046 		}
1047 		/* Be sure we don't end up with a bio that is too big. */
1048 		if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
1049 			pbp = cbp;
1050 			continue;
1051 		}
1052 		/* Ok, we can join bios. */
1053 		GJ_LOGREQ(4, pbp, "Join: ");
1054 		GJ_LOGREQ(4, cbp, "and: ");
1055 		pbp->bio_data = gj_realloc(pbp->bio_data,
1056 		    pbp->bio_length + cbp->bio_length, pbp->bio_length);
1057 		bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
1058 		    cbp->bio_length);
1059 		gj_free(cbp->bio_data, cbp->bio_length);
1060 		pbp->bio_length += cbp->bio_length;
1061 		pbp->bio_next = cbp->bio_next;
1062 		g_destroy_bio(cbp);
1063 		cbp = pbp;
1064 		g_journal_stats_combined_ios++;
1065 		n--;
1066 		GJ_LOGREQ(4, pbp, "Got: ");
1067 	}
1068 	return (n);
1069 }
1070 
1071 /*
1072  * TODO: Update comment.
1073  * These are the functions responsible for copying one portion of data from
1074  * the journal to the destination provider.
1075  * The order goes like this:
1076  * 1. Read the header, which contains information about the data blocks
1077  *    following it.
1078  * 2. Read the data blocks from the journal.
1079  * 3. Write the data blocks on the data provider.
1080  *
1081  * g_journal_copy_start()
1082  * g_journal_copy_done() - got finished write request, logs potential errors.
1083  * g_journal_copy_done() - handles a finished write request, logging errors.
1084 
1085 /*
1086  * When there is no data in cache, this function is used to read it.
1087  */
1088 static void
1089 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
1090 {
1091 	struct bio *cbp;
1092 
1093 	/*
1094 	 * We were short on memory, so the data was freed.
1095 	 * In that case we need to read it back from the journal.
1096 	 */
1097 	cbp = g_alloc_bio();
1098 	cbp->bio_cflags = bp->bio_cflags;
1099 	cbp->bio_parent = bp;
1100 	cbp->bio_offset = bp->bio_joffset;
1101 	cbp->bio_length = bp->bio_length;
1102 	cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
1103 	cbp->bio_cmd = BIO_READ;
1104 	cbp->bio_done = g_journal_std_done;
1105 	GJ_LOGREQ(4, cbp, "READ FIRST");
1106 	g_io_request(cbp, sc->sc_jconsumer);
1107 	g_journal_cache_misses++;
1108 }
1109 
1110 static void
1111 g_journal_copy_send(struct g_journal_softc *sc)
1112 {
1113 	struct bio *bioq, *bp, *lbp;
1114 
1115 	bioq = lbp = NULL;
1116 	mtx_lock(&sc->sc_mtx);
1117 	for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
1118 		bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
1119 		if (bp == NULL)
1120 			break;
1121 		GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
1122 		sc->sc_copy_in_progress++;
1123 		GJQ_INSERT_AFTER(bioq, bp, lbp);
1124 		lbp = bp;
1125 	}
1126 	mtx_unlock(&sc->sc_mtx);
1127 	if (g_journal_do_optimize)
1128 		sc->sc_copy_in_progress += g_journal_optimize(bioq);
1129 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
1130 		GJQ_REMOVE(bioq, bp);
1131 		GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
1132 		bp->bio_cflags = GJ_BIO_COPY;
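		/*
		 * If the cached payload was reclaimed when we were short on
		 * memory, it must first be re-read from the journal;
		 * otherwise the write can go straight to the data provider.
		 */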
1133 		if (bp->bio_data == NULL)
1134 			g_journal_read_first(sc, bp);
1135 		else {
1136 			bp->bio_joffset = 0;
1137 			GJ_LOGREQ(4, bp, "SEND");
1138 			g_io_request(bp, sc->sc_dconsumer);
1139 		}
1140 	}
1141 }
1142 
1143 static void
1144 g_journal_copy_start(struct g_journal_softc *sc)
1145 {
1146 
1147 	/*
1148 	 * Remember in metadata that we're starting to copy journaled data
1149 	 * to the data provider.
1150 	 * In case of power failure, we will copy this data once again on boot.
1151 	 */
1152 	if (!sc->sc_journal_copying) {
1153 		sc->sc_journal_copying = 1;
1154 		GJ_DEBUG(1, "Starting copy of journal.");
1155 		g_journal_metadata_update(sc);
1156 	}
1157 	g_journal_copy_send(sc);
1158 }
1159 
1160 /*
1161  * Data block has been read from the journal provider.
1162  */
1163 static int
1164 g_journal_copy_read_done(struct bio *bp)
1165 {
1166 	struct g_journal_softc *sc;
1167 	struct g_consumer *cp;
1168 	struct bio *pbp;
1169 
1170 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1171 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1172 
1173 	sc = bp->bio_from->geom->softc;
1174 	pbp = bp->bio_parent;
1175 
1176 	if (bp->bio_error != 0) {
1177 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1178 		    bp->bio_to->name, bp->bio_error);
1179 		/*
1180 		 * We will not be able to deliver the WRITE request either.
1181 		 */
1182 		gj_free(bp->bio_data, bp->bio_length);
1183 		g_destroy_bio(pbp);
1184 		g_destroy_bio(bp);
1185 		sc->sc_copy_in_progress--;
1186 		return (1);
1187 	}
1188 	pbp->bio_data = bp->bio_data;
1189 	cp = sc->sc_dconsumer;
1190 	g_io_request(pbp, cp);
1191 	GJ_LOGREQ(4, bp, "READ DONE");
1192 	g_destroy_bio(bp);
1193 	return (0);
1194 }
1195 
1196 /*
1197  * Data block has been written to the data provider.
1198  */
1199 static void
1200 g_journal_copy_write_done(struct bio *bp)
1201 {
1202 	struct g_journal_softc *sc;
1203 
1204 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1205 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1206 
1207 	sc = bp->bio_from->geom->softc;
1208 	sc->sc_copy_in_progress--;
1209 
1210 	if (bp->bio_error != 0) {
1211 		GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
1212 		    bp->bio_error);
1213 	}
1214 	GJQ_REMOVE(sc->sc_copy_queue, bp);
1215 	gj_free(bp->bio_data, bp->bio_length);
1216 	GJ_LOGREQ(4, bp, "DONE");
1217 	g_destroy_bio(bp);
1218 
1219 	if (sc->sc_copy_in_progress == 0) {
1220 		/*
1221 		 * This was the last write request for this journal.
1222 		 */
1223 		GJ_DEBUG(1, "Data has been copied.");
1224 		sc->sc_journal_copying = 0;
1225 	}
1226 }
1227 
1228 static void g_journal_flush_done(struct bio *bp);
1229 
1230 /*
1231  * Flush one record onto the active journal provider.
1232  */
1233 static void
1234 g_journal_flush(struct g_journal_softc *sc)
1235 {
1236 	struct g_journal_record_header hdr;
1237 	struct g_journal_entry *ent;
1238 	struct g_provider *pp;
1239 	struct bio **bioq;
1240 	struct bio *bp, *fbp, *pbp;
1241 	off_t joffset;
1242 	u_char *data, hash[16];
1243 	MD5_CTX ctx;
1244 	u_int i;
1245 
1246 	if (sc->sc_current_count == 0)
1247 		return;
1248 
1249 	pp = sc->sc_jprovider;
1250 	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1251 	joffset = sc->sc_journal_offset;
1252 
1253 	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
1254 	    sc->sc_current_count, pp->name, (intmax_t)joffset);
1255 
1256 	/*
1257 	 * Store 'journal id', so we know to which journal this record belongs.
1258 	 */
1259 	hdr.jrh_journal_id = sc->sc_journal_id;
1260 	/* Could be less than g_journal_record_entries if called due to a timeout. */
1261 	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
1262 	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
1263 
1264 	bioq = &sc->sc_active.jj_queue;
1265 	GJQ_LAST(sc->sc_flush_queue, pbp);
1266 
1267 	fbp = g_alloc_bio();
1268 	fbp->bio_parent = NULL;
1269 	fbp->bio_cflags = GJ_BIO_JOURNAL;
1270 	fbp->bio_offset = -1;
1271 	fbp->bio_joffset = joffset;
1272 	fbp->bio_length = pp->sectorsize;
1273 	fbp->bio_cmd = BIO_WRITE;
1274 	fbp->bio_done = g_journal_std_done;
1275 	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
1276 	pbp = fbp;
1277 	fbp->bio_to = pp;
1278 	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
1279 	joffset += pp->sectorsize;
1280 	sc->sc_flush_count++;
1281 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1282 		MD5Init(&ctx);
1283 
1284 	for (i = 0; i < hdr.jrh_nentries; i++) {
1285 		bp = sc->sc_current_queue;
1286 		KASSERT(bp != NULL, ("NULL bp"));
1287 		bp->bio_to = pp;
1288 		GJ_LOGREQ(4, bp, "FLUSHED");
1289 		sc->sc_current_queue = bp->bio_next;
1290 		bp->bio_next = NULL;
1291 		sc->sc_current_count--;
1292 
1293 		/* Add to the header. */
1294 		ent = &hdr.jrh_entries[i];
1295 		ent->je_offset = bp->bio_offset;
1296 		ent->je_joffset = joffset;
1297 		ent->je_length = bp->bio_length;
1298 
1299 		data = bp->bio_data;
1300 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1301 			MD5Update(&ctx, data, ent->je_length);
1302 		g_reset_bio(bp);
1303 		bp->bio_cflags = GJ_BIO_JOURNAL;
1304 		bp->bio_offset = ent->je_offset;
1305 		bp->bio_joffset = ent->je_joffset;
1306 		bp->bio_length = ent->je_length;
1307 		bp->bio_data = data;
1308 		bp->bio_cmd = BIO_WRITE;
1309 		bp->bio_done = g_journal_std_done;
1310 		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
1311 		pbp = bp;
1312 		bp->bio_to = pp;
1313 		GJ_LOGREQ(4, bp, "FLUSH_OUT");
1314 		joffset += bp->bio_length;
1315 		sc->sc_flush_count++;
1316 
1317 		/*
1318 		 * Add the request to the active queue (sc_active.jj_queue).
1319 		 * This is our cache. After a journal switch we don't have to
1320 		 * read the data from the inactive journal, because we keep
1321 		 * it in memory.
1322 		 */
1323 		g_journal_insert(bioq, ent->je_offset,
1324 		    ent->je_offset + ent->je_length, ent->je_joffset, data,
1325 		    M_NOWAIT);
1326 	}
1327 
1328 	/*
1329 	 * After all requests, store a valid header.
1330 	 */
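	/*
	 * Note that the header bio (fbp) was queued first, so it lands at
	 * the start of the record on disk, but its payload can only be
	 * encoded now, after the loop above has filled in all the entries
	 * and (optionally) the MD5 sum over their data.
	 */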
1331 	data = gj_malloc(pp->sectorsize, M_WAITOK);
1332 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1333 		MD5Final(hash, &ctx);
1334 		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
1335 	}
1336 	g_journal_record_header_encode(&hdr, data);
1337 	fbp->bio_data = data;
1338 
1339 	sc->sc_journal_offset = joffset;
1340 
1341 	g_journal_check_overflow(sc);
1342 }
1343 
1344 /*
1345  * Flush request finished.
1346  */
1347 static void
1348 g_journal_flush_done(struct bio *bp)
1349 {
1350 	struct g_journal_softc *sc;
1351 	struct g_consumer *cp;
1352 
1353 	KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
1354 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
1355 
1356 	cp = bp->bio_from;
1357 	sc = cp->geom->softc;
1358 	sc->sc_flush_in_progress--;
1359 
1360 	if (bp->bio_error != 0) {
1361 		GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
1362 		    bp->bio_error);
1363 	}
1364 	gj_free(bp->bio_data, bp->bio_length);
1365 	GJ_LOGREQ(4, bp, "DONE");
1366 	g_destroy_bio(bp);
1367 }
1368 
1369 static void g_journal_release_delayed(struct g_journal_softc *sc);
1370 
1371 static void
1372 g_journal_flush_send(struct g_journal_softc *sc)
1373 {
1374 	struct g_consumer *cp;
1375 	struct bio *bioq, *bp, *lbp;
1376 
1377 	cp = sc->sc_jconsumer;
1378 	bioq = lbp = NULL;
1379 	while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
1380 		/* Send one flush request at a time to the active journal. */
1381 		bp = GJQ_FIRST(sc->sc_flush_queue);
1382 		if (bp != NULL) {
1383 			GJQ_REMOVE(sc->sc_flush_queue, bp);
1384 			sc->sc_flush_count--;
1385 			bp->bio_offset = bp->bio_joffset;
1386 			bp->bio_joffset = 0;
1387 			sc->sc_flush_in_progress++;
1388 			GJQ_INSERT_AFTER(bioq, bp, lbp);
1389 			lbp = bp;
1390 		}
1391 		/* Try to release delayed requests. */
1392 		g_journal_release_delayed(sc);
1393 		/* If there are no requests to flush, leave. */
1394 		if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
1395 			break;
1396 	}
1397 	if (g_journal_do_optimize)
1398 		sc->sc_flush_in_progress += g_journal_optimize(bioq);
1399 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
1400 		GJQ_REMOVE(bioq, bp);
1401 		GJ_LOGREQ(3, bp, "Flush request sent");
1402 		g_io_request(bp, cp);
1403 	}
1404 }
1405 
1406 static void
1407 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
1408 {
1409 	int n;
1410 
1411 	GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
1412 	n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
1413 	sc->sc_current_count += n;
1414 	n = g_journal_optimize(sc->sc_current_queue);
1415 	sc->sc_current_count += n;
1416 	/*
1417 	 * For requests which are added to the current queue we deliver the
1418 	 * response immediately.
1419 	 */
1420 	bp->bio_completed = bp->bio_length;
1421 	g_io_deliver(bp, 0);
1422 	if (sc->sc_current_count >= g_journal_record_entries) {
1423 		/*
1424 		 * Let's flush one record onto the active journal provider.
1425 		 */
1426 		g_journal_flush(sc);
1427 	}
1428 }
1429 
1430 static void
1431 g_journal_release_delayed(struct g_journal_softc *sc)
1432 {
1433 	struct bio *bp;
1434 
1435 	for (;;) {
1436 		/* The flush queue is full, exit. */
1437 		if (sc->sc_flush_count >= g_journal_accept_immediately)
1438 			return;
1439 		bp = bioq_takefirst(&sc->sc_delayed_queue);
1440 		if (bp == NULL)
1441 			return;
1442 		sc->sc_delayed_count--;
1443 		g_journal_add_current(sc, bp);
1444 	}
1445 }
1446 
1447 /*
1448  * Add an I/O request to the current queue. If we have enough requests for one
1449  * journal record, we flush them onto the active journal provider.
1450  */
1451 static void
1452 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
1453 {
1454 
1455 	/*
1456 	 * The flush queue is full, we need to delay the request.
1457 	 */
1458 	if (sc->sc_delayed_count > 0 ||
1459 	    sc->sc_flush_count >= g_journal_accept_immediately) {
1460 		GJ_LOGREQ(4, bp, "DELAYED");
1461 		bioq_insert_tail(&sc->sc_delayed_queue, bp);
1462 		sc->sc_delayed_count++;
1463 		return;
1464 	}
1465 
1466 	KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
1467 	    ("DELAYED queue not empty."));
1468 	g_journal_add_current(sc, bp);
1469 }
1470 
1471 static void g_journal_read_done(struct bio *bp);
1472 
1473 /*
1474  * Try to find requested data in cache.
1475  */
1476 static struct bio *
1477 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
1478     off_t oend)
1479 {
1480 	off_t cstart, cend;
1481 	struct bio *bp;
1482 
1483 	GJQ_FOREACH(head, bp) {
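		/*
		 * A bio_offset of -1 marks a bio that carries no regular
		 * data (cf. the record header bio in g_journal_flush()),
		 * so it cannot satisfy any part of the read.
		 */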
1484 		if (bp->bio_offset == -1)
1485 			continue;
1486 		cstart = MAX(ostart, bp->bio_offset);
1487 		cend = MIN(oend, bp->bio_offset + bp->bio_length);
1488 		if (cend <= ostart)
1489 			continue;
1490 		else if (cstart >= oend) {
1491 			if (!sorted)
1492 				continue;
1493 			else {
1494 				bp = NULL;
1495 				break;
1496 			}
1497 		}
1498 		if (bp->bio_data == NULL)
1499 			break;
1500 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1501 		    bp);
1502 		bcopy(bp->bio_data + cstart - bp->bio_offset,
1503 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1504 		pbp->bio_completed += cend - cstart;
1505 		if (pbp->bio_completed == pbp->bio_length) {
1506 			/*
1507 			 * Cool, the whole request was in cache, deliver happy
1508 			 * message.
1509 			 */
1510 			g_io_deliver(pbp, 0);
1511 			return (pbp);
1512 		}
1513 		break;
1514 	}
1515 	return (bp);
1516 }
1517 
1518 /*
1519  * This function is used for collecting data on read.
1520  * The complexity comes from the fact that parts of the data can be stored in
1521  * four different places:
1522  * - in memory - the data not yet sent to the active journal provider
1523  * - in the active journal
1524  * - in the inactive journal
1525  * - in the data provider
1526  */
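/*
 * The search below goes from the most recently written copy to the oldest
 * one (current queue, active journal, inactive journal, copy queue), so the
 * freshest version of every byte range wins.
 */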
1527 static void
1528 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
1529     off_t oend)
1530 {
1531 	struct bio *bp, *nbp, *head;
1532 	off_t cstart, cend;
1533 	u_int i, sorted = 0;
1534 
1535 	GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
1536 
1537 	cstart = cend = -1;
1538 	bp = NULL;
1539 	head = NULL;
1540 	for (i = 1; i <= 5; i++) {
1541 		switch (i) {
1542 		case 1:	/* Not-yet-sent data. */
1543 			head = sc->sc_current_queue;
1544 			sorted = 1;
1545 			break;
1546 		case 2: /* Skip the flush queue; its bios are also in the active queue. */
1547 			continue;
1548 		case 3:	/* Active journal. */
1549 			head = sc->sc_active.jj_queue;
1550 			sorted = 1;
1551 			break;
1552 		case 4:	/* Inactive journal. */
1553 			/*
1554 			 * XXX: Here could be a race with g_journal_lowmem().
1555 			 */
1556 			head = sc->sc_inactive.jj_queue;
1557 			sorted = 1;
1558 			break;
1559 		case 5:	/* In-flight to the data provider. */
1560 			head = sc->sc_copy_queue;
1561 			sorted = 0;
1562 			break;
1563 		default:
1564 			panic("gjournal %s: i=%d", __func__, i);
1565 		}
1566 		bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
1567 		if (bp == pbp) { /* Got the whole request. */
1568 			GJ_DEBUG(2, "Got the whole request from %u.", i);
1569 			return;
1570 		} else if (bp != NULL) {
1571 			cstart = MAX(ostart, bp->bio_offset);
1572 			cend = MIN(oend, bp->bio_offset + bp->bio_length);
1573 			GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
1574 			    i, (intmax_t)cstart, (intmax_t)cend);
1575 			break;
1576 		}
1577 	}
1578 	if (bp != NULL) {
1579 		if (bp->bio_data == NULL) {
1580 			nbp = g_duplicate_bio(pbp);
1581 			nbp->bio_cflags = GJ_BIO_READ;
1582 			nbp->bio_data =
1583 			    pbp->bio_data + cstart - pbp->bio_offset;
1584 			nbp->bio_offset =
1585 			    bp->bio_joffset + cstart - bp->bio_offset;
1586 			nbp->bio_length = cend - cstart;
1587 			nbp->bio_done = g_journal_read_done;
1588 			g_io_request(nbp, sc->sc_jconsumer);
1589 		}
1590 		/*
1591 		 * If we don't have the whole request yet, call g_journal_read()
1592 		 * recursively.
1593 		 */
1594 		if (ostart < cstart)
1595 			g_journal_read(sc, pbp, ostart, cstart);
1596 		if (oend > cend)
1597 			g_journal_read(sc, pbp, cend, oend);
1598 	} else {
1599 		/*
1600 		 * No data in memory, no data in the journal.
1601 		 * It's time to ask the data provider.
1602 		 */
1603 		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
1604 		nbp = g_duplicate_bio(pbp);
1605 		nbp->bio_cflags = GJ_BIO_READ;
1606 		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
1607 		nbp->bio_offset = ostart;
1608 		nbp->bio_length = oend - ostart;
1609 		nbp->bio_done = g_journal_read_done;
1610 		g_io_request(nbp, sc->sc_dconsumer);
1611 		/* We have the whole request, return here. */
1612 		return;
1613 	}
1614 }
1615 
1616 /*
1617  * Function responsible for handling finished READ requests.
1618  * Actually, g_std_done() could be used here; the only difference is that we
1619  * log the error.
1620  */
1621 static void
1622 g_journal_read_done(struct bio *bp)
1623 {
1624 	struct bio *pbp;
1625 
1626 	KASSERT(bp->bio_cflags == GJ_BIO_READ,
1627 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
1628 
1629 	pbp = bp->bio_parent;
1630 	pbp->bio_inbed++;
1631 	pbp->bio_completed += bp->bio_length;
1632 
1633 	if (bp->bio_error != 0) {
1634 		if (pbp->bio_error == 0)
1635 			pbp->bio_error = bp->bio_error;
1636 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1637 		    bp->bio_to->name, bp->bio_error);
1638 	}
1639 	g_destroy_bio(bp);
1640 	if (pbp->bio_children == pbp->bio_inbed &&
1641 	    pbp->bio_completed == pbp->bio_length) {
1642 		/* We're done. */
1643 		g_io_deliver(pbp, 0);
1644 	}
1645 }
1646 
1647 /*
1648  * Deactivate the current journal and activate the next one.
1649  */
1650 static void
1651 g_journal_switch(struct g_journal_softc *sc)
1652 {
1653 	struct g_provider *pp;
1654 
1655 	if (JEMPTY(sc)) {
1656 		GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
1657 		pp = LIST_FIRST(&sc->sc_geom->provider);
1658 		if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
1659 			sc->sc_flags |= GJF_DEVICE_CLEAN;
1660 			GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
1661 			g_journal_metadata_update(sc);
1662 		}
1663 	} else {
1664 		GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
1665 
1666 		pp = sc->sc_jprovider;
1667 
1668 		sc->sc_journal_previous_id = sc->sc_journal_id;
1669 
1670 		sc->sc_journal_id = sc->sc_journal_next_id;
1671 		sc->sc_journal_next_id = arc4random();
1672 
1673 		GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1674 
1675 		g_journal_write_header(sc);
1676 
1677 		sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
1678 		sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
1679 
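		/*
		 * The new active journal starts at the header that
		 * g_journal_write_header() has just written, hence the one
		 * sector step back from the current journal offset.
		 */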
1680 		sc->sc_active.jj_offset =
1681 		    sc->sc_journal_offset - pp->sectorsize;
1682 		sc->sc_active.jj_queue = NULL;
1683 
1684 		/*
1685 		 * Switch is done, start copying data from the (now) inactive
1686 		 * journal to the data provider.
1687 		 */
1688 		g_journal_copy_start(sc);
1689 	}
1690 	mtx_lock(&sc->sc_mtx);
1691 	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
1692 	mtx_unlock(&sc->sc_mtx);
1693 }
1694 
1695 static void
1696 g_journal_initialize(struct g_journal_softc *sc)
1697 {
1698 
1699 	sc->sc_journal_id = arc4random();
1700 	sc->sc_journal_next_id = arc4random();
1701 	sc->sc_journal_previous_id = sc->sc_journal_id;
1702 	sc->sc_journal_offset = sc->sc_jstart;
1703 	sc->sc_inactive.jj_offset = sc->sc_jstart;
1704 	g_journal_write_header(sc);
1705 	sc->sc_active.jj_offset = sc->sc_jstart;
1706 }
1707 
1708 static void
1709 g_journal_mark_as_dirty(struct g_journal_softc *sc)
1710 {
1711 	const struct g_journal_desc *desc;
1712 	int i;
1713 
1714 	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
1715 	for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
1716 		desc->jd_dirty(sc->sc_dconsumer);
1717 }
1718 
1719 /*
1720  * Function reads a record header from the given journal.
1721  * It is very similar to g_read_data(9), but it doesn't allocate memory for a
1722  * bio and its data on every call.
1723  */
1724 static int
1725 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
1726     void *data)
1727 {
1728 	int error;
1729 
1730 	g_reset_bio(bp);
1731 	bp->bio_cmd = BIO_READ;
1732 	bp->bio_done = NULL;
1733 	bp->bio_offset = offset;
1734 	bp->bio_length = cp->provider->sectorsize;
1735 	bp->bio_data = data;
1736 	g_io_request(bp, cp);
1737 	error = biowait(bp, "gjs_read");
1738 	return (error);
1739 }
1740 
1741 #if 0
1742 /*
1743  * This function is called when we start the journal device and detect that
1744  * one of the journals was not fully copied.
1745  * The purpose of this function is to read all record headers from the journal
1746  * and place them on the inactive queue, so we can start the journal
1747  * synchronization process and the journal provider itself.
1748  * The design decision was made not to synchronize the whole journal here, as
1749  * that can take too much time. Reading only the headers and delaying the
1750  * synchronization until after the journal provider is started is the best choice.
1751  */
1752 #endif
1753 
1754 static void
1755 g_journal_sync(struct g_journal_softc *sc)
1756 {
1757 	struct g_journal_record_header rhdr;
1758 	struct g_journal_entry *ent;
1759 	struct g_journal_header jhdr;
1760 	struct g_consumer *cp;
1761 	struct bio *bp, *fbp, *tbp;
1762 	off_t joffset, offset;
1763 	u_char *buf, sum[16];
1764 	uint64_t id;
1765 	MD5_CTX ctx;
1766 	int error, found, i;
1767 
1768 	found = 0;
1769 	fbp = NULL;
1770 	cp = sc->sc_jconsumer;
1771 	bp = g_alloc_bio();
1772 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
1773 	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
1774 
1775 	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
1776 
1777 	/*
1778 	 * Read and decode first journal header.
1779 	 */
1780 	error = g_journal_sync_read(cp, bp, offset, buf);
1781 	if (error != 0) {
1782 		GJ_DEBUG(0, "Error while reading journal header from %s.",
1783 		    cp->provider->name);
1784 		goto end;
1785 	}
1786 	error = g_journal_header_decode(buf, &jhdr);
1787 	if (error != 0) {
1788 		GJ_DEBUG(0, "Cannot decode journal header from %s.",
1789 		    cp->provider->name);
1790 		goto end;
1791 	}
1792 	id = sc->sc_journal_id;
1793 	if (jhdr.jh_journal_id != sc->sc_journal_id) {
1794 		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
1795 		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
1796 		goto end;
1797 	}
1798 	offset += cp->provider->sectorsize;
1799 	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1800 
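	/*
	 * Each journal header announces the ID of the journal that will
	 * follow it, so the end of this journal is found by scanning forward
	 * for a header that carries exactly that announced ID.
	 */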
1801 	for (;;) {
1802 		/*
1803 		 * If the biggest possible record won't fit, the offset wraps
1804 		 * back to the journal's start; look for the next header there.
1805 		 */
1806 		GJ_VALIDATE_OFFSET(offset, sc);
1807 		error = g_journal_sync_read(cp, bp, offset, buf);
1808 		if (error != 0) {
1809 			/*
1810 			 * Not good. An error while reading a header means
1811 			 * that we cannot read the following headers and, in
1812 			 * consequence, we cannot find the termination.
1813 			 */
1814 			GJ_DEBUG(0,
1815 			    "Error while reading record header from %s.",
1816 			    cp->provider->name);
1817 			break;
1818 		}
1819 
1820 		error = g_journal_record_header_decode(buf, &rhdr);
1821 		if (error != 0) {
1822 			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
1823 			    (intmax_t)offset, error);
1824 			/*
1825 			 * This is not a record header.
1826 			 * If we are lucky, it is the next journal header.
1827 			 */
1828 			error = g_journal_header_decode(buf, &jhdr);
1829 			if (error != 0) {
1830 				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
1831 				    (intmax_t)offset, error);
1832 				/*
1833 				 * Nope, this is not a journal header either,
1834 				 * which basically means that the journal is
1835 				 * not terminated properly.
1836 				 */
1837 				error = ENOENT;
1838 				break;
1839 			}
1840 			/*
1841 			 * OK. This is the header of _some_ journal. Now we need
1842 			 * to verify that it is the header of the _next_ journal.
1843 			 */
1844 			if (jhdr.jh_journal_id != id) {
1845 				GJ_DEBUG(1, "Journal ID mismatch at %jd "
1846 				    "(0x%08x != 0x%08x).", (intmax_t)offset,
1847 				    (u_int)jhdr.jh_journal_id, (u_int)id);
1848 				error = ENOENT;
1849 				break;
1850 			}
1851 
1852 			/* Found termination. */
1853 			found++;
1854 			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
1855 			    (intmax_t)offset, (u_int)id);
1856 			sc->sc_active.jj_offset = offset;
1857 			sc->sc_journal_offset =
1858 			    offset + cp->provider->sectorsize;
1859 			sc->sc_journal_id = id;
1860 			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1861 
1862 			while ((tbp = fbp) != NULL) {
1863 				fbp = tbp->bio_next;
1864 				GJ_LOGREQ(3, tbp, "Adding request.");
1865 				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
1866 				    tbp, M_WAITOK);
1867 			}
1868 
1869 			/* Skip journal's header. */
1870 			offset += cp->provider->sectorsize;
1871 			continue;
1872 		}
1873 
1874 		/* Skip record's header. */
1875 		offset += cp->provider->sectorsize;
1876 
1877 		/*
1878 		 * Add information about every record entry to the inactive
1879 		 * queue.
1880 		 */
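		/*
		 * With checksums enabled, an MD5 digest is additionally
		 * accumulated over the data of all entries and verified
		 * against the sum stored in the record header.
		 */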
1881 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1882 			MD5Init(&ctx);
1883 		for (i = 0; i < rhdr.jrh_nentries; i++) {
1884 			ent = &rhdr.jrh_entries[i];
1885 			GJ_DEBUG(3, "Insert entry: %jd %jd.",
1886 			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
1887 			g_journal_insert(&fbp, ent->je_offset,
1888 			    ent->je_offset + ent->je_length, ent->je_joffset,
1889 			    NULL, M_WAITOK);
1890 			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1891 				u_char *buf2;
1892 
1893 				/*
1894 				 * TODO: Should use faster function (like
1895 				 *       g_journal_sync_read()).
1896 				 */
1897 				buf2 = g_read_data(cp, offset, ent->je_length,
1898 				    NULL);
1899 				if (buf2 == NULL)
1900 					GJ_DEBUG(0, "Cannot read data at %jd.",
1901 					    (intmax_t)offset);
1902 				else {
1903 					MD5Update(&ctx, buf2, ent->je_length);
1904 					g_free(buf2);
1905 				}
1906 			}
1907 			/* Skip entry's data. */
1908 			offset += ent->je_length;
1909 		}
1910 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1911 			MD5Final(sum, &ctx);
1912 			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
1913 				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
1914 				    (intmax_t)offset);
1915 			}
1916 		}
1917 	}
1918 end:
1919 	gj_free(bp->bio_data, cp->provider->sectorsize);
1920 	g_destroy_bio(bp);
1921 
1922 	/* Remove bios from the unterminated journal. */
1923 	while ((tbp = fbp) != NULL) {
1924 		fbp = tbp->bio_next;
1925 		g_destroy_bio(tbp);
1926 	}
1927 
1928 	if (found < 1 && joffset > 0) {
1929 		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
1930 		    sc->sc_name);
1931 		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
1932 			sc->sc_inactive.jj_queue = tbp->bio_next;
1933 			g_destroy_bio(tbp);
1934 		}
1935 		g_journal_initialize(sc);
1936 		g_journal_mark_as_dirty(sc);
1937 	} else {
1938 		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
1939 		g_journal_copy_start(sc);
1940 	}
1941 }
1942 
1943 /*
1944  * Wait for requests.
1945  * If there are requests in the current queue, flush them 3 seconds after
1946  * the last flush. This way we don't wait forever (or for a journal
1947  * switch) before storing partially filled records in the journal.
1948  */
1949 static void
1950 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
1951 {
1952 	int error, timeout;
1953 
1954 	GJ_DEBUG(3, "%s: enter", __func__);
1955 	if (sc->sc_current_count == 0) {
1956 		if (g_journal_debug < 2)
1957 			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
1958 		else {
1959 			/*
1960 			 * If we have debug turned on, show number of elements
1961 			 * in various queues.
1962 			 */
1963 			for (;;) {
1964 				error = msleep(sc, &sc->sc_mtx, PRIBIO,
1965 				    "gj:work", hz * 3);
1966 				if (error == 0) {
1967 					mtx_unlock(&sc->sc_mtx);
1968 					break;
1969 				}
1970 				GJ_DEBUG(3, "Report: current count=%d",
1971 				    sc->sc_current_count);
1972 				GJ_DEBUG(3, "Report: flush count=%d",
1973 				    sc->sc_flush_count);
1974 				GJ_DEBUG(3, "Report: flush in progress=%d",
1975 				    sc->sc_flush_in_progress);
1976 				GJ_DEBUG(3, "Report: copy in progress=%d",
1977 				    sc->sc_copy_in_progress);
1978 				GJ_DEBUG(3, "Report: delayed=%d",
1979 				    sc->sc_delayed_count);
1980 			}
1981 		}
1982 		GJ_DEBUG(3, "%s: exit 1", __func__);
1983 		return;
1984 	}
1985 
1986 	/*
1987 	 * Flush even partially filled records every 3 seconds.
1988 	 */
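	/*
	 * For example, if the last write happened one second ago, the
	 * computed timeout is roughly 2 * hz ticks, i.e. we sleep for up to
	 * two more seconds; a zero or negative result means the 3 seconds
	 * have already passed and we flush immediately.
	 */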
1989 	timeout = (last_write + 3 - time_second) * hz;
1990 	if (timeout <= 0) {
1991 		mtx_unlock(&sc->sc_mtx);
1992 		g_journal_flush(sc);
1993 		g_journal_flush_send(sc);
1994 		GJ_DEBUG(3, "%s: exit 2", __func__);
1995 		return;
1996 	}
1997 	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
1998 	if (error == EWOULDBLOCK)
1999 		g_journal_flush_send(sc);
2000 	GJ_DEBUG(3, "%s: exit 3", __func__);
2001 }
2002 
2003 /*
2004  * Worker thread.
2005  */
2006 static void
2007 g_journal_worker(void *arg)
2008 {
2009 	struct g_journal_softc *sc;
2010 	struct g_geom *gp;
2011 	struct g_provider *pp;
2012 	struct bio *bp;
2013 	time_t last_write;
2014 	int type;
2015 
2016 	thread_lock(curthread);
2017 	sched_prio(curthread, PRIBIO);
2018 	thread_unlock(curthread);
2019 
2020 	sc = arg;
2021 	type = 0;	/* gcc */
2022 
2023 	if (sc->sc_flags & GJF_DEVICE_CLEAN) {
2024 		GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
2025 		g_journal_initialize(sc);
2026 	} else {
2027 		g_journal_sync(sc);
2028 	}
2029 	/*
2030 	 * Check if we can use BIO_FLUSH.
2031 	 */
2032 	sc->sc_bio_flush = 0;
2033 	if (g_io_flush(sc->sc_jconsumer) == 0) {
2034 		sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
2035 		GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2036 		    sc->sc_jconsumer->provider->name);
2037 	} else {
2038 		GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2039 		    sc->sc_jconsumer->provider->name);
2040 	}
2041 	if (sc->sc_jconsumer != sc->sc_dconsumer) {
2042 		if (g_io_flush(sc->sc_dconsumer) == 0) {
2043 			sc->sc_bio_flush |= GJ_FLUSH_DATA;
2044 			GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2045 			    sc->sc_dconsumer->provider->name);
2046 		} else {
2047 			GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2048 			    sc->sc_dconsumer->provider->name);
2049 		}
2050 	}
2051 
2052 	gp = sc->sc_geom;
2053 	g_topology_lock();
2054 	pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
2055 	pp->mediasize = sc->sc_mediasize;
2056 	/*
2057 	 * There could be a problem if the data provider and the journal
2058 	 * provider had different sector sizes, but such a scenario is
2059 	 * prevented at journal creation time.
2060 	 */
2061 	pp->sectorsize = sc->sc_sectorsize;
2062 	g_error_provider(pp, 0);
2063 	g_topology_unlock();
2064 	last_write = time_second;
2065 
2066 	if (sc->sc_rootmount != NULL) {
2067 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2068 		root_mount_rel(sc->sc_rootmount);
2069 		sc->sc_rootmount = NULL;
2070 	}
2071 
2072 	for (;;) {
2073 		/* Get first request from the queue. */
2074 		mtx_lock(&sc->sc_mtx);
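		/*
		 * The back queue, which carries completions of our own
		 * journal I/O (copy and flush requests), takes precedence
		 * over new regular requests from the provider.
		 */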
2075 		bp = bioq_first(&sc->sc_back_queue);
2076 		if (bp != NULL)
2077 			type = (bp->bio_cflags & GJ_BIO_MASK);
2078 		if (bp == NULL) {
2079 			bp = bioq_first(&sc->sc_regular_queue);
2080 			if (bp != NULL)
2081 				type = GJ_BIO_REGULAR;
2082 		}
2083 		if (bp == NULL) {
2084 try_switch:
2085 			if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
2086 			    (sc->sc_flags & GJF_DEVICE_DESTROY)) {
2087 				if (sc->sc_current_count > 0) {
2088 					mtx_unlock(&sc->sc_mtx);
2089 					g_journal_flush(sc);
2090 					g_journal_flush_send(sc);
2091 					continue;
2092 				}
2093 				if (sc->sc_flush_in_progress > 0)
2094 					goto sleep;
2095 				if (sc->sc_copy_in_progress > 0)
2096 					goto sleep;
2097 			}
2098 			if (sc->sc_flags & GJF_DEVICE_SWITCH) {
2099 				mtx_unlock(&sc->sc_mtx);
2100 				g_journal_switch(sc);
2101 				wakeup(&sc->sc_journal_copying);
2102 				continue;
2103 			}
2104 			if (sc->sc_flags & GJF_DEVICE_DESTROY) {
2105 				GJ_DEBUG(1, "Shutting down worker "
2106 				    "thread for %s.", gp->name);
2107 				sc->sc_worker = NULL;
2108 				wakeup(&sc->sc_worker);
2109 				mtx_unlock(&sc->sc_mtx);
2110 				kproc_exit(0);
2111 			}
2112 sleep:
2113 			g_journal_wait(sc, last_write);
2114 			continue;
2115 		}
2116 		/*
2117 		 * If we're in the middle of a journal switch, we need to
2118 		 * delay all new write requests until it's done.
2119 		 */
2120 		if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
2121 		    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
2122 			GJ_LOGREQ(2, bp, "WRITE on SWITCH");
2123 			goto try_switch;
2124 		}
2125 		if (type == GJ_BIO_REGULAR)
2126 			bioq_remove(&sc->sc_regular_queue, bp);
2127 		else
2128 			bioq_remove(&sc->sc_back_queue, bp);
2129 		mtx_unlock(&sc->sc_mtx);
2130 		switch (type) {
2131 		case GJ_BIO_REGULAR:
2132 			/* Regular request. */
2133 			switch (bp->bio_cmd) {
2134 			case BIO_READ:
2135 				g_journal_read(sc, bp, bp->bio_offset,
2136 				    bp->bio_offset + bp->bio_length);
2137 				break;
2138 			case BIO_WRITE:
2139 				last_write = time_second;
2140 				g_journal_add_request(sc, bp);
2141 				g_journal_flush_send(sc);
2142 				break;
2143 			default:
2144 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2145 			}
2146 			break;
2147 		case GJ_BIO_COPY:
2148 			switch (bp->bio_cmd) {
2149 			case BIO_READ:
2150 				if (g_journal_copy_read_done(bp))
2151 					g_journal_copy_send(sc);
2152 				break;
2153 			case BIO_WRITE:
2154 				g_journal_copy_write_done(bp);
2155 				g_journal_copy_send(sc);
2156 				break;
2157 			default:
2158 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2159 			}
2160 			break;
2161 		case GJ_BIO_JOURNAL:
2162 			g_journal_flush_done(bp);
2163 			g_journal_flush_send(sc);
2164 			break;
2165 		case GJ_BIO_READ:
2166 		default:
2167 			panic("Invalid bio (%d).", type);
2168 		}
2169 	}
2170 }
2171 
2172 static void
2173 g_journal_destroy_event(void *arg, int flags __unused)
2174 {
2175 	struct g_journal_softc *sc;
2176 
2177 	g_topology_assert();
2178 	sc = arg;
2179 	g_journal_destroy(sc);
2180 }
2181 
2182 static void
2183 g_journal_timeout(void *arg)
2184 {
2185 	struct g_journal_softc *sc;
2186 
2187 	sc = arg;
2188 	GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
2189 	    sc->sc_geom->name);
2190 	g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
2191 }
2192 
2193 static struct g_geom *
2194 g_journal_create(struct g_class *mp, struct g_provider *pp,
2195     const struct g_journal_metadata *md)
2196 {
2197 	struct g_journal_softc *sc;
2198 	struct g_geom *gp;
2199 	struct g_consumer *cp;
2200 	int error;
2201 
2202 	sc = NULL;	/* gcc */
2203 
2204 	g_topology_assert();
2205 	/*
2206 	 * There are two possibilities:
2207 	 * 1. Data and both journals are on the same provider.
2208 	 * 2. Data and journals are all on separate providers.
2209 	 */
2210 	/* Look for journal device with the same ID. */
2211 	LIST_FOREACH(gp, &mp->geom, geom) {
2212 		sc = gp->softc;
2213 		if (sc == NULL)
2214 			continue;
2215 		if (sc->sc_id == md->md_id)
2216 			break;
2217 	}
2218 	if (gp == NULL)
2219 		sc = NULL;
2220 	else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
2221 		GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
2222 		return (NULL);
2223 	}
2224 	if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
2225 		GJ_DEBUG(0, "Invalid type on %s.", pp->name);
2226 		return (NULL);
2227 	}
2228 	if (md->md_type & GJ_TYPE_DATA) {
2229 		GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
2230 		    pp->name);
2231 	}
2232 	if (md->md_type & GJ_TYPE_JOURNAL) {
2233 		GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
2234 		    pp->name);
2235 	}
2236 
2237 	if (sc == NULL) {
2238 		/* Action geom. */
2239 		sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
2240 		sc->sc_id = md->md_id;
2241 		sc->sc_type = 0;
2242 		sc->sc_flags = 0;
2243 		sc->sc_worker = NULL;
2244 
2245 		gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
2246 		gp->start = g_journal_start;
2247 		gp->orphan = g_journal_orphan;
2248 		gp->access = g_journal_access;
2249 		gp->softc = sc;
2250 		gp->flags |= G_GEOM_VOLATILE_BIO;
2251 		sc->sc_geom = gp;
2252 
2253 		mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
2254 
2255 		bioq_init(&sc->sc_back_queue);
2256 		bioq_init(&sc->sc_regular_queue);
2257 		bioq_init(&sc->sc_delayed_queue);
2258 		sc->sc_delayed_count = 0;
2259 		sc->sc_current_queue = NULL;
2260 		sc->sc_current_count = 0;
2261 		sc->sc_flush_queue = NULL;
2262 		sc->sc_flush_count = 0;
2263 		sc->sc_flush_in_progress = 0;
2264 		sc->sc_copy_queue = NULL;
2265 		sc->sc_copy_in_progress = 0;
2266 		sc->sc_inactive.jj_queue = NULL;
2267 		sc->sc_active.jj_queue = NULL;
2268 
2269 		sc->sc_rootmount = root_mount_hold("GJOURNAL");
2270 		GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
2271 
2272 		callout_init(&sc->sc_callout, 1);
2273 		if (md->md_type != GJ_TYPE_COMPLETE) {
2274 			/*
2275 			 * Journal and data are on separate providers.
2276 			 * At this point we have only one of them.
2277 			 * We set up a timeout in case the other part never
2278 			 * appears, so we won't wait forever.
2279 			 */
2280 			callout_reset(&sc->sc_callout, 5 * hz,
2281 			    g_journal_timeout, sc);
2282 		}
2283 	}
2284 
2285 	/* Remember type of the data provider. */
2286 	if (md->md_type & GJ_TYPE_DATA)
2287 		sc->sc_orig_type = md->md_type;
2288 	sc->sc_type |= md->md_type;
2289 	cp = NULL;
2290 
2291 	if (md->md_type & GJ_TYPE_DATA) {
2292 		if (md->md_flags & GJ_FLAG_CLEAN)
2293 			sc->sc_flags |= GJF_DEVICE_CLEAN;
2294 		if (md->md_flags & GJ_FLAG_CHECKSUM)
2295 			sc->sc_flags |= GJF_DEVICE_CHECKSUM;
2296 		cp = g_new_consumer(gp);
2297 		error = g_attach(cp, pp);
2298 		KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2299 		    pp->name, error));
2300 		error = g_access(cp, 1, 1, 1);
2301 		if (error != 0) {
2302 			GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
2303 			    error);
2304 			g_journal_destroy(sc);
2305 			return (NULL);
2306 		}
2307 		sc->sc_dconsumer = cp;
2308 		sc->sc_mediasize = pp->mediasize - pp->sectorsize;
2309 		sc->sc_sectorsize = pp->sectorsize;
2310 		sc->sc_jstart = md->md_jstart;
2311 		sc->sc_jend = md->md_jend;
2312 		if (md->md_provider[0] != '\0')
2313 			sc->sc_flags |= GJF_DEVICE_HARDCODED;
2314 		sc->sc_journal_offset = md->md_joffset;
2315 		sc->sc_journal_id = md->md_jid;
2316 		sc->sc_journal_previous_id = md->md_jid;
2317 	}
2318 	if (md->md_type & GJ_TYPE_JOURNAL) {
2319 		if (cp == NULL) {
2320 			cp = g_new_consumer(gp);
2321 			error = g_attach(cp, pp);
2322 			KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2323 			    pp->name, error));
2324 			error = g_access(cp, 1, 1, 1);
2325 			if (error != 0) {
2326 				GJ_DEBUG(0, "Cannot access %s (error=%d).",
2327 				    pp->name, error);
2328 				g_journal_destroy(sc);
2329 				return (NULL);
2330 			}
2331 		} else {
2332 			/*
2333 			 * The journal is on the same provider as the data, which
2334 			 * means the data provider ends where the journal starts.
2335 			 */
2336 			sc->sc_mediasize = md->md_jstart;
2337 		}
2338 		sc->sc_jconsumer = cp;
2339 	}
2340 
2341 	/* Start switcher kproc if needed. */
2342 	if (g_journal_switcher_proc == NULL)
2343 		g_journal_start_switcher(mp);
2344 
2345 	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
2346 		/* Journal is not complete yet. */
2347 		return (gp);
2348 	} else {
2349 		/* Journal complete, cancel timeout. */
2350 		callout_drain(&sc->sc_callout);
2351 	}
2352 
2353 	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
2354 	    "g_journal %s", sc->sc_name);
2355 	if (error != 0) {
2356 		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
2357 		    sc->sc_name);
2358 		g_journal_destroy(sc);
2359 		return (NULL);
2360 	}
2361 
2362 	return (gp);
2363 }
2364 
2365 static void
2366 g_journal_destroy_consumer(void *arg, int flags __unused)
2367 {
2368 	struct g_consumer *cp;
2369 
2370 	g_topology_assert();
2371 	cp = arg;
2372 	g_detach(cp);
2373 	g_destroy_consumer(cp);
2374 }
2375 
2376 static int
2377 g_journal_destroy(struct g_journal_softc *sc)
2378 {
2379 	struct g_geom *gp;
2380 	struct g_provider *pp;
2381 	struct g_consumer *cp;
2382 
2383 	g_topology_assert();
2384 
2385 	if (sc == NULL)
2386 		return (ENXIO);
2387 
2388 	gp = sc->sc_geom;
2389 	pp = LIST_FIRST(&gp->provider);
2390 	if (pp != NULL) {
2391 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
2392 			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
2393 			    pp->name, pp->acr, pp->acw, pp->ace);
2394 			return (EBUSY);
2395 		}
2396 		g_error_provider(pp, ENXIO);
2397 
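		/*
		 * Flush out any buffered records and force a final switch, so
		 * the remaining journal contents are copied onto the data
		 * provider before the worker thread is asked to exit.
		 */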
2398 		g_journal_flush(sc);
2399 		g_journal_flush_send(sc);
2400 		g_journal_switch(sc);
2401 	}
2402 
2403 	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
2404 
2405 	g_topology_unlock();
2406 
2407 	if (sc->sc_rootmount != NULL) {
2408 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2409 		root_mount_rel(sc->sc_rootmount);
2410 		sc->sc_rootmount = NULL;
2411 	}
2412 
2413 	callout_drain(&sc->sc_callout);
2414 	mtx_lock(&sc->sc_mtx);
2415 	wakeup(sc);
2416 	while (sc->sc_worker != NULL)
2417 		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
2418 	mtx_unlock(&sc->sc_mtx);
2419 
2420 	if (pp != NULL) {
2421 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
2422 		g_journal_metadata_update(sc);
2423 		g_topology_lock();
2424 		g_wither_provider(pp, ENXIO);
2425 	} else {
2426 		g_topology_lock();
2427 	}
2428 	mtx_destroy(&sc->sc_mtx);
2429 
2430 	if (sc->sc_current_count != 0) {
2431 		GJ_DEBUG(0, "Warning! Number of current requests %d.",
2432 		    sc->sc_current_count);
2433 	}
2434 
2435 	gp->softc = NULL;
2436 	LIST_FOREACH(cp, &gp->consumer, consumer) {
2437 		if (cp->acr + cp->acw + cp->ace > 0)
2438 			g_access(cp, -1, -1, -1);
2439 		/*
2440 		 * We keep all consumers open for writing, so if we detached
2441 		 * and destroyed the consumer here, its provider would be
2442 		 * offered for tasting and the journal would be started
2443 		 * again. Sending an event here prevents this from happening.
2444 		 */
2445 		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
2446 	}
2447 	g_wither_geom(gp, ENXIO);
2448 	free(sc, M_JOURNAL);
2449 	return (0);
2450 }
2451 
2452 static void
2453 g_journal_taste_orphan(struct g_consumer *cp)
2454 {
2455 
2456 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2457 	    cp->provider->name));
2458 }
2459 
2460 static struct g_geom *
2461 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2462 {
2463 	struct g_journal_metadata md;
2464 	struct g_consumer *cp;
2465 	struct g_geom *gp;
2466 	int error;
2467 
2468 	g_topology_assert();
2469 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2470 	GJ_DEBUG(2, "Tasting %s.", pp->name);
2471 	if (pp->geom->class == mp)
2472 		return (NULL);
2473 
2474 	gp = g_new_geomf(mp, "journal:taste");
2475 	/* This orphan function should never be called. */
2476 	gp->orphan = g_journal_taste_orphan;
2477 	cp = g_new_consumer(gp);
2478 	g_attach(cp, pp);
2479 	error = g_journal_metadata_read(cp, &md);
2480 	g_detach(cp);
2481 	g_destroy_consumer(cp);
2482 	g_destroy_geom(gp);
2483 	if (error != 0)
2484 		return (NULL);
2485 	gp = NULL;
2486 
2487 	if (md.md_provider[0] != '\0' &&
2488 	    !g_compare_names(md.md_provider, pp->name))
2489 		return (NULL);
2490 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2491 		return (NULL);
2492 	if (g_journal_debug >= 2)
2493 		journal_metadata_dump(&md);
2494 
2495 	gp = g_journal_create(mp, pp, &md);
2496 	return (gp);
2497 }
2498 
2499 static struct g_journal_softc *
2500 g_journal_find_device(struct g_class *mp, const char *name)
2501 {
2502 	struct g_journal_softc *sc;
2503 	struct g_geom *gp;
2504 	struct g_provider *pp;
2505 
2506 	if (strncmp(name, "/dev/", 5) == 0)
2507 		name += 5;
2508 	LIST_FOREACH(gp, &mp->geom, geom) {
2509 		sc = gp->softc;
2510 		if (sc == NULL)
2511 			continue;
2512 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
2513 			continue;
2514 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2515 			continue;
2516 		pp = LIST_FIRST(&gp->provider);
2517 		if (strcmp(sc->sc_name, name) == 0)
2518 			return (sc);
2519 		if (pp != NULL && strcmp(pp->name, name) == 0)
2520 			return (sc);
2521 	}
2522 	return (NULL);
2523 }
2524 
2525 static void
2526 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
2527 {
2528 	struct g_journal_softc *sc;
2529 	const char *name;
2530 	char param[16];
2531 	int *nargs;
2532 	int error, i;
2533 
2534 	g_topology_assert();
2535 
2536 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
2537 	if (nargs == NULL) {
2538 		gctl_error(req, "No '%s' argument.", "nargs");
2539 		return;
2540 	}
2541 	if (*nargs <= 0) {
2542 		gctl_error(req, "Missing device(s).");
2543 		return;
2544 	}
2545 
2546 	for (i = 0; i < *nargs; i++) {
2547 		snprintf(param, sizeof(param), "arg%d", i);
2548 		name = gctl_get_asciiparam(req, param);
2549 		if (name == NULL) {
2550 			gctl_error(req, "No 'arg%d' argument.", i);
2551 			return;
2552 		}
2553 		sc = g_journal_find_device(mp, name);
2554 		if (sc == NULL) {
2555 			gctl_error(req, "No such device: %s.", name);
2556 			return;
2557 		}
2558 		error = g_journal_destroy(sc);
2559 		if (error != 0) {
2560 			gctl_error(req, "Cannot destroy device %s (error=%d).",
2561 			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
2562 			return;
2563 		}
2564 	}
2565 }
2566 
2567 static void
2568 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
2569 {
2570 
2571 	g_topology_assert();
2572 	g_topology_unlock();
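	/*
	 * Wake up the switcher thread and wait until it clears
	 * g_journal_sync_requested, which signals that the requested
	 * synchronization has completed.
	 */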
2573 	g_journal_sync_requested++;
2574 	wakeup(&g_journal_switcher_state);
2575 	while (g_journal_sync_requested > 0)
2576 		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
2577 	g_topology_lock();
2578 }
2579 
2580 static void
2581 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
2582 {
2583 	uint32_t *version;
2584 
2585 	g_topology_assert();
2586 
2587 	version = gctl_get_paraml(req, "version", sizeof(*version));
2588 	if (version == NULL) {
2589 		gctl_error(req, "No '%s' argument.", "version");
2590 		return;
2591 	}
2592 	if (*version != G_JOURNAL_VERSION) {
2593 		gctl_error(req, "Userland and kernel parts are out of sync.");
2594 		return;
2595 	}
2596 
2597 	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
2598 		g_journal_ctl_destroy(req, mp);
2599 		return;
2600 	} else if (strcmp(verb, "sync") == 0) {
2601 		g_journal_ctl_sync(req, mp);
2602 		return;
2603 	}
2604 
2605 	gctl_error(req, "Unknown verb.");
2606 }
2607 
2608 static void
2609 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2610     struct g_consumer *cp, struct g_provider *pp)
2611 {
2612 	struct g_journal_softc *sc;
2613 
2614 	g_topology_assert();
2615 
2616 	sc = gp->softc;
2617 	if (sc == NULL)
2618 		return;
2619 	if (pp != NULL) {
2620 		/* Nothing here. */
2621 	} else if (cp != NULL) {
2622 		int first = 1;
2623 
2624 		sbuf_printf(sb, "%s<Role>", indent);
2625 		if (cp == sc->sc_dconsumer) {
2626 			sbuf_printf(sb, "Data");
2627 			first = 0;
2628 		}
2629 		if (cp == sc->sc_jconsumer) {
2630 			if (!first)
2631 				sbuf_printf(sb, ",");
2632 			sbuf_printf(sb, "Journal");
2633 		}
2634 		sbuf_printf(sb, "</Role>\n");
2635 		if (cp == sc->sc_jconsumer) {
2636 			sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
2637 			    (intmax_t)sc->sc_jstart);
2638 			sbuf_printf(sb, "<Jend>%jd</Jend>\n",
2639 			    (intmax_t)sc->sc_jend);
2640 		}
2641 	} else {
2642 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
2643 	}
2644 }
2645 
2646 static eventhandler_tag g_journal_event_shutdown = NULL;
2647 static eventhandler_tag g_journal_event_lowmem = NULL;
2648 
2649 static void
2650 g_journal_shutdown(void *arg, int howto __unused)
2651 {
2652 	struct g_class *mp;
2653 	struct g_geom *gp, *gp2;
2654 
2655 	if (panicstr != NULL)
2656 		return;
2657 	mp = arg;
2658 	g_topology_lock();
2659 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2660 		if (gp->softc == NULL)
2661 			continue;
2662 		GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
2663 		g_journal_destroy(gp->softc);
2664 	}
2665 	g_topology_unlock();
2666 }
2667 
2668 /*
2669  * Free cached requests from the inactive queue in case of low memory.
2670  * We free GJ_FREE_AT_ONCE elements at once.
2671  */
2672 #define	GJ_FREE_AT_ONCE	4
2673 static void
2674 g_journal_lowmem(void *arg, int howto __unused)
2675 {
2676 	struct g_journal_softc *sc;
2677 	struct g_class *mp;
2678 	struct g_geom *gp;
2679 	struct bio *bp;
2680 	u_int nfree = GJ_FREE_AT_ONCE;
2681 
2682 	g_journal_stats_low_mem++;
2683 	mp = arg;
2684 	g_topology_lock();
2685 	LIST_FOREACH(gp, &mp->geom, geom) {
2686 		sc = gp->softc;
2687 		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
2688 			continue;
2689 		mtx_lock(&sc->sc_mtx);
2690 		for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
2691 		    nfree--, bp = bp->bio_next) {
2692 			/*
2693 			 * It is safe to free the bio_data here, because:
2694 			 * 1. If bio_data is NULL it will be read from the
2695 			 *    inactive journal.
2696 			 * 2. If bp is sent down, it is first removed from the
2697 			 *    inactive queue, so it's impossible to free the
2698 			 *    data from under an in-flight bio.
2699 			 * On the other hand, freeing elements from the active
2700 			 * queue is not safe.
2701 			 */
2702 			if (bp->bio_data != NULL) {
2703 				GJ_DEBUG(2, "Freeing data from %s.",
2704 				    sc->sc_name);
2705 				gj_free(bp->bio_data, bp->bio_length);
2706 				bp->bio_data = NULL;
2707 			}
2708 		}
2709 		mtx_unlock(&sc->sc_mtx);
2710 		if (nfree == 0)
2711 			break;
2712 	}
2713 	g_topology_unlock();
2714 }
2715 
2716 static void g_journal_switcher(void *arg);
2717 
2718 static void
2719 g_journal_init(struct g_class *mp)
2720 {
2721 
2722 	/* Pick a conservative value if the provided value is unreasonable. */
2723 	if (g_journal_cache_divisor <= 0 ||
2724 	    (vm_kmem_size / g_journal_cache_divisor == 0)) {
2725 		g_journal_cache_divisor = 5;
2726 	}
2727 	if (g_journal_cache_limit > 0) {
2728 		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
2729 		g_journal_cache_low =
2730 		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
2731 	}
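	/*
	 * For example, with a divisor of 5 and 1 GB of vm_kmem_size, the
	 * cache limit becomes about 200 MB and the low watermark is
	 * g_journal_cache_switch percent of that limit.
	 */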
2732 	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
2733 	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
2734 	if (g_journal_event_shutdown == NULL)
2735 		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
2736 	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
2737 	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
2738 	if (g_journal_event_lowmem == NULL)
2739 		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
2740 }
2741 
2742 static void
2743 g_journal_fini(struct g_class *mp)
2744 {
2745 
2746 	if (g_journal_event_shutdown != NULL) {
2747 		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
2748 		    g_journal_event_shutdown);
2749 	}
2750 	if (g_journal_event_lowmem != NULL)
2751 		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
2752 	g_journal_stop_switcher();
2753 }
2754 
2755 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
2756 
2757 static const struct g_journal_desc *
2758 g_journal_find_desc(const char *fstype)
2759 {
2760 	const struct g_journal_desc *desc;
2761 	int i;
2762 
2763 	for (desc = g_journal_filesystems[i = 0]; desc != NULL;
2764 	     desc = g_journal_filesystems[++i]) {
2765 		if (strcmp(desc->jd_fstype, fstype) == 0)
2766 			break;
2767 	}
2768 	return (desc);
2769 }
2770 
2771 static void
2772 g_journal_switch_wait(struct g_journal_softc *sc)
2773 {
2774 	struct bintime bt;
2775 
2776 	mtx_assert(&sc->sc_mtx, MA_OWNED);
2777 	if (g_journal_debug >= 2) {
2778 		if (sc->sc_flush_in_progress > 0) {
2779 			GJ_DEBUG(2, "%d requests flushing.",
2780 			    sc->sc_flush_in_progress);
2781 		}
2782 		if (sc->sc_copy_in_progress > 0) {
2783 			GJ_DEBUG(2, "%d requests copying.",
2784 			    sc->sc_copy_in_progress);
2785 		}
2786 		if (sc->sc_flush_count > 0) {
2787 			GJ_DEBUG(2, "%d requests to flush.",
2788 			    sc->sc_flush_count);
2789 		}
2790 		if (sc->sc_delayed_count > 0) {
2791 			GJ_DEBUG(2, "%d requests delayed.",
2792 			    sc->sc_delayed_count);
2793 		}
2794 	}
2795 	g_journal_stats_switches++;
2796 	if (sc->sc_copy_in_progress > 0)
2797 		g_journal_stats_wait_for_copy++;
2798 	GJ_TIMER_START(1, &bt);
2799 	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2800 	sc->sc_flags |= GJF_DEVICE_SWITCH;
2801 	wakeup(sc);
2802 	while (sc->sc_flags & GJF_DEVICE_SWITCH) {
2803 		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
2804 		    "gj:switch", 0);
2805 	}
2806 	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
2807 }
2808 
2809 static void
2810 g_journal_do_switch(struct g_class *classp)
2811 {
2812 	struct g_journal_softc *sc;
2813 	const struct g_journal_desc *desc;
2814 	struct g_geom *gp;
2815 	struct mount *mp;
2816 	struct bintime bt;
2817 	char *mountpoint;
2818 	int error, save;
2819 
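	/*
	 * First mark every complete, non-destroyed journal as "before
	 * switch". Then, for each read-write gjournal-backed file system:
	 * sync it, flush the device cache, suspend writes, mark it clean and
	 * perform the journal switch before resuming writes. Finally, switch
	 * any journals still marked, e.g. those without a mounted file
	 * system.
	 */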
2820 	g_topology_lock();
2821 	LIST_FOREACH(gp, &classp->geom, geom) {
2822 		sc = gp->softc;
2823 		if (sc == NULL)
2824 			continue;
2825 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
2826 			continue;
2827 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2828 			continue;
2829 		mtx_lock(&sc->sc_mtx);
2830 		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
2831 		mtx_unlock(&sc->sc_mtx);
2832 	}
2833 	g_topology_unlock();
2834 
2835 	mtx_lock(&mountlist_mtx);
2836 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2837 		if (mp->mnt_gjprovider == NULL)
2838 			continue;
2839 		if (mp->mnt_flag & MNT_RDONLY)
2840 			continue;
2841 		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
2842 		if (desc == NULL)
2843 			continue;
2844 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
2845 			continue;
2846 		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
2847 
2848 		g_topology_lock();
2849 		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
2850 		g_topology_unlock();
2851 
2852 		if (sc == NULL) {
2853 			GJ_DEBUG(0, "Cannot find journal geom for %s.",
2854 			    mp->mnt_gjprovider);
2855 			goto next;
2856 		} else if (JEMPTY(sc)) {
2857 			mtx_lock(&sc->sc_mtx);
2858 			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2859 			mtx_unlock(&sc->sc_mtx);
2860 			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
2861 			goto next;
2862 		}
2863 
2864 		mountpoint = mp->mnt_stat.f_mntonname;
2865 
2866 		error = vn_start_write(NULL, &mp, V_WAIT);
2867 		if (error != 0) {
2868 			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
2869 			    mountpoint, error);
2870 			goto next;
2871 		}
2872 
2873 		save = curthread_pflags_set(TDP_SYNCIO);
2874 
2875 		GJ_TIMER_START(1, &bt);
2876 		vfs_msync(mp, MNT_NOWAIT);
2877 		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
2878 
2879 		GJ_TIMER_START(1, &bt);
2880 		error = VFS_SYNC(mp, MNT_NOWAIT);
2881 		if (error == 0)
2882 			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
2883 		else {
2884 			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
2885 			    mountpoint, error);
2886 		}
2887 
2888 		curthread_pflags_restore(save);
2889 
2890 		vn_finished_write(mp);
2891 
2892 		if (error != 0)
2893 			goto next;
2894 
2895 		/*
2896 		 * Send BIO_FLUSH before freezing the file system, so there is
2897 		 * less to flush while the file system is suspended.
2898 		 */
2899 		GJ_TIMER_START(1, &bt);
2900 		g_journal_flush_cache(sc);
2901 		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
2902 
2903 		GJ_TIMER_START(1, &bt);
2904 		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
2905 		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
2906 		if (error != 0) {
2907 			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
2908 			    mountpoint, error);
2909 			goto next;
2910 		}
2911 
2912 		error = desc->jd_clean(mp);
2913 		if (error != 0)
2914 			goto next;
2915 
2916 		mtx_lock(&sc->sc_mtx);
2917 		g_journal_switch_wait(sc);
2918 		mtx_unlock(&sc->sc_mtx);
2919 
2920 		vfs_write_resume(mp, 0);
2921 next:
2922 		mtx_lock(&mountlist_mtx);
2923 		vfs_unbusy(mp);
2924 	}
2925 	mtx_unlock(&mountlist_mtx);
2926 
2927 	sc = NULL;
2928 	for (;;) {
2929 		g_topology_lock();
2930 		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
2931 			sc = gp->softc;
2932 			if (sc == NULL)
2933 				continue;
2934 			mtx_lock(&sc->sc_mtx);
2935 			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
2936 			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
2937 			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
2938 				break;
2939 			}
2940 			mtx_unlock(&sc->sc_mtx);
2941 			sc = NULL;
2942 		}
2943 		g_topology_unlock();
2944 		if (sc == NULL)
2945 			break;
2946 		mtx_assert(&sc->sc_mtx, MA_OWNED);
2947 		g_journal_switch_wait(sc);
2948 		mtx_unlock(&sc->sc_mtx);
2949 	}
2950 }
2951 
2952 static void
2953 g_journal_start_switcher(struct g_class *mp)
2954 {
2955 	int error;
2956 
2957 	g_topology_assert();
2958 	MPASS(g_journal_switcher_proc == NULL);
2959 	g_journal_switcher_state = GJ_SWITCHER_WORKING;
2960 	error = kproc_create(g_journal_switcher, mp, &g_journal_switcher_proc,
2961 	    0, 0, "g_journal switcher");
2962 	KASSERT(error == 0, ("Cannot create switcher thread."));
2963 }
2964 
2965 static void
2966 g_journal_stop_switcher(void)
2967 {
2968 	g_topology_assert();
2969 	MPASS(g_journal_switcher_proc != NULL);
2970 	g_journal_switcher_state = GJ_SWITCHER_DIE;
2971 	wakeup(&g_journal_switcher_state);
2972 	while (g_journal_switcher_state != GJ_SWITCHER_DIED)
2973 		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
2974 	GJ_DEBUG(1, "Switcher died.");
2975 	g_journal_switcher_proc = NULL;
2976 }
2977 
2978 /*
2979  * TODO: Kill switcher thread on last geom destruction?
2980  */
2981 static void
2982 g_journal_switcher(void *arg)
2983 {
2984 	struct g_class *mp;
2985 	struct bintime bt;
2986 	int error;
2987 
2988 	mp = arg;
2989 	curthread->td_pflags |= TDP_NORUNNINGBUF;
2990 	for (;;) {
2991 		g_journal_switcher_wokenup = 0;
2992 		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
2993 		    g_journal_switch_time * hz);
2994 		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
2995 			g_journal_switcher_state = GJ_SWITCHER_DIED;
2996 			GJ_DEBUG(1, "Switcher exiting.");
2997 			wakeup(&g_journal_switcher_state);
2998 			kproc_exit(0);
2999 		}
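		/*
		 * tsleep() returning 0 means we were woken up explicitly
		 * rather than timing out; if userland did not request a sync,
		 * the wakeup came from the journal cache filling up.
		 */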
3000 		if (error == 0 && g_journal_sync_requested == 0) {
3001 			GJ_DEBUG(1, "Out of cache, force switch (used=%jd "
3002 			    "limit=%jd).", (intmax_t)g_journal_cache_used,
3003 			    (intmax_t)g_journal_cache_limit);
3004 		}
3005 		GJ_TIMER_START(1, &bt);
3006 		g_journal_do_switch(mp);
3007 		GJ_TIMER_STOP(1, &bt, "Entire switch time");
3008 		if (g_journal_sync_requested > 0) {
3009 			g_journal_sync_requested = 0;
3010 			wakeup(&g_journal_sync_requested);
3011 		}
3012 	}
3013 }
3014