xref: /linux/fs/bcachefs/journal_io.c (revision f96a974170b749e3a56844e25b31d46a7233b6f6)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_background.h"
4 #include "alloc_foreground.h"
5 #include "btree_io.h"
6 #include "btree_update_interior.h"
7 #include "btree_write_buffer.h"
8 #include "buckets.h"
9 #include "checksum.h"
10 #include "disk_groups.h"
11 #include "error.h"
12 #include "journal.h"
13 #include "journal_io.h"
14 #include "journal_reclaim.h"
15 #include "journal_seq_blacklist.h"
16 #include "replicas.h"
17 #include "sb-clean.h"
18 #include "trace.h"
19 
20 #include <linux/string_choices.h>
21 
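/*
 * Record each device's current journal position (bucket index, and offset
 * within that bucket) in the superblock member info, so the next mount can
 * resume writing where we left off; caller must hold sb_lock.
 */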
22 void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
23 {
24 	lockdep_assert_held(&c->sb_lock);
25 
26 	for_each_member_device(c, ca) {
27 		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
28 
29 		m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
30 		m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
31 	}
32 }
33 
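/*
 * Restore each device's journal position from the superblock member info,
 * sanity checking the saved bucket index and offset against the device's
 * current journal size and bucket size.
 */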
34 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
35 {
36 	mutex_lock(&c->sb_lock);
37 	for_each_member_device(c, ca) {
38 		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
39 
40 		unsigned idx = le32_to_cpu(m.last_journal_bucket);
41 		if (idx < ca->journal.nr)
42 			ca->journal.cur_idx = idx;
43 		unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
44 		if (offset <= ca->mi.bucket_size)
45 			ca->journal.sectors_free = ca->mi.bucket_size - offset;
46 	}
47 	mutex_unlock(&c->sb_lock);
48 }
49 
50 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
51 			       struct journal_replay *j)
52 {
53 	darray_for_each(j->ptrs, i) {
54 		if (i != j->ptrs.data)
55 			prt_printf(out, " ");
56 		prt_printf(out, "%u:%u:%u (sector %llu)",
57 			   i->dev, i->bucket, i->bucket_offset, i->sector);
58 	}
59 }
60 
61 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
62 					struct journal_replay *j)
63 {
64 	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
65 
66 	bch2_journal_ptrs_to_text(out, c, j);
67 
68 	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
69 		struct jset_entry_datetime *datetime =
70 			container_of(entry, struct jset_entry_datetime, entry);
71 		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
72 		break;
73 	}
74 }
75 
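/* Nonce for journal entry checksums/encryption, derived from the entry's seq: */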
76 static struct nonce journal_nonce(const struct jset *jset)
77 {
78 	return (struct nonce) {{
79 		[0] = 0,
80 		[1] = ((__le32 *) &jset->seq)[0],
81 		[2] = ((__le32 *) &jset->seq)[1],
82 		[3] = BCH_NONCE_JOURNAL,
83 	}};
84 }
85 
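/*
 * Recompute a journal entry's checksum and compare it against the one stored
 * in the entry; returns false (with *csum zeroed) if the checksum type is
 * unrecognized.
 */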
86 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
87 {
88 	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
89 		*csum = (struct bch_csum) {};
90 		return false;
91 	}
92 
93 	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
94 	return !bch2_crc_cmp(j->csum, *csum);
95 }
96 
97 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
98 {
99 	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
100 }
101 
102 static void __journal_replay_free(struct bch_fs *c,
103 				  struct journal_replay *i)
104 {
105 	struct journal_replay **p =
106 		genradix_ptr(&c->journal_entries,
107 			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
108 
109 	BUG_ON(*p != i);
110 	*p = NULL;
111 	kvfree(i);
112 }
113 
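/*
 * Mark a journal entry as one we won't replay (blacklisted, or no longer
 * dirty), and free it unless we're keeping the entire journal around
 * (read_entire_journal).
 */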
114 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
115 {
116 	if (blacklisted)
117 		i->ignore_blacklisted = true;
118 	else
119 		i->ignore_not_dirty = true;
120 
121 	if (!c->opts.read_entire_journal)
122 		__journal_replay_free(c, i);
123 }
124 
125 struct journal_list {
126 	struct closure		cl;
127 	u64			last_seq;
128 	struct mutex		lock;
129 	int			ret;
130 };
131 
132 #define JOURNAL_ENTRY_ADD_OK		0
133 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5
134 
135 /*
136  * Given a journal entry we just read, add it to the list of journal entries to
137  * be replayed:
138  */
139 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
140 			     struct journal_ptr entry_ptr,
141 			     struct journal_list *jlist, struct jset *j)
142 {
143 	struct genradix_iter iter;
144 	struct journal_replay **_i, *i, *dup;
145 	size_t bytes = vstruct_bytes(j);
146 	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
147 	struct printbuf buf = PRINTBUF;
148 	int ret = JOURNAL_ENTRY_ADD_OK;
149 
150 	if (!c->journal.oldest_seq_found_ondisk ||
151 	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
152 		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
153 
154 	/* Is this entry older than the range we need? */
155 	if (!c->opts.read_entire_journal &&
156 	    le64_to_cpu(j->seq) < jlist->last_seq)
157 		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
158 
159 	/*
160 	 * genradixes are indexed by a ulong, not a u64, so we can't index them
161 	 * by sequence number directly: Assume instead that they will all fall
162 	 * within the range of +-2 billion of the first one we find.
163 	 */
164 	if (!c->journal_entries_base_seq)
165 		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
166 
167 	/* Drop entries we don't need anymore */
168 	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
169 		genradix_for_each_from(&c->journal_entries, iter, _i,
170 				       journal_entry_radix_idx(c, jlist->last_seq)) {
171 			i = *_i;
172 
173 			if (journal_replay_ignore(i))
174 				continue;
175 
176 			if (le64_to_cpu(i->j.seq) >= last_seq)
177 				break;
178 
179 			journal_replay_free(c, i, false);
180 		}
181 	}
182 
183 	jlist->last_seq = max(jlist->last_seq, last_seq);
184 
185 	_i = genradix_ptr_alloc(&c->journal_entries,
186 				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
187 				GFP_KERNEL);
188 	if (!_i)
189 		return -BCH_ERR_ENOMEM_journal_entry_add;
190 
191 	/*
192 	 * Duplicate journal entries? If so we want the one that didn't have a
193 	 * checksum error:
194 	 */
195 	dup = *_i;
196 	if (dup) {
197 		bool identical = bytes == vstruct_bytes(&dup->j) &&
198 			!memcmp(j, &dup->j, bytes);
199 		bool not_identical = !identical &&
200 			entry_ptr.csum_good &&
201 			dup->csum_good;
202 
203 		bool same_device = false;
204 		darray_for_each(dup->ptrs, ptr)
205 			if (ptr->dev == ca->dev_idx)
206 				same_device = true;
207 
208 		ret = darray_push(&dup->ptrs, entry_ptr);
209 		if (ret)
210 			goto out;
211 
212 		bch2_journal_replay_to_text(&buf, c, dup);
213 
214 		fsck_err_on(same_device,
215 			    c, journal_entry_dup_same_device,
216 			    "duplicate journal entry on same device\n  %s",
217 			    buf.buf);
218 
219 		fsck_err_on(not_identical,
220 			    c, journal_entry_replicas_data_mismatch,
221 			    "found duplicate but non identical journal entries\n  %s",
222 			    buf.buf);
223 
224 		if (entry_ptr.csum_good && !identical)
225 			goto replace;
226 
227 		goto out;
228 	}
229 replace:
230 	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
231 	if (!i)
232 		return -BCH_ERR_ENOMEM_journal_entry_add;
233 
234 	darray_init(&i->ptrs);
235 	i->csum_good		= entry_ptr.csum_good;
236 	i->ignore_blacklisted	= false;
237 	i->ignore_not_dirty	= false;
238 	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
239 
240 	if (dup) {
241 		/* The first ptr should represent the jset we kept: */
242 		darray_for_each(dup->ptrs, ptr)
243 			darray_push(&i->ptrs, *ptr);
244 		__journal_replay_free(c, dup);
245 	} else {
246 		darray_push(&i->ptrs, entry_ptr);
247 	}
248 
249 	*_i = i;
250 out:
251 fsck_err:
252 	printbuf_exit(&buf);
253 	return ret;
254 }
255 
256 /* this fills in a range with empty jset_entries: */
257 static void journal_entry_null_range(void *start, void *end)
258 {
259 	struct jset_entry *entry;
260 
261 	for (entry = start; entry != end; entry = vstruct_next(entry))
262 		memset(entry, 0, sizeof(*entry));
263 }
264 
265 #define JOURNAL_ENTRY_REREAD	5
266 #define JOURNAL_ENTRY_NONE	6
267 #define JOURNAL_ENTRY_BAD	7
268 
269 static void journal_entry_err_msg(struct printbuf *out,
270 				  u32 version,
271 				  struct jset *jset,
272 				  struct jset_entry *entry)
273 {
274 	prt_str(out, "invalid journal entry, version=");
275 	bch2_version_to_text(out, version);
276 
277 	if (entry) {
278 		prt_str(out, " type=");
279 		bch2_prt_jset_entry_type(out, entry->type);
280 	}
281 
282 	if (!jset) {
283 		prt_printf(out, " in superblock");
284 	} else {
285 
286 		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
287 
288 		if (entry)
289 			prt_printf(out, " offset=%zi/%u",
290 				   (u64 *) entry - jset->_data,
291 				   le32_to_cpu(jset->u64s));
292 	}
293 
294 	prt_str(out, ": ");
295 }
296 
297 #define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
298 ({									\
299 	struct printbuf _buf = PRINTBUF;				\
300 									\
301 	journal_entry_err_msg(&_buf, version, jset, entry);		\
302 	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
303 									\
304 	switch (from.flags & BCH_VALIDATE_write) {			\
305 	case READ:							\
306 		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
307 		break;							\
308 	case WRITE:							\
309 		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
310 		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
311 		if (bch2_fs_inconsistent(c)) {				\
312 			ret = -BCH_ERR_fsck_errors_not_fixed;		\
313 			goto fsck_err;					\
314 		}							\
315 		break;							\
316 	}								\
317 									\
318 	printbuf_exit(&_buf);						\
319 	true;								\
320 })
321 
322 #define journal_entry_err_on(cond, ...)					\
323 	((cond) ? journal_entry_err(__VA_ARGS__) : false)
324 
325 #define FSCK_DELETED_KEY	5
326 
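/*
 * Validate a single bkey in a journal entry: invalid keys are removed from the
 * entry (returning FSCK_DELETED_KEY) rather than failing the whole journal
 * read.
 */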
327 static int journal_validate_key(struct bch_fs *c,
328 				struct jset *jset,
329 				struct jset_entry *entry,
330 				struct bkey_i *k,
331 				struct bkey_validate_context from,
332 				unsigned version, int big_endian)
333 {
334 	enum bch_validate_flags flags = from.flags;
335 	int write = flags & BCH_VALIDATE_write;
336 	void *next = vstruct_next(entry);
337 	int ret = 0;
338 
339 	if (journal_entry_err_on(!k->k.u64s,
340 				 c, version, jset, entry,
341 				 journal_entry_bkey_u64s_0,
342 				 "k->u64s 0")) {
343 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
344 		journal_entry_null_range(vstruct_next(entry), next);
345 		return FSCK_DELETED_KEY;
346 	}
347 
348 	if (journal_entry_err_on((void *) bkey_next(k) >
349 				 (void *) vstruct_next(entry),
350 				 c, version, jset, entry,
351 				 journal_entry_bkey_past_end,
352 				 "extends past end of journal entry")) {
353 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
354 		journal_entry_null_range(vstruct_next(entry), next);
355 		return FSCK_DELETED_KEY;
356 	}
357 
358 	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
359 				 c, version, jset, entry,
360 				 journal_entry_bkey_bad_format,
361 				 "bad format %u", k->k.format)) {
362 		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
363 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
364 		journal_entry_null_range(vstruct_next(entry), next);
365 		return FSCK_DELETED_KEY;
366 	}
367 
368 	if (!write)
369 		bch2_bkey_compat(from.level, from.btree, version, big_endian,
370 				 write, NULL, bkey_to_packed(k));
371 
372 	ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
373 	if (ret == -BCH_ERR_fsck_delete_bkey) {
374 		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
375 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
376 		journal_entry_null_range(vstruct_next(entry), next);
377 		return FSCK_DELETED_KEY;
378 	}
379 	if (ret)
380 		goto fsck_err;
381 
382 	if (write)
383 		bch2_bkey_compat(from.level, from.btree, version, big_endian,
384 				 write, NULL, bkey_to_packed(k));
385 fsck_err:
386 	return ret;
387 }
388 
389 static int journal_entry_btree_keys_validate(struct bch_fs *c,
390 				struct jset *jset,
391 				struct jset_entry *entry,
392 				unsigned version, int big_endian,
393 				struct bkey_validate_context from)
394 {
395 	struct bkey_i *k = entry->start;
396 
397 	from.level	= entry->level;
398 	from.btree	= entry->btree_id;
399 
400 	while (k != vstruct_last(entry)) {
401 		int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
402 		if (ret == FSCK_DELETED_KEY)
403 			continue;
404 		else if (ret)
405 			return ret;
406 
407 		k = bkey_next(k);
408 	}
409 
410 	return 0;
411 }
412 
413 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
414 					     struct jset_entry *entry)
415 {
416 	bool first = true;
417 
418 	jset_entry_for_each_key(entry, k) {
419 		if (!first) {
420 			prt_newline(out);
421 			bch2_prt_jset_entry_type(out, entry->type);
422 			prt_str(out, ": ");
423 		}
424 		bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
425 		prt_char(out, ' ');
426 		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
427 		first = false;
428 	}
429 }
430 
431 static int journal_entry_btree_root_validate(struct bch_fs *c,
432 				struct jset *jset,
433 				struct jset_entry *entry,
434 				unsigned version, int big_endian,
435 				struct bkey_validate_context from)
436 {
437 	struct bkey_i *k = entry->start;
438 	int ret = 0;
439 
440 	from.root	= true;
441 	from.level	= entry->level + 1;
442 	from.btree	= entry->btree_id;
443 
444 	if (journal_entry_err_on(!entry->u64s ||
445 				 le16_to_cpu(entry->u64s) != k->k.u64s,
446 				 c, version, jset, entry,
447 				 journal_entry_btree_root_bad_size,
448 				 "invalid btree root journal entry: wrong number of keys")) {
449 		void *next = vstruct_next(entry);
450 		/*
451 		 * we don't want to null out this jset_entry,
452 		 * just the contents, so that later we can tell
453 		 * we were _supposed_ to have a btree root
454 		 */
455 		entry->u64s = 0;
456 		journal_entry_null_range(vstruct_next(entry), next);
457 		return 0;
458 	}
459 
460 	ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
461 	if (ret == FSCK_DELETED_KEY)
462 		ret = 0;
463 fsck_err:
464 	return ret;
465 }
466 
467 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
468 					     struct jset_entry *entry)
469 {
470 	journal_entry_btree_keys_to_text(out, c, entry);
471 }
472 
473 static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
474 				struct jset *jset,
475 				struct jset_entry *entry,
476 				unsigned version, int big_endian,
477 				struct bkey_validate_context from)
478 {
479 	/* obsolete, don't care: */
480 	return 0;
481 }
482 
483 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
484 					    struct jset_entry *entry)
485 {
486 }
487 
488 static int journal_entry_blacklist_validate(struct bch_fs *c,
489 				struct jset *jset,
490 				struct jset_entry *entry,
491 				unsigned version, int big_endian,
492 				struct bkey_validate_context from)
493 {
494 	int ret = 0;
495 
496 	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
497 				 c, version, jset, entry,
498 				 journal_entry_blacklist_bad_size,
499 		"invalid journal seq blacklist entry: bad size")) {
500 		journal_entry_null_range(entry, vstruct_next(entry));
501 	}
502 fsck_err:
503 	return ret;
504 }
505 
506 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
507 					    struct jset_entry *entry)
508 {
509 	struct jset_entry_blacklist *bl =
510 		container_of(entry, struct jset_entry_blacklist, entry);
511 
512 	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
513 }
514 
515 static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
516 				struct jset *jset,
517 				struct jset_entry *entry,
518 				unsigned version, int big_endian,
519 				struct bkey_validate_context from)
520 {
521 	struct jset_entry_blacklist_v2 *bl_entry;
522 	int ret = 0;
523 
524 	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
525 				 c, version, jset, entry,
526 				 journal_entry_blacklist_v2_bad_size,
527 		"invalid journal seq blacklist entry: bad size")) {
528 		journal_entry_null_range(entry, vstruct_next(entry));
529 		goto out;
530 	}
531 
532 	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
533 
534 	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
535 				 le64_to_cpu(bl_entry->end),
536 				 c, version, jset, entry,
537 				 journal_entry_blacklist_v2_start_past_end,
538 		"invalid journal seq blacklist entry: start > end")) {
539 		journal_entry_null_range(entry, vstruct_next(entry));
540 	}
541 out:
542 fsck_err:
543 	return ret;
544 }
545 
546 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
547 					       struct jset_entry *entry)
548 {
549 	struct jset_entry_blacklist_v2 *bl =
550 		container_of(entry, struct jset_entry_blacklist_v2, entry);
551 
552 	prt_printf(out, "start=%llu end=%llu",
553 	       le64_to_cpu(bl->start),
554 	       le64_to_cpu(bl->end));
555 }
556 
557 static int journal_entry_usage_validate(struct bch_fs *c,
558 				struct jset *jset,
559 				struct jset_entry *entry,
560 				unsigned version, int big_endian,
561 				struct bkey_validate_context from)
562 {
563 	struct jset_entry_usage *u =
564 		container_of(entry, struct jset_entry_usage, entry);
565 	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
566 	int ret = 0;
567 
568 	if (journal_entry_err_on(bytes < sizeof(*u),
569 				 c, version, jset, entry,
570 				 journal_entry_usage_bad_size,
571 				 "invalid journal entry usage: bad size")) {
572 		journal_entry_null_range(entry, vstruct_next(entry));
573 		return ret;
574 	}
575 
576 fsck_err:
577 	return ret;
578 }
579 
580 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
581 					struct jset_entry *entry)
582 {
583 	struct jset_entry_usage *u =
584 		container_of(entry, struct jset_entry_usage, entry);
585 
586 	prt_str(out, "type=");
587 	bch2_prt_fs_usage_type(out, u->entry.btree_id);
588 	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
589 }
590 
591 static int journal_entry_data_usage_validate(struct bch_fs *c,
592 				struct jset *jset,
593 				struct jset_entry *entry,
594 				unsigned version, int big_endian,
595 				struct bkey_validate_context from)
596 {
597 	struct jset_entry_data_usage *u =
598 		container_of(entry, struct jset_entry_data_usage, entry);
599 	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
600 	struct printbuf err = PRINTBUF;
601 	int ret = 0;
602 
603 	if (journal_entry_err_on(bytes < sizeof(*u) ||
604 				 bytes < sizeof(*u) + u->r.nr_devs,
605 				 c, version, jset, entry,
606 				 journal_entry_data_usage_bad_size,
607 				 "invalid journal entry usage: bad size")) {
608 		journal_entry_null_range(entry, vstruct_next(entry));
609 		goto out;
610 	}
611 
612 	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
613 				 c, version, jset, entry,
614 				 journal_entry_data_usage_bad_size,
615 				 "invalid journal entry usage: %s", err.buf)) {
616 		journal_entry_null_range(entry, vstruct_next(entry));
617 		goto out;
618 	}
619 out:
620 fsck_err:
621 	printbuf_exit(&err);
622 	return ret;
623 }
624 
625 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
626 					     struct jset_entry *entry)
627 {
628 	struct jset_entry_data_usage *u =
629 		container_of(entry, struct jset_entry_data_usage, entry);
630 
631 	bch2_replicas_entry_to_text(out, &u->r);
632 	prt_printf(out, "=%llu", le64_to_cpu(u->v));
633 }
634 
635 static int journal_entry_clock_validate(struct bch_fs *c,
636 				struct jset *jset,
637 				struct jset_entry *entry,
638 				unsigned version, int big_endian,
639 				struct bkey_validate_context from)
640 {
641 	struct jset_entry_clock *clock =
642 		container_of(entry, struct jset_entry_clock, entry);
643 	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
644 	int ret = 0;
645 
646 	if (journal_entry_err_on(bytes != sizeof(*clock),
647 				 c, version, jset, entry,
648 				 journal_entry_clock_bad_size,
649 				 "bad size")) {
650 		journal_entry_null_range(entry, vstruct_next(entry));
651 		return ret;
652 	}
653 
654 	if (journal_entry_err_on(clock->rw > 1,
655 				 c, version, jset, entry,
656 				 journal_entry_clock_bad_rw,
657 				 "bad rw")) {
658 		journal_entry_null_range(entry, vstruct_next(entry));
659 		return ret;
660 	}
661 
662 fsck_err:
663 	return ret;
664 }
665 
666 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
667 					struct jset_entry *entry)
668 {
669 	struct jset_entry_clock *clock =
670 		container_of(entry, struct jset_entry_clock, entry);
671 
672 	prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
673 }
674 
675 static int journal_entry_dev_usage_validate(struct bch_fs *c,
676 				struct jset *jset,
677 				struct jset_entry *entry,
678 				unsigned version, int big_endian,
679 				struct bkey_validate_context from)
680 {
681 	struct jset_entry_dev_usage *u =
682 		container_of(entry, struct jset_entry_dev_usage, entry);
683 	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
684 	unsigned expected = sizeof(*u);
685 	int ret = 0;
686 
687 	if (journal_entry_err_on(bytes < expected,
688 				 c, version, jset, entry,
689 				 journal_entry_dev_usage_bad_size,
690 				 "bad size (%u < %u)",
691 				 bytes, expected)) {
692 		journal_entry_null_range(entry, vstruct_next(entry));
693 		return ret;
694 	}
695 
696 	if (journal_entry_err_on(u->pad,
697 				 c, version, jset, entry,
698 				 journal_entry_dev_usage_bad_pad,
699 				 "bad pad")) {
700 		journal_entry_null_range(entry, vstruct_next(entry));
701 		return ret;
702 	}
703 
704 fsck_err:
705 	return ret;
706 }
707 
708 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
709 					    struct jset_entry *entry)
710 {
711 	struct jset_entry_dev_usage *u =
712 		container_of(entry, struct jset_entry_dev_usage, entry);
713 	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
714 
715 	if (vstruct_bytes(entry) < sizeof(*u))
716 		return;
717 
718 	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
719 
720 	printbuf_indent_add(out, 2);
721 	for (i = 0; i < nr_types; i++) {
722 		prt_newline(out);
723 		bch2_prt_data_type(out, i);
724 		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
725 		       le64_to_cpu(u->d[i].buckets),
726 		       le64_to_cpu(u->d[i].sectors),
727 		       le64_to_cpu(u->d[i].fragmented));
728 	}
729 	printbuf_indent_sub(out, 2);
730 }
731 
732 static int journal_entry_log_validate(struct bch_fs *c,
733 				struct jset *jset,
734 				struct jset_entry *entry,
735 				unsigned version, int big_endian,
736 				struct bkey_validate_context from)
737 {
738 	return 0;
739 }
740 
741 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
742 				      struct jset_entry *entry)
743 {
744 	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
745 
746 	prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
747 }
748 
749 static int journal_entry_overwrite_validate(struct bch_fs *c,
750 				struct jset *jset,
751 				struct jset_entry *entry,
752 				unsigned version, int big_endian,
753 				struct bkey_validate_context from)
754 {
755 	from.flags = 0;
756 	return journal_entry_btree_keys_validate(c, jset, entry,
757 				version, big_endian, from);
758 }
759 
760 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
761 					    struct jset_entry *entry)
762 {
763 	journal_entry_btree_keys_to_text(out, c, entry);
764 }
765 
766 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
767 				struct jset *jset,
768 				struct jset_entry *entry,
769 				unsigned version, int big_endian,
770 				struct bkey_validate_context from)
771 {
772 	return journal_entry_btree_keys_validate(c, jset, entry,
773 				version, big_endian, from);
774 }
775 
776 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
777 					    struct jset_entry *entry)
778 {
779 	journal_entry_btree_keys_to_text(out, c, entry);
780 }
781 
782 static int journal_entry_datetime_validate(struct bch_fs *c,
783 				struct jset *jset,
784 				struct jset_entry *entry,
785 				unsigned version, int big_endian,
786 				struct bkey_validate_context from)
787 {
788 	unsigned bytes = vstruct_bytes(entry);
789 	unsigned expected = 16;
790 	int ret = 0;
791 
792 	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
793 				 c, version, jset, entry,
794 				 journal_entry_dev_usage_bad_size,
795 				 "bad size (%u < %u)",
796 				 bytes, expected)) {
797 		journal_entry_null_range(entry, vstruct_next(entry));
798 		return ret;
799 	}
800 fsck_err:
801 	return ret;
802 }
803 
804 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
805 					    struct jset_entry *entry)
806 {
807 	struct jset_entry_datetime *datetime =
808 		container_of(entry, struct jset_entry_datetime, entry);
809 
810 	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
811 }
812 
813 struct jset_entry_ops {
814 	int (*validate)(struct bch_fs *, struct jset *,
815 			struct jset_entry *, unsigned, int,
816 			struct bkey_validate_context);
817 	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
818 };
819 
820 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
821 #define x(f, nr)						\
822 	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
823 		.validate	= journal_entry_##f##_validate,	\
824 		.to_text	= journal_entry_##f##_to_text,	\
825 	},
826 	BCH_JSET_ENTRY_TYPES()
827 #undef x
828 };
829 
830 int bch2_journal_entry_validate(struct bch_fs *c,
831 				struct jset *jset,
832 				struct jset_entry *entry,
833 				unsigned version, int big_endian,
834 				struct bkey_validate_context from)
835 {
836 	return entry->type < BCH_JSET_ENTRY_NR
837 		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
838 				version, big_endian, from)
839 		: 0;
840 }
841 
842 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
843 				struct jset_entry *entry)
844 {
845 	bch2_prt_jset_entry_type(out, entry->type);
846 
847 	if (entry->type < BCH_JSET_ENTRY_NR) {
848 		prt_str(out, ": ");
849 		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
850 	}
851 }
852 
853 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
854 				 enum bch_validate_flags flags)
855 {
856 	struct bkey_validate_context from = {
857 		.flags		= flags,
858 		.from		= BKEY_VALIDATE_journal,
859 		.journal_seq	= le64_to_cpu(jset->seq),
860 	};
861 
862 	unsigned version = le32_to_cpu(jset->version);
863 	int ret = 0;
864 
865 	vstruct_for_each(jset, entry) {
866 		from.journal_offset = (u64 *) entry - jset->_data;
867 
868 		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
869 				c, version, jset, entry,
870 				journal_entry_past_jset_end,
871 				"journal entry extends past end of jset")) {
872 			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
873 			break;
874 		}
875 
876 		ret = bch2_journal_entry_validate(c, jset, entry, version,
877 						  JSET_BIG_ENDIAN(jset), from);
878 		if (ret)
879 			break;
880 	}
881 fsck_err:
882 	return ret;
883 }
884 
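/*
 * Full validation of a journal entry: magic, version, checksum type, last_seq
 * ordering, and then every jset_entry it contains.
 */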
885 static int jset_validate(struct bch_fs *c,
886 			 struct bch_dev *ca,
887 			 struct jset *jset, u64 sector,
888 			 enum bch_validate_flags flags)
889 {
890 	struct bkey_validate_context from = {
891 		.flags		= flags,
892 		.from		= BKEY_VALIDATE_journal,
893 		.journal_seq	= le64_to_cpu(jset->seq),
894 	};
895 	int ret = 0;
896 
897 	if (le64_to_cpu(jset->magic) != jset_magic(c))
898 		return JOURNAL_ENTRY_NONE;
899 
900 	unsigned version = le32_to_cpu(jset->version);
901 	if (journal_entry_err_on(!bch2_version_compatible(version),
902 			c, version, jset, NULL,
903 			jset_unsupported_version,
904 			"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
905 			ca ? ca->name : c->name,
906 			sector, le64_to_cpu(jset->seq),
907 			BCH_VERSION_MAJOR(version),
908 			BCH_VERSION_MINOR(version))) {
909 		/* don't try to continue: */
910 		return -EINVAL;
911 	}
912 
913 	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
914 			c, version, jset, NULL,
915 			jset_unknown_csum,
916 			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
917 			ca ? ca->name : c->name,
918 			sector, le64_to_cpu(jset->seq),
919 			JSET_CSUM_TYPE(jset)))
920 		ret = JOURNAL_ENTRY_BAD;
921 
922 	/* last_seq is ignored when JSET_NO_FLUSH is true */
923 	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
924 				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
925 				 c, version, jset, NULL,
926 				 jset_last_seq_newer_than_seq,
927 				 "invalid journal entry: last_seq > seq (%llu > %llu)",
928 				 le64_to_cpu(jset->last_seq),
929 				 le64_to_cpu(jset->seq))) {
930 		jset->last_seq = jset->seq;
931 		return JOURNAL_ENTRY_BAD;
932 	}
933 
934 	ret = jset_validate_entries(c, jset, flags);
935 fsck_err:
936 	return ret;
937 }
938 
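/*
 * Lightweight validation done while scanning journal buckets, possibly before
 * the whole entry has been read: may ask the caller to re-read with a bigger
 * buffer (JOURNAL_ENTRY_REREAD), and clamps entries that claim to extend past
 * the end of their bucket.
 */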
939 static int jset_validate_early(struct bch_fs *c,
940 			 struct bch_dev *ca,
941 			 struct jset *jset, u64 sector,
942 			 unsigned bucket_sectors_left,
943 			 unsigned sectors_read)
944 {
945 	struct bkey_validate_context from = {
946 		.from		= BKEY_VALIDATE_journal,
947 		.journal_seq	= le64_to_cpu(jset->seq),
948 	};
949 	int ret = 0;
950 
951 	if (le64_to_cpu(jset->magic) != jset_magic(c))
952 		return JOURNAL_ENTRY_NONE;
953 
954 	unsigned version = le32_to_cpu(jset->version);
955 	if (journal_entry_err_on(!bch2_version_compatible(version),
956 			c, version, jset, NULL,
957 			jset_unsupported_version,
958 			"%s sector %llu seq %llu: unknown journal entry version %u.%u",
959 			ca ? ca->name : c->name,
960 			sector, le64_to_cpu(jset->seq),
961 			BCH_VERSION_MAJOR(version),
962 			BCH_VERSION_MINOR(version))) {
963 		/* don't try to continue: */
964 		return -EINVAL;
965 	}
966 
967 	size_t bytes = vstruct_bytes(jset);
968 	if (bytes > (sectors_read << 9) &&
969 	    sectors_read < bucket_sectors_left)
970 		return JOURNAL_ENTRY_REREAD;
971 
972 	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
973 			c, version, jset, NULL,
974 			jset_past_bucket_end,
975 			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
976 			ca ? ca->name : c->name,
977 			sector, le64_to_cpu(jset->seq), bytes))
978 		le32_add_cpu(&jset->u64s,
979 			     -((bytes - (bucket_sectors_left << 9)) / 8));
980 fsck_err:
981 	return ret;
982 }
983 
984 struct journal_read_buf {
985 	void		*data;
986 	size_t		size;
987 };
988 
989 static int journal_read_buf_realloc(struct journal_read_buf *b,
990 				    size_t new_size)
991 {
992 	void *n;
993 
994 	/* the bios are sized for this many pages, max: */
995 	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
996 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
997 
998 	new_size = roundup_pow_of_two(new_size);
999 	n = kvmalloc(new_size, GFP_KERNEL);
1000 	if (!n)
1001 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
1002 
1003 	kvfree(b->data);
1004 	b->data = n;
1005 	b->size = new_size;
1006 	return 0;
1007 }
1008 
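/*
 * Read every journal entry in a single journal bucket, verifying checksums and
 * decrypting as we go, and add each entry found to the list of entries to
 * consider for replay.
 */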
1009 static int journal_read_bucket(struct bch_dev *ca,
1010 			       struct journal_read_buf *buf,
1011 			       struct journal_list *jlist,
1012 			       unsigned bucket)
1013 {
1014 	struct bch_fs *c = ca->fs;
1015 	struct journal_device *ja = &ca->journal;
1016 	struct jset *j = NULL;
1017 	unsigned sectors, sectors_read = 0;
1018 	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
1019 	    end = offset + ca->mi.bucket_size;
1020 	bool saw_bad = false, csum_good;
1021 	struct printbuf err = PRINTBUF;
1022 	int ret = 0;
1023 
1024 	pr_debug("reading %u", bucket);
1025 
1026 	while (offset < end) {
1027 		if (!sectors_read) {
1028 			struct bio *bio;
1029 			unsigned nr_bvecs;
1030 reread:
1031 			sectors_read = min_t(unsigned,
1032 				end - offset, buf->size >> 9);
1033 			nr_bvecs = buf_pages(buf->data, sectors_read << 9);
1034 
1035 			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
1036 			if (!bio)
1037 				return -BCH_ERR_ENOMEM_journal_read_bucket;
1038 			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
1039 
1040 			bio->bi_iter.bi_sector = offset;
1041 			bch2_bio_map(bio, buf->data, sectors_read << 9);
1042 
1043 			ret = submit_bio_wait(bio);
1044 			kfree(bio);
1045 
1046 			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
1047 					       "journal read error: sector %llu",
1048 					       offset) ||
1049 			    bch2_meta_read_fault("journal")) {
1050 				/*
1051 				 * We don't error out of the recovery process
1052 				 * here, since the relevant journal entry may be
1053 				 * found on a different device, and missing or
1054 				 * no journal entries will be handled later
1055 				 */
1056 				goto out;
1057 			}
1058 
1059 			j = buf->data;
1060 		}
1061 
1062 		ret = jset_validate_early(c, ca, j, offset,
1063 				    end - offset, sectors_read);
1064 		switch (ret) {
1065 		case 0:
1066 			sectors = vstruct_sectors(j, c->block_bits);
1067 			break;
1068 		case JOURNAL_ENTRY_REREAD:
1069 			if (vstruct_bytes(j) > buf->size) {
1070 				ret = journal_read_buf_realloc(buf,
1071 							vstruct_bytes(j));
1072 				if (ret)
1073 					goto err;
1074 			}
1075 			goto reread;
1076 		case JOURNAL_ENTRY_NONE:
1077 			if (!saw_bad)
1078 				goto out;
1079 			/*
1080 			 * On checksum error we don't really trust the size
1081 			 * field of the journal entry we read, so try reading
1082 			 * again at next block boundary:
1083 			 */
1084 			sectors = block_sectors(c);
1085 			goto next_block;
1086 		default:
1087 			goto err;
1088 		}
1089 
1090 		if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
1091 			ja->highest_seq_found = le64_to_cpu(j->seq);
1092 			ja->cur_idx = bucket;
1093 			ja->sectors_free = ca->mi.bucket_size -
1094 				bucket_remainder(ca, offset) - sectors;
1095 		}
1096 
1097 		/*
1098 		 * This happens sometimes if we don't have discards on -
1099 		 * when we've partially overwritten a bucket with new
1100 		 * journal entries. We don't need the rest of the
1101 		 * bucket:
1102 		 */
1103 		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
1104 			goto out;
1105 
1106 		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
1107 
1108 		enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
1109 		struct bch_csum csum;
1110 		csum_good = jset_csum_good(c, j, &csum);
1111 
1112 		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
1113 				       "%s",
1114 				       (printbuf_reset(&err),
1115 					prt_str(&err, "journal "),
1116 					bch2_csum_err_msg(&err, csum_type, j->csum, csum),
1117 					err.buf)))
1118 			saw_bad = true;
1119 
1120 		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
1121 			     j->encrypted_start,
1122 			     vstruct_end(j) - (void *) j->encrypted_start);
1123 		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
1124 
1125 		mutex_lock(&jlist->lock);
1126 		ret = journal_entry_add(c, ca, (struct journal_ptr) {
1127 					.csum_good	= csum_good,
1128 					.dev		= ca->dev_idx,
1129 					.bucket		= bucket,
1130 					.bucket_offset	= offset -
1131 						bucket_to_sector(ca, ja->buckets[bucket]),
1132 					.sector		= offset,
1133 					}, jlist, j);
1134 		mutex_unlock(&jlist->lock);
1135 
1136 		switch (ret) {
1137 		case JOURNAL_ENTRY_ADD_OK:
1138 			break;
1139 		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
1140 			break;
1141 		default:
1142 			goto err;
1143 		}
1144 next_block:
1145 		pr_debug("next");
1146 		offset		+= sectors;
1147 		sectors_read	-= sectors;
1148 		j = ((void *) j) + (sectors << 9);
1149 	}
1150 
1151 out:
1152 	ret = 0;
1153 err:
1154 	printbuf_exit(&err);
1155 	return ret;
1156 }
1157 
1158 static CLOSURE_CALLBACK(bch2_journal_read_device)
1159 {
1160 	closure_type(ja, struct journal_device, read);
1161 	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
1162 	struct bch_fs *c = ca->fs;
1163 	struct journal_list *jlist =
1164 		container_of(cl->parent, struct journal_list, cl);
1165 	struct journal_read_buf buf = { NULL, 0 };
1166 	unsigned i;
1167 	int ret = 0;
1168 
1169 	if (!ja->nr)
1170 		goto out;
1171 
1172 	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
1173 	if (ret)
1174 		goto err;
1175 
1176 	pr_debug("%u journal buckets", ja->nr);
1177 
1178 	for (i = 0; i < ja->nr; i++) {
1179 		ret = journal_read_bucket(ca, &buf, jlist, i);
1180 		if (ret)
1181 			goto err;
1182 	}
1183 
1184 	/*
1185 	 * Set dirty_idx to indicate the entire journal is full and needs to be
1186 	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
1187 	 * pinned when it first runs:
1188 	 */
1189 	ja->discard_idx = ja->dirty_idx_ondisk =
1190 		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
1191 out:
1192 	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
1193 	kvfree(buf.data);
1194 	percpu_ref_put(&ca->io_ref);
1195 	closure_return(cl);
1196 	return;
1197 err:
1198 	mutex_lock(&jlist->lock);
1199 	jlist->ret = ret;
1200 	mutex_unlock(&jlist->lock);
1201 	goto out;
1202 }
1203 
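/*
 * Read the journal from all devices: pick the newest flush entry (newer
 * non-flush entries will be blacklisted), drop entries older than its
 * last_seq, check for gaps in the sequence, and make sure the replicas for
 * every entry we'll replay are marked in the superblock.
 *
 * On return, *last_seq is the oldest sequence number to replay, *blacklist_seq
 * the first sequence number to blacklist, and *start_seq one past the newest
 * entry found.
 */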
1204 int bch2_journal_read(struct bch_fs *c,
1205 		      u64 *last_seq,
1206 		      u64 *blacklist_seq,
1207 		      u64 *start_seq)
1208 {
1209 	struct journal_list jlist;
1210 	struct journal_replay *i, **_i, *prev = NULL;
1211 	struct genradix_iter radix_iter;
1212 	struct printbuf buf = PRINTBUF;
1213 	bool degraded = false, last_write_torn = false;
1214 	u64 seq;
1215 	int ret = 0;
1216 
1217 	closure_init_stack(&jlist.cl);
1218 	mutex_init(&jlist.lock);
1219 	jlist.last_seq = 0;
1220 	jlist.ret = 0;
1221 
1222 	for_each_member_device(c, ca) {
1223 		if (!c->opts.fsck &&
1224 		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
1225 			continue;
1226 
1227 		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
1228 		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
1229 		    percpu_ref_tryget(&ca->io_ref))
1230 			closure_call(&ca->journal.read,
1231 				     bch2_journal_read_device,
1232 				     system_unbound_wq,
1233 				     &jlist.cl);
1234 		else
1235 			degraded = true;
1236 	}
1237 
1238 	closure_sync(&jlist.cl);
1239 
1240 	if (jlist.ret)
1241 		return jlist.ret;
1242 
1243 	*last_seq	= 0;
1244 	*start_seq	= 0;
1245 	*blacklist_seq	= 0;
1246 
1247 	/*
1248 	 * Find most recent flush entry, and ignore newer non-flush entries -
1249 	 * those entries will be blacklisted:
1250 	 */
1251 	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
1252 		i = *_i;
1253 
1254 		if (journal_replay_ignore(i))
1255 			continue;
1256 
1257 		if (!*start_seq)
1258 			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
1259 
1260 		if (JSET_NO_FLUSH(&i->j)) {
1261 			i->ignore_blacklisted = true;
1262 			continue;
1263 		}
1264 
1265 		if (!last_write_torn && !i->csum_good) {
1266 			last_write_torn = true;
1267 			i->ignore_blacklisted = true;
1268 			continue;
1269 		}
1270 
1271 		struct bkey_validate_context from = {
1272 			.from		= BKEY_VALIDATE_journal,
1273 			.journal_seq	= le64_to_cpu(i->j.seq),
1274 		};
1275 		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
1276 					 c, le32_to_cpu(i->j.version), &i->j, NULL,
1277 					 jset_last_seq_newer_than_seq,
1278 					 "invalid journal entry: last_seq > seq (%llu > %llu)",
1279 					 le64_to_cpu(i->j.last_seq),
1280 					 le64_to_cpu(i->j.seq)))
1281 			i->j.last_seq = i->j.seq;
1282 
1283 		*last_seq	= le64_to_cpu(i->j.last_seq);
1284 		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
1285 		break;
1286 	}
1287 
1288 	if (!*start_seq) {
1289 		bch_info(c, "journal read done, but no entries found");
1290 		return 0;
1291 	}
1292 
1293 	if (!*last_seq) {
1294 		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
1295 			 "journal read done, but no entries found after dropping non-flushes");
1296 		return 0;
1297 	}
1298 
1299 	bch_info(c, "journal read done, replaying entries %llu-%llu",
1300 		 *last_seq, *blacklist_seq - 1);
1301 
1302 	if (*start_seq != *blacklist_seq)
1303 		bch_info(c, "dropped unflushed entries %llu-%llu",
1304 			 *blacklist_seq, *start_seq - 1);
1305 
1306 	/* Drop blacklisted entries and entries older than last_seq: */
1307 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1308 		i = *_i;
1309 
1310 		if (journal_replay_ignore(i))
1311 			continue;
1312 
1313 		seq = le64_to_cpu(i->j.seq);
1314 		if (seq < *last_seq) {
1315 			journal_replay_free(c, i, false);
1316 			continue;
1317 		}
1318 
1319 		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
1320 			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
1321 				    jset_seq_blacklisted,
1322 				    "found blacklisted journal entry %llu", seq);
1323 			i->ignore_blacklisted = true;
1324 		}
1325 	}
1326 
1327 	/* Check for missing entries: */
1328 	seq = *last_seq;
1329 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1330 		i = *_i;
1331 
1332 		if (journal_replay_ignore(i))
1333 			continue;
1334 
1335 		BUG_ON(seq > le64_to_cpu(i->j.seq));
1336 
1337 		while (seq < le64_to_cpu(i->j.seq)) {
1338 			u64 missing_start, missing_end;
1339 			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
1340 
1341 			while (seq < le64_to_cpu(i->j.seq) &&
1342 			       bch2_journal_seq_is_blacklisted(c, seq, false))
1343 				seq++;
1344 
1345 			if (seq == le64_to_cpu(i->j.seq))
1346 				break;
1347 
1348 			missing_start = seq;
1349 
1350 			while (seq < le64_to_cpu(i->j.seq) &&
1351 			       !bch2_journal_seq_is_blacklisted(c, seq, false))
1352 				seq++;
1353 
1354 			if (prev) {
1355 				bch2_journal_ptrs_to_text(&buf1, c, prev);
1356 				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
1357 			} else
1358 				prt_printf(&buf1, "(none)");
1359 			bch2_journal_ptrs_to_text(&buf2, c, i);
1360 
1361 			missing_end = seq - 1;
1362 			fsck_err(c, journal_entries_missing,
1363 				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
1364 				 "  prev at %s\n"
1365 				 "  next at %s, continue?",
1366 				 missing_start, missing_end,
1367 				 *last_seq, *blacklist_seq - 1,
1368 				 buf1.buf, buf2.buf);
1369 
1370 			printbuf_exit(&buf1);
1371 			printbuf_exit(&buf2);
1372 		}
1373 
1374 		prev = i;
1375 		seq++;
1376 	}
1377 
1378 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1379 		struct bch_replicas_padded replicas = {
1380 			.e.data_type = BCH_DATA_journal,
1381 			.e.nr_devs = 0,
1382 			.e.nr_required = 1,
1383 		};
1384 
1385 		i = *_i;
1386 		if (journal_replay_ignore(i))
1387 			continue;
1388 
1389 		darray_for_each(i->ptrs, ptr) {
1390 			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
1391 
1392 			if (!ptr->csum_good)
1393 				bch_err_dev_offset(ca, ptr->sector,
1394 						   "invalid journal checksum, seq %llu%s",
1395 						   le64_to_cpu(i->j.seq),
1396 						   i->csum_good ? " (had good copy on another device)" : "");
1397 		}
1398 
1399 		ret = jset_validate(c,
1400 				    bch2_dev_have_ref(c, i->ptrs.data[0].dev),
1401 				    &i->j,
1402 				    i->ptrs.data[0].sector,
1403 				    READ);
1404 		if (ret)
1405 			goto err;
1406 
1407 		darray_for_each(i->ptrs, ptr)
1408 			replicas_entry_add_dev(&replicas.e, ptr->dev);
1409 
1410 		bch2_replicas_entry_sort(&replicas.e);
1411 
1412 		printbuf_reset(&buf);
1413 		bch2_replicas_entry_to_text(&buf, &replicas.e);
1414 
1415 		if (!degraded &&
1416 		    !bch2_replicas_marked(c, &replicas.e) &&
1417 		    (le64_to_cpu(i->j.seq) == *last_seq ||
1418 		     fsck_err(c, journal_entry_replicas_not_marked,
1419 			      "superblock not marked as containing replicas for journal entry %llu\n  %s",
1420 			      le64_to_cpu(i->j.seq), buf.buf))) {
1421 			ret = bch2_mark_replicas(c, &replicas.e);
1422 			if (ret)
1423 				goto err;
1424 		}
1425 	}
1426 err:
1427 fsck_err:
1428 	printbuf_exit(&buf);
1429 	return ret;
1430 }
1431 
1432 /* journal write: */
1433 
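/*
 * If the write won't fit in the space remaining in a device's current journal
 * bucket, advance that device to its next journal bucket (if a discarded one
 * is available) so the retry in journal_write_alloc() can use it.
 */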
1434 static void journal_advance_devs_to_next_bucket(struct journal *j,
1435 						struct dev_alloc_list *devs,
1436 						unsigned sectors, u64 seq)
1437 {
1438 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1439 
1440 	darray_for_each(*devs, i) {
1441 		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
1442 		if (!ca)
1443 			continue;
1444 
1445 		struct journal_device *ja = &ca->journal;
1446 
1447 		if (sectors > ja->sectors_free &&
1448 		    sectors <= ca->mi.bucket_size &&
1449 		    bch2_journal_dev_buckets_available(j, ja,
1450 					journal_space_discarded)) {
1451 			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
1452 			ja->sectors_free = ca->mi.bucket_size;
1453 
1454 			/*
1455 			 * ja->bucket_seq[ja->cur_idx] must always have
1456 			 * something sensible:
1457 			 */
1458 			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
1459 		}
1460 	}
1461 }
1462 
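/*
 * Walk the devices in allocation order, appending a pointer to w->key for each
 * usable device with enough space in its current journal bucket, until we've
 * allocated the number of replicas we want.
 */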
1463 static void __journal_write_alloc(struct journal *j,
1464 				  struct journal_buf *w,
1465 				  struct dev_alloc_list *devs,
1466 				  unsigned sectors,
1467 				  unsigned *replicas,
1468 				  unsigned replicas_want)
1469 {
1470 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1471 
1472 	darray_for_each(*devs, i) {
1473 		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
1474 		if (!ca)
1475 			continue;
1476 
1477 		struct journal_device *ja = &ca->journal;
1478 
1479 		/*
1480 		 * Check that we can use this device, and aren't already using
1481 		 * it:
1482 		 */
1483 		if (!ca->mi.durability ||
1484 		    ca->mi.state != BCH_MEMBER_STATE_rw ||
1485 		    !ja->nr ||
1486 		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
1487 		    sectors > ja->sectors_free)
1488 			continue;
1489 
1490 		bch2_dev_stripe_increment(ca, &j->wp.stripe);
1491 
1492 		bch2_bkey_append_ptr(&w->key,
1493 			(struct bch_extent_ptr) {
1494 				  .offset = bucket_to_sector(ca,
1495 					ja->buckets[ja->cur_idx]) +
1496 					ca->mi.bucket_size -
1497 					ja->sectors_free,
1498 				  .dev = ca->dev_idx,
1499 		});
1500 
1501 		ja->sectors_free -= sectors;
1502 		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1503 
1504 		*replicas += ca->mi.durability;
1505 
1506 		if (*replicas >= replicas_want)
1507 			break;
1508 	}
1509 }
1510 
1511 /**
1512  * journal_write_alloc - decide where to write next journal entry
1513  *
1514  * @j:		journal object
1515  * @w:		journal buf (entry to be written)
1516  *
1517  * Returns: 0 on success, or -EROFS on failure
1518  */
1519 static int journal_write_alloc(struct journal *j, struct journal_buf *w)
1520 {
1521 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1522 	struct bch_devs_mask devs;
1523 	struct dev_alloc_list devs_sorted;
1524 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1525 	unsigned target = c->opts.metadata_target ?:
1526 		c->opts.foreground_target;
1527 	unsigned replicas = 0, replicas_want =
1528 		READ_ONCE(c->opts.metadata_replicas);
1529 	unsigned replicas_need = min_t(unsigned, replicas_want,
1530 				       READ_ONCE(c->opts.metadata_replicas_required));
1531 	bool advance_done = false;
1532 
1533 	rcu_read_lock();
1534 
1535 	/* We might run more than once if we have to stop and do discards: */
1536 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
1537 	bkey_for_each_ptr(ptrs, p) {
1538 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
1539 		if (ca)
1540 			replicas += ca->mi.durability;
1541 	}
1542 
1543 retry_target:
1544 	devs = target_rw_devs(c, BCH_DATA_journal, target);
1545 	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
1546 retry_alloc:
1547 	__journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
1548 
1549 	if (likely(replicas >= replicas_want))
1550 		goto done;
1551 
1552 	if (!advance_done) {
1553 		journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
1554 		advance_done = true;
1555 		goto retry_alloc;
1556 	}
1557 
1558 	if (replicas < replicas_want && target) {
1559 		/* Retry from all devices: */
1560 		target = 0;
1561 		advance_done = false;
1562 		goto retry_target;
1563 	}
1564 done:
1565 	rcu_read_unlock();
1566 
1567 	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
1568 
1569 	return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
1570 }
1571 
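/*
 * Grow the journal buffer (and the btree write buffer along with it) towards
 * j->buf_size_want before starting the write.
 */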
1572 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1573 {
1574 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1575 
1576 	/* we aren't holding j->lock: */
1577 	unsigned new_size = READ_ONCE(j->buf_size_want);
1578 	void *new_buf;
1579 
1580 	if (buf->buf_size >= new_size)
1581 		return;
1582 
1583 	size_t btree_write_buffer_size = new_size / 64;
1584 
1585 	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
1586 		return;
1587 
1588 	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
1589 	if (!new_buf)
1590 		return;
1591 
1592 	memcpy(new_buf, buf->data, buf->buf_size);
1593 
1594 	spin_lock(&j->lock);
1595 	swap(buf->data,		new_buf);
1596 	swap(buf->buf_size,	new_size);
1597 	spin_unlock(&j->lock);
1598 
1599 	kvfree(new_buf);
1600 }
1601 
1602 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1603 {
1604 	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1605 }
1606 
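/*
 * Journal write completion: record which devices the entry made it to, mark
 * replicas, advance seq_ondisk/flushed_seq_ondisk past every buffer that's now
 * fully written, and kick reclaim and any further pending journal writes.
 */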
1607 static CLOSURE_CALLBACK(journal_write_done)
1608 {
1609 	closure_type(w, struct journal_buf, io);
1610 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
1611 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1612 	struct bch_replicas_padded replicas;
1613 	union journal_res_state old, new;
1614 	u64 seq = le64_to_cpu(w->data->seq);
1615 	int err = 0;
1616 
1617 	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
1618 			       ? j->flush_write_time
1619 			       : j->noflush_write_time, j->write_start_time);
1620 
1621 	if (!w->devs_written.nr) {
1622 		bch_err(c, "unable to write journal to sufficient devices");
1623 		err = -EIO;
1624 	} else {
1625 		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1626 					 w->devs_written);
1627 		if (bch2_mark_replicas(c, &replicas.e))
1628 			err = -EIO;
1629 	}
1630 
1631 	if (err)
1632 		bch2_fatal_error(c);
1633 
1634 	closure_debug_destroy(cl);
1635 
1636 	spin_lock(&j->lock);
1637 	if (seq >= j->pin.front)
1638 		journal_seq_pin(j, seq)->devs = w->devs_written;
1639 	if (err && (!j->err_seq || seq < j->err_seq))
1640 		j->err_seq	= seq;
1641 	w->write_done = true;
1642 
1643 	bool completed = false;
1644 
1645 	for (seq = journal_last_unwritten_seq(j);
1646 	     seq <= journal_cur_seq(j);
1647 	     seq++) {
1648 		w = j->buf + (seq & JOURNAL_BUF_MASK);
1649 		if (!w->write_done)
1650 			break;
1651 
1652 		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
1653 			j->flushed_seq_ondisk = seq;
1654 			j->last_seq_ondisk = w->last_seq;
1655 
1656 			bch2_do_discards(c);
1657 			closure_wake_up(&c->freelist_wait);
1658 			bch2_reset_alloc_cursors(c);
1659 		}
1660 
1661 		j->seq_ondisk = seq;
1662 
1663 		/*
1664 		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1665 		 * more buckets:
1666 		 *
1667 		 * Must come before signaling write completion, for
1668 		 * bch2_fs_journal_stop():
1669 		 */
1670 		if (j->watermark != BCH_WATERMARK_stripe)
1671 			journal_reclaim_kick(&c->journal);
1672 
1673 		old.v = atomic64_read(&j->reservations.counter);
1674 		do {
1675 			new.v = old.v;
1676 			BUG_ON(journal_state_count(new, new.unwritten_idx));
1677 			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
1678 
1679 			new.unwritten_idx++;
1680 		} while (!atomic64_try_cmpxchg(&j->reservations.counter,
1681 					       &old.v, new.v));
1682 
1683 		closure_wake_up(&w->wait);
1684 		completed = true;
1685 	}
1686 
1687 	if (completed) {
1688 		bch2_journal_reclaim_fast(j);
1689 		bch2_journal_space_available(j);
1690 
1691 		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
1692 
1693 		journal_wake(j);
1694 	}
1695 
1696 	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1697 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1698 		struct journal_buf *buf = journal_cur_buf(j);
1699 		long delta = buf->expires - jiffies;
1700 
1701 		/*
1702 		 * We don't close a journal entry to write it while there are
1703 		 * previous entries still in flight - the current journal entry
1704 		 * might want to be written now:
1705 		 */
1706 		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
1707 	}
1708 
1709 	/*
1710 	 * We don't typically trigger journal writes from here - the next journal
1711 	 * write will be triggered immediately after the previous one is
1712 	 * allocated, in bch2_journal_write() - but the journal write error path
1713 	 * is special:
1714 	 */
1715 	bch2_journal_do_writes(j);
1716 	spin_unlock(&j->lock);
1717 }
1718 
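/*
 * Per-device bio completion: on an IO error, drop this device from the set of
 * devices the entry was successfully written to.
 */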
1719 static void journal_write_endio(struct bio *bio)
1720 {
1721 	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
1722 	struct bch_dev *ca = jbio->ca;
1723 	struct journal *j = &ca->fs->journal;
1724 	struct journal_buf *w = j->buf + jbio->buf_idx;
1725 
1726 	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
1727 			       "error writing journal entry %llu: %s",
1728 			       le64_to_cpu(w->data->seq),
1729 			       bch2_blk_status_to_str(bio->bi_status)) ||
1730 	    bch2_meta_write_fault("journal")) {
1731 		unsigned long flags;
1732 
1733 		spin_lock_irqsave(&j->err_lock, flags);
1734 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1735 		spin_unlock_irqrestore(&j->err_lock, flags);
1736 	}
1737 
1738 	closure_put(&w->io);
1739 	percpu_ref_put(&ca->io_ref);
1740 }
1741 
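/*
 * Submit the journal write to each device we allocated a pointer on, setting
 * REQ_FUA/REQ_PREFLUSH as appropriate for flush writes.
 */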
1742 static CLOSURE_CALLBACK(journal_write_submit)
1743 {
1744 	closure_type(w, struct journal_buf, io);
1745 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
1746 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1747 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1748 
1749 	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1750 		struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
1751 		if (!ca) {
1752 			/* XXX: fix this */
1753 			bch_err(c, "missing device for journal write\n");
1754 			continue;
1755 		}
1756 
1757 		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1758 			     sectors);
1759 
1760 		struct journal_device *ja = &ca->journal;
1761 		struct bio *bio = &ja->bio[w->idx]->bio;
1762 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1763 		bio->bi_iter.bi_sector	= ptr->offset;
1764 		bio->bi_end_io		= journal_write_endio;
1765 		bio->bi_private		= ca;
1766 
1767 		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1768 		ca->prev_journal_sector = bio->bi_iter.bi_sector;
1769 
1770 		if (!JSET_NO_FLUSH(w->data))
1771 			bio->bi_opf    |= REQ_FUA;
1772 		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1773 			bio->bi_opf    |= REQ_PREFLUSH;
1774 
1775 		bch2_bio_map(bio, w->data, sectors << 9);
1776 
1777 		trace_and_count(c, journal_write, bio);
1778 		closure_bio_submit(bio, cl);
1779 
1780 		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1781 	}
1782 
1783 	continue_at(cl, journal_write_done, j->wq);
1784 }
1785 
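/*
 * For flush writes: wait until all previous journal writes have completed,
 * then either issue separate preflushes to every rw device or fall through to
 * the write itself with REQ_PREFLUSH set.
 */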
1786 static CLOSURE_CALLBACK(journal_write_preflush)
1787 {
1788 	closure_type(w, struct journal_buf, io);
1789 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
1790 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1791 
1792 	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
1793 		spin_lock(&j->lock);
1794 		if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
1795 			closure_wait(&j->async_wait, cl);
1796 			spin_unlock(&j->lock);
1797 			continue_at(cl, journal_write_preflush, j->wq);
1798 			return;
1799 		}
1800 		spin_unlock(&j->lock);
1801 	}
1802 
1803 	if (w->separate_flush) {
1804 		for_each_rw_member(c, ca) {
1805 			percpu_ref_get(&ca->io_ref);
1806 
1807 			struct journal_device *ja = &ca->journal;
1808 			struct bio *bio = &ja->bio[w->idx]->bio;
1809 			bio_reset(bio, ca->disk_sb.bdev,
1810 				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
1811 			bio->bi_end_io		= journal_write_endio;
1812 			bio->bi_private		= ca;
1813 			closure_bio_submit(bio, cl);
1814 		}
1815 
1816 		continue_at(cl, journal_write_submit, j->wq);
1817 	} else {
1818 		/*
1819 		 * no need to punt to another work item if we're not waiting on
1820 		 * preflushes
1821 		 */
1822 		journal_write_submit(&cl->work);
1823 	}
1824 }
1825 
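/*
 * Final assembly of the journal entry before it's checksummed: propagate new
 * btree roots to c->btree_roots, feed write_buffer_keys entries into the btree
 * write buffer, then append the btree roots, a datetime entry and the
 * superblock accounting entries, and verify the result still fits in the space
 * that was allocated for it.
 */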
1826 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
1827 {
1828 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1829 	struct jset_entry *start, *end;
1830 	struct jset *jset = w->data;
1831 	struct journal_keys_to_wb wb = { NULL };
1832 	unsigned sectors, bytes, u64s;
1833 	unsigned long btree_roots_have = 0;
1834 	bool validate_before_checksum = false;
1835 	u64 seq = le64_to_cpu(jset->seq);
1836 	int ret;
1837 
1838 	/*
1839 	 * Simple compaction, dropping empty jset_entries (from journal
1840 	 * reservations that weren't fully used) and merging jset_entries that
1841 	 * can be.
1842 	 *
1843 	 * If we wanted to be really fancy here, we could sort all the keys in
1844 	 * the jset and drop keys that were overwritten - probably not worth it:
1845 	 */
1846 	vstruct_for_each(jset, i) {
1847 		unsigned u64s = le16_to_cpu(i->u64s);
1848 
1849 		/* Empty entry: */
1850 		if (!u64s)
1851 			continue;
1852 
1853 		/*
1854 		 * New btree roots are set by journalling them; when the journal
1855 		 * entry gets written we have to propagate them to
1856 		 * c->btree_roots
1857 		 *
1858 		 * But, every journal entry we write has to contain all the
1859 		 * btree roots (at least for now); so after we copy btree roots
1860 		 * to c->btree_roots we have to get any missing btree roots and
1861 		 * add them to this journal entry:
1862 		 */
1863 		switch (i->type) {
1864 		case BCH_JSET_ENTRY_btree_root:
1865 			bch2_journal_entry_to_btree_root(c, i);
1866 			__set_bit(i->btree_id, &btree_roots_have);
1867 			break;
1868 		case BCH_JSET_ENTRY_write_buffer_keys:
1869 			EBUG_ON(!w->need_flush_to_write_buffer);
1870 
1871 			if (!wb.wb)
1872 				bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
1873 
1874 			jset_entry_for_each_key(i, k) {
1875 				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
1876 				if (ret) {
1877 					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
1878 							    bch2_err_str(ret));
1879 					bch2_journal_keys_to_write_buffer_end(c, &wb);
1880 					return ret;
1881 				}
1882 			}
1883 			i->type = BCH_JSET_ENTRY_btree_keys;
1884 			break;
1885 		}
1886 	}
1887 
1888 	if (wb.wb) {
1889 		ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
1890 		if (ret) {
1891 			bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
1892 					    bch2_err_str(ret));
1893 			return ret;
1894 		}
1895 	}
1896 
1897 	spin_lock(&c->journal.lock);
1898 	w->need_flush_to_write_buffer = false;
1899 	spin_unlock(&c->journal.lock);
1900 
1901 	start = end = vstruct_last(jset);
1902 
1903 	end	= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
1904 
1905 	struct jset_entry_datetime *d =
1906 		container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
1907 	d->entry.type	= BCH_JSET_ENTRY_datetime;
1908 	d->seconds	= cpu_to_le64(ktime_get_real_seconds());
1909 
1910 	bch2_journal_super_entries_add_common(c, &end, seq);
1911 	u64s	= (u64 *) end - (u64 *) start;
1912 
1913 	WARN_ON(u64s > j->entry_u64s_reserved);
1914 
1915 	le32_add_cpu(&jset->u64s, u64s);
1916 
1917 	sectors = vstruct_sectors(jset, c->block_bits);
1918 	bytes	= vstruct_bytes(jset);
1919 
1920 	if (sectors > w->sectors) {
1921 		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
1922 				    vstruct_bytes(jset), w->sectors << 9,
1923 				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
1924 		return -EINVAL;
1925 	}
1926 
1927 	jset->magic		= cpu_to_le64(jset_magic(c));
1928 	jset->version		= cpu_to_le32(c->sb.version);
1929 
1930 	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1931 	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1932 
1933 	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
1934 		j->last_empty_seq = seq;
1935 
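	/*
	 * Entries that will be encrypted, or that use an older on-disk format
	 * version, are validated before checksumming; otherwise we validate
	 * the final checksummed form below.
	 */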
1936 	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1937 		validate_before_checksum = true;
1938 
1939 	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1940 		validate_before_checksum = true;
1941 
1942 	if (validate_before_checksum &&
1943 	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1944 		return ret;
1945 
1946 	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1947 		    jset->encrypted_start,
1948 		    vstruct_end(jset) - (void *) jset->encrypted_start);
1949 	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
1950 		return ret;
1951 
1952 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1953 				  journal_nonce(jset), jset);
1954 
1955 	if (!validate_before_checksum &&
1956 	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1957 		return ret;
1958 
1959 	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1960 	return 0;
1961 }
1962 
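/*
 * Decide whether this will be a flush or a noflush write: we skip the flush
 * when one isn't required and the previous flush write was within
 * journal_flush_delay, or when the journal has hit an error; if a flush write
 * is required but the journal is in an error state, fail with -EIO instead.
 */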
1963 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
1964 {
1965 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
1966 	int error = bch2_journal_error(j);
1967 
1968 	/*
1969 	 * If the journal is in an error state - we did an emergency shutdown -
1970 	 * we prefer to continue doing journal writes. We just mark them as
1971 	 * noflush so they'll never be used, but they'll still be visible to the
1972 	 * list_journal tool - this helps in debugging.
1973 	 *
1974 	 * There's a caveat: the first journal write after marking the
1975 	 * superblock dirty must always be a flush write, because on startup
1976 	 * from a clean shutdown we didn't necessarily read the journal and the
1977 	 * new journal write might overwrite whatever was in the journal
1978 	 * previously - we can't leave the journal without any flush writes in
1979 	 * it.
1980 	 *
1981 	 * So if we're in an error state, and we're still starting up, we don't
1982 	 * write anything at all.
1983 	 */
1984 	if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
1985 		return -EIO;
1986 
1987 	if (error ||
1988 	    w->noflush ||
1989 	    (!w->must_flush &&
1990 	     time_before(jiffies, j->last_flush_write +
1991 		 msecs_to_jiffies(c->opts.journal_flush_delay)) &&
1992 	     test_bit(JOURNAL_may_skip_flush, &j->flags))) {
1993 		w->noflush = true;
1994 		SET_JSET_NO_FLUSH(w->data, true);
1995 		w->data->last_seq	= 0;
1996 		w->last_seq		= 0;
1997 
1998 		j->nr_noflush_writes++;
1999 	} else {
2000 		w->must_flush = true;
2001 		j->last_flush_write = jiffies;
2002 		j->nr_flush_writes++;
2003 		clear_bit(JOURNAL_need_flush_write, &j->flags);
2004 	}
2005 
2006 	return 0;
2007 }
2008 
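/*
 * Top of the journal write path, run from the journal buffer's closure: pick
 * flush vs. noflush, prep the entry, allocate space in the journal buckets
 * (discarding and retrying on failure), mark the replicas entry so recovery
 * can find the write, then continue to the preflush/submit stages.
 */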
2009 CLOSURE_CALLBACK(bch2_journal_write)
2010 {
2011 	closure_type(w, struct journal_buf, io);
2012 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
2013 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
2014 	struct bch_replicas_padded replicas;
2015 	unsigned nr_rw_members = 0;
2016 	int ret;
2017 
2018 	for_each_rw_member(c, ca)
2019 		nr_rw_members++;
2020 
2021 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
2022 	BUG_ON(!w->write_started);
2023 	BUG_ON(w->write_allocated);
2024 	BUG_ON(w->write_done);
2025 
2026 	j->write_start_time = local_clock();
2027 
2028 	spin_lock(&j->lock);
2029 	if (nr_rw_members > 1)
2030 		w->separate_flush = true;
2031 
2032 	ret = bch2_journal_write_pick_flush(j, w);
2033 	spin_unlock(&j->lock);
2034 	if (ret)
2035 		goto err;
2036 
2037 	mutex_lock(&j->buf_lock);
2038 	journal_buf_realloc(j, w);
2039 
2040 	ret = bch2_journal_write_prep(j, w);
2041 	mutex_unlock(&j->buf_lock);
2042 	if (ret)
2043 		goto err;
2044 
2045 	j->entry_bytes_written += vstruct_bytes(w->data);
2046 
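	/*
	 * Allocate space for the write in the journal buckets; if that fails
	 * and there are buckets awaiting discard, discard them and retry.
	 * Note that we leave this loop with j->lock held.
	 */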
2047 	while (1) {
2048 		spin_lock(&j->lock);
2049 		ret = journal_write_alloc(j, w);
2050 		if (!ret || !j->can_discard)
2051 			break;
2052 
2053 		spin_unlock(&j->lock);
2054 		bch2_journal_do_discards(j);
2055 	}
2056 
2057 	if (ret && !bch2_journal_error(j)) {
2058 		struct printbuf buf = PRINTBUF;
2059 		buf.atomic++;
2060 
2061 		prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
2062 					  le64_to_cpu(w->data->seq),
2063 					  vstruct_sectors(w->data, c->block_bits),
2064 					  bch2_err_str(ret));
2065 		__bch2_journal_debug_to_text(&buf, j);
2066 		spin_unlock(&j->lock);
2067 		bch2_print_string_as_lines(KERN_ERR, buf.buf);
2068 		printbuf_exit(&buf);
2069 	}
2070 	if (ret)
2071 		goto err;
2072 
2073 	/*
2074 	 * write is allocated, no longer need to account for it in
2075 	 * bch2_journal_space_available():
2076 	 */
2077 	w->sectors = 0;
2078 	w->write_allocated = true;
2079 
2080 	/*
2081 	 * journal entry has been compacted and allocated, recalculate space
2082 	 * available:
2083 	 */
2084 	bch2_journal_space_available(j);
2085 	bch2_journal_do_writes(j);
2086 	spin_unlock(&j->lock);
2087 
2088 	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
2089 
2090 	if (c->opts.nochanges)
2091 		goto no_io;
2092 
2093 	/*
2094 	 * Mark journal replicas before we submit the write to guarantee
2095 	 * recovery will find the journal entries after a crash.
2096 	 */
2097 	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
2098 				 w->devs_written);
2099 	ret = bch2_mark_replicas(c, &replicas.e);
2100 	if (ret)
2101 		goto err;
2102 
2103 	if (!JSET_NO_FLUSH(w->data))
2104 		continue_at(cl, journal_write_preflush, j->wq);
2105 	else
2106 		continue_at(cl, journal_write_submit, j->wq);
2107 	return;
2108 no_io:
2109 	continue_at(cl, journal_write_done, j->wq);
2110 	return;
2111 err:
2112 	bch2_fatal_error(c);
2113 	continue_at(cl, journal_write_done, j->wq);
2114 }
2115