xref: /linux/fs/bcachefs/replicas.c (revision e7b2b108cdeab76a7e7324459e50b0c1214c0386)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "buckets.h"
5 #include "journal.h"
6 #include "replicas.h"
7 #include "super-io.h"
8 
9 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
10 					    struct bch_replicas_cpu *);
11 
12 /* Replicas tracking - in memory: */
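/*
 * The replicas entries currently in use are kept in a single flat buffer
 * (struct bch_replicas_cpu): @nr entries, each padded out to @entry_size
 * bytes, sorted with memcmp() in eytzinger (cache friendly binary search
 * tree) order.  Each entry records a data type, the devices holding copies
 * (sorted, no duplicates), and how many of those devices must be readable
 * for the data to be reconstructable (1 for plain replication,
 * nr_blocks - nr_redundant for stripes).
 */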
13 
14 static void verify_replicas_entry(struct bch_replicas_entry *e)
15 {
16 #ifdef CONFIG_BCACHEFS_DEBUG
17 	unsigned i;
18 
19 	BUG_ON(e->data_type >= BCH_DATA_NR);
20 	BUG_ON(!e->nr_devs);
21 	BUG_ON(e->nr_required > 1 &&
22 	       e->nr_required >= e->nr_devs);
23 
24 	for (i = 0; i + 1 < e->nr_devs; i++)
25 		BUG_ON(e->devs[i] >= e->devs[i + 1]);
26 #endif
27 }
28 
29 void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
30 {
31 	bubble_sort(e->devs, e->nr_devs, u8_cmp);
32 }
33 
34 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
35 {
36 	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
37 }
38 
39 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
40 					   struct bch_replicas_entry_v0 *e)
41 {
42 	unsigned i;
43 
44 	if (e->data_type < BCH_DATA_NR)
45 		prt_printf(out, "%s", bch2_data_types[e->data_type]);
46 	else
47 		prt_printf(out, "(invalid data type %u)", e->data_type);
48 
49 	prt_printf(out, ": %u [", e->nr_devs);
50 	for (i = 0; i < e->nr_devs; i++)
51 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
52 	prt_printf(out, "]");
53 }
54 
55 void bch2_replicas_entry_to_text(struct printbuf *out,
56 				 struct bch_replicas_entry *e)
57 {
58 	unsigned i;
59 
60 	if (e->data_type < BCH_DATA_NR)
61 		prt_printf(out, "%s", bch2_data_types[e->data_type]);
62 	else
63 		prt_printf(out, "(invalid data type %u)", e->data_type);
64 
65 	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
66 	for (i = 0; i < e->nr_devs; i++)
67 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
68 	prt_printf(out, "]");
69 }
70 
71 int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
72 				 struct bch_sb *sb,
73 				 struct printbuf *err)
74 {
75 	if (!r->nr_devs) {
76 		prt_printf(err, "no devices in entry ");
77 		goto bad;
78 	}
79 
80 	if (r->nr_required > 1 &&
81 	    r->nr_required >= r->nr_devs) {
82 		prt_printf(err, "bad nr_required in entry ");
83 		goto bad;
84 	}
85 
86 	for (unsigned i = 0; i < r->nr_devs; i++)
87 		if (!bch2_dev_exists(sb, r->devs[i])) {
88 			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
89 			goto bad;
90 		}
91 
92 	return 0;
93 bad:
94 	bch2_replicas_entry_to_text(err, r);
95 	return -BCH_ERR_invalid_replicas_entry;
96 }
97 
98 void bch2_cpu_replicas_to_text(struct printbuf *out,
99 			       struct bch_replicas_cpu *r)
100 {
101 	struct bch_replicas_entry *e;
102 	bool first = true;
103 
104 	for_each_cpu_replicas_entry(r, e) {
105 		if (!first)
106 			prt_printf(out, " ");
107 		first = false;
108 
109 		bch2_replicas_entry_to_text(out, e);
110 	}
111 }
112 
113 static void extent_to_replicas(struct bkey_s_c k,
114 			       struct bch_replicas_entry *r)
115 {
116 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
117 	const union bch_extent_entry *entry;
118 	struct extent_ptr_decoded p;
119 
120 	r->nr_required	= 1;
121 
122 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
123 		if (p.ptr.cached)
124 			continue;
125 
126 		if (!p.has_ec)
127 			r->devs[r->nr_devs++] = p.ptr.dev;
128 		else
129 			r->nr_required = 0;
130 	}
131 }
132 
133 static void stripe_to_replicas(struct bkey_s_c k,
134 			       struct bch_replicas_entry *r)
135 {
136 	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
137 	const struct bch_extent_ptr *ptr;
138 
139 	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
140 
141 	for (ptr = s.v->ptrs;
142 	     ptr < s.v->ptrs + s.v->nr_blocks;
143 	     ptr++)
144 		r->devs[r->nr_devs++] = ptr->dev;
145 }
146 
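/*
 * Build the replicas entry describing where a key's data lives: one device
 * per non-cached pointer.  Pointers to erasure coded data aren't listed
 * here, and nr_required drops to 0 for such extents - their redundancy comes
 * from the stripe, not from extra copies.
 */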
147 void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
148 			   struct bkey_s_c k)
149 {
150 	e->nr_devs = 0;
151 
152 	switch (k.k->type) {
153 	case KEY_TYPE_btree_ptr:
154 	case KEY_TYPE_btree_ptr_v2:
155 		e->data_type = BCH_DATA_btree;
156 		extent_to_replicas(k, e);
157 		break;
158 	case KEY_TYPE_extent:
159 	case KEY_TYPE_reflink_v:
160 		e->data_type = BCH_DATA_user;
161 		extent_to_replicas(k, e);
162 		break;
163 	case KEY_TYPE_stripe:
164 		e->data_type = BCH_DATA_parity;
165 		stripe_to_replicas(k, e);
166 		break;
167 	}
168 
169 	bch2_replicas_entry_sort(e);
170 }
171 
172 void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
173 			      enum bch_data_type data_type,
174 			      struct bch_devs_list devs)
175 {
176 	unsigned i;
177 
178 	BUG_ON(!data_type ||
179 	       data_type == BCH_DATA_sb ||
180 	       data_type >= BCH_DATA_NR);
181 
182 	e->data_type	= data_type;
183 	e->nr_devs	= 0;
184 	e->nr_required	= 1;
185 
186 	for (i = 0; i < devs.nr; i++)
187 		e->devs[e->nr_devs++] = devs.devs[i];
188 
189 	bch2_replicas_entry_sort(e);
190 }
191 
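/*
 * Returns a copy of @old with @new_entry added and the table re-sorted.  The
 * copy is freshly allocated: a NULL .entries means allocation failure, and
 * on success the caller owns (and must eventually kfree()) .entries.
 */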
192 static struct bch_replicas_cpu
193 cpu_replicas_add_entry(struct bch_fs *c,
194 		       struct bch_replicas_cpu *old,
195 		       struct bch_replicas_entry *new_entry)
196 {
197 	unsigned i;
198 	struct bch_replicas_cpu new = {
199 		.nr		= old->nr + 1,
200 		.entry_size	= max_t(unsigned, old->entry_size,
201 					replicas_entry_bytes(new_entry)),
202 	};
203 
204 	for (i = 0; i < new_entry->nr_devs; i++)
205 		BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
206 
207 	BUG_ON(!new_entry->data_type);
208 	verify_replicas_entry(new_entry);
209 
210 	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
211 	if (!new.entries)
212 		return new;
213 
214 	for (i = 0; i < old->nr; i++)
215 		memcpy(cpu_replicas_entry(&new, i),
216 		       cpu_replicas_entry(old, i),
217 		       old->entry_size);
218 
219 	memcpy(cpu_replicas_entry(&new, old->nr),
220 	       new_entry,
221 	       replicas_entry_bytes(new_entry));
222 
223 	bch2_cpu_replicas_sort(&new);
224 	return new;
225 }
226 
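/*
 * Entry lookup is a fixed size memcmp() over r->entry_size bytes: stored
 * entries are zero padded out to entry_size (the table is kcalloc()d), and
 * the search key must also be readable out to entry_size bytes - which is
 * why callers build search keys in a struct bch_replicas_padded.
 */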
227 static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
228 				       struct bch_replicas_entry *search)
229 {
230 	int idx, entry_size = replicas_entry_bytes(search);
231 
232 	if (unlikely(entry_size > r->entry_size))
233 		return -1;
234 
235 	verify_replicas_entry(search);
236 
237 #define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
238 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
239 			      entry_cmp, search);
240 #undef entry_cmp
241 
242 	return idx < r->nr ? idx : -1;
243 }
244 
245 int bch2_replicas_entry_idx(struct bch_fs *c,
246 			    struct bch_replicas_entry *search)
247 {
248 	bch2_replicas_entry_sort(search);
249 
250 	return __replicas_entry_idx(&c->replicas, search);
251 }
252 
253 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
254 				 struct bch_replicas_entry *search)
255 {
256 	return __replicas_entry_idx(r, search) >= 0;
257 }
258 
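/*
 * Fast path check: is @search already present in c->replicas (and, while gc
 * is running, in c->replicas_gc)?  If so, keys referencing these devices may
 * be written without further superblock updates.
 */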
259 bool bch2_replicas_marked(struct bch_fs *c,
260 			  struct bch_replicas_entry *search)
261 {
262 	bool marked;
263 
264 	if (!search->nr_devs)
265 		return true;
266 
267 	verify_replicas_entry(search);
268 
269 	percpu_down_read(&c->mark_lock);
270 	marked = __replicas_has_entry(&c->replicas, search) &&
271 		(likely((!c->replicas_gc.entries)) ||
272 		 __replicas_has_entry(&c->replicas_gc, search));
273 	percpu_up_read(&c->mark_lock);
274 
275 	return marked;
276 }
277 
278 static void __replicas_table_update(struct bch_fs_usage *dst,
279 				    struct bch_replicas_cpu *dst_r,
280 				    struct bch_fs_usage *src,
281 				    struct bch_replicas_cpu *src_r)
282 {
283 	int src_idx, dst_idx;
284 
285 	*dst = *src;
286 
287 	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
288 		if (!src->replicas[src_idx])
289 			continue;
290 
291 		dst_idx = __replicas_entry_idx(dst_r,
292 				cpu_replicas_entry(src_r, src_idx));
293 		BUG_ON(dst_idx < 0);
294 
295 		dst->replicas[dst_idx] = src->replicas[src_idx];
296 	}
297 }
298 
299 static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
300 				    struct bch_replicas_cpu *dst_r,
301 				    struct bch_fs_usage __percpu *src_p,
302 				    struct bch_replicas_cpu *src_r)
303 {
304 	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
305 	struct bch_fs_usage *dst, *src = (void *)
306 		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
307 
308 	preempt_disable();
309 	dst = this_cpu_ptr(dst_p);
310 	preempt_enable();
311 
312 	__replicas_table_update(dst, dst_r, src, src_r);
313 }
314 
315 /*
316  * Resize filesystem accounting:
317  */
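/*
 * Each struct bch_fs_usage ends in a flexible array with one counter per
 * replicas entry, indexed by that entry's position in c->replicas - so when
 * the replicas table grows, every usage structure (per journal buffer, base,
 * scratch and gc) has to be reallocated and its counters remapped to the
 * indices of the new table.
 */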
318 static int replicas_table_update(struct bch_fs *c,
319 				 struct bch_replicas_cpu *new_r)
320 {
321 	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
322 	struct bch_fs_usage_online *new_scratch = NULL;
323 	struct bch_fs_usage __percpu *new_gc = NULL;
324 	struct bch_fs_usage *new_base = NULL;
325 	unsigned i, bytes = sizeof(struct bch_fs_usage) +
326 		sizeof(u64) * new_r->nr;
327 	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
328 		sizeof(u64) * new_r->nr;
329 	int ret = 0;
330 
331 	memset(new_usage, 0, sizeof(new_usage));
332 
333 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
334 		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
335 					sizeof(u64), GFP_KERNEL)))
336 			goto err;
337 
338 	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
339 	    !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
340 	    (c->usage_gc &&
341 	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
342 		goto err;
343 
344 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
345 		if (c->usage[i])
346 			__replicas_table_update_pcpu(new_usage[i], new_r,
347 						     c->usage[i], &c->replicas);
348 	if (c->usage_base)
349 		__replicas_table_update(new_base,		new_r,
350 					c->usage_base,		&c->replicas);
351 	if (c->usage_gc)
352 		__replicas_table_update_pcpu(new_gc,		new_r,
353 					     c->usage_gc,	&c->replicas);
354 
355 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
356 		swap(c->usage[i],	new_usage[i]);
357 	swap(c->usage_base,	new_base);
358 	swap(c->usage_scratch,	new_scratch);
359 	swap(c->usage_gc,	new_gc);
360 	swap(c->replicas,	*new_r);
361 out:
362 	free_percpu(new_gc);
363 	kfree(new_scratch);
364 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
365 		free_percpu(new_usage[i]);
366 	kfree(new_base);
367 	return ret;
368 err:
369 	bch_err(c, "error updating replicas table: memory allocation failure");
370 	ret = -BCH_ERR_ENOMEM_replicas_table;
371 	goto out;
372 }
373 
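/*
 * Size, in u64s, of the journal entry reservation needed to write out a full
 * set of usage entries - nr_inodes, key_version, one entry per persistent
 * reserve level, and one data usage entry per replicas entry - so that every
 * journal write has room to carry them.
 */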
374 static unsigned reserve_journal_replicas(struct bch_fs *c,
375 				     struct bch_replicas_cpu *r)
376 {
377 	struct bch_replicas_entry *e;
378 	unsigned journal_res_u64s = 0;
379 
380 	/* nr_inodes: */
381 	journal_res_u64s +=
382 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
383 
384 	/* key_version: */
385 	journal_res_u64s +=
386 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
387 
388 	/* persistent_reserved: */
389 	journal_res_u64s +=
390 		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
391 		BCH_REPLICAS_MAX;
392 
393 	for_each_cpu_replicas_entry(r, e)
394 		journal_res_u64s +=
395 			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
396 				     e->nr_devs, sizeof(u64));
397 	return journal_res_u64s;
398 }
399 
400 noinline
401 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
402 				struct bch_replicas_entry *new_entry)
403 {
404 	struct bch_replicas_cpu new_r, new_gc;
405 	int ret = 0;
406 
407 	verify_replicas_entry(new_entry);
408 
409 	memset(&new_r, 0, sizeof(new_r));
410 	memset(&new_gc, 0, sizeof(new_gc));
411 
412 	mutex_lock(&c->sb_lock);
413 
414 	if (c->replicas_gc.entries &&
415 	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
416 		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
417 		if (!new_gc.entries) {
418 			ret = -BCH_ERR_ENOMEM_cpu_replicas;
419 			goto err;
420 		}
421 	}
422 
423 	if (!__replicas_has_entry(&c->replicas, new_entry)) {
424 		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
425 		if (!new_r.entries) {
426 			ret = -BCH_ERR_ENOMEM_cpu_replicas;
427 			goto err;
428 		}
429 
430 		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
431 		if (ret)
432 			goto err;
433 
434 		bch2_journal_entry_res_resize(&c->journal,
435 				&c->replicas_journal_res,
436 				reserve_journal_replicas(c, &new_r));
437 	}
438 
439 	if (!new_r.entries &&
440 	    !new_gc.entries)
441 		goto out;
442 
443 	/* allocations done, now commit: */
444 
445 	if (new_r.entries)
446 		bch2_write_super(c);
447 
448 	/* don't update in memory replicas until changes are persistent */
449 	percpu_down_write(&c->mark_lock);
450 	if (new_r.entries)
451 		ret = replicas_table_update(c, &new_r);
452 	if (new_gc.entries)
453 		swap(new_gc, c->replicas_gc);
454 	percpu_up_write(&c->mark_lock);
455 out:
456 	mutex_unlock(&c->sb_lock);
457 
458 	kfree(new_r.entries);
459 	kfree(new_gc.entries);
460 
461 	return ret;
462 err:
463 	bch_err_msg(c, ret, "adding replicas entry");
464 	goto out;
465 }
466 
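/*
 * A minimal usage sketch (assuming a struct bch_fs *c and a populated
 * struct bch_devs_list devs in scope): before user data is written to the
 * devices in @devs, the matching replicas entry must be marked -
 *
 *	struct bch_replicas_padded r;
 *
 *	bch2_devlist_to_replicas(&r.e, BCH_DATA_user, devs);
 *	int ret = bch2_mark_replicas(c, &r.e);
 */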
467 int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
468 {
469 	return likely(bch2_replicas_marked(c, r))
470 		? 0 : bch2_mark_replicas_slowpath(c, r);
471 }
472 
473 /* replicas delta list: */
474 
475 int bch2_replicas_delta_list_mark(struct bch_fs *c,
476 				  struct replicas_delta_list *r)
477 {
478 	struct replicas_delta *d = r->d;
479 	struct replicas_delta *top = (void *) r->d + r->used;
480 	int ret = 0;
481 
482 	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
483 		ret = bch2_mark_replicas(c, &d->r);
484 	return ret;
485 }
486 
487 /*
488  * Old replicas_gc mechanism: only used for journal replicas entries now, should
489  * die at some point:
490  */
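/*
 * Usage sketch (roughly what journal reclaim does): with replicas_gc_lock
 * held, start gc for the data types being pruned, re-mark the entries still
 * in use, then commit:
 *
 *	mutex_lock(&c->replicas_gc_lock);
 *	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
 *	... bch2_mark_replicas(c, ...) for each entry still referenced ...
 *	ret = bch2_replicas_gc_end(c, ret);
 *	mutex_unlock(&c->replicas_gc_lock);
 */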
491 
492 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
493 {
494 	lockdep_assert_held(&c->replicas_gc_lock);
495 
496 	mutex_lock(&c->sb_lock);
497 	percpu_down_write(&c->mark_lock);
498 
499 	ret =   ret ?:
500 		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
501 		replicas_table_update(c, &c->replicas_gc);
502 
503 	kfree(c->replicas_gc.entries);
504 	c->replicas_gc.entries = NULL;
505 
506 	percpu_up_write(&c->mark_lock);
507 
508 	if (!ret)
509 		bch2_write_super(c);
510 
511 	mutex_unlock(&c->sb_lock);
512 
513 	return ret;
514 }
515 
516 int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
517 {
518 	struct bch_replicas_entry *e;
519 	unsigned i = 0;
520 
521 	lockdep_assert_held(&c->replicas_gc_lock);
522 
523 	mutex_lock(&c->sb_lock);
524 	BUG_ON(c->replicas_gc.entries);
525 
526 	c->replicas_gc.nr		= 0;
527 	c->replicas_gc.entry_size	= 0;
528 
529 	for_each_cpu_replicas_entry(&c->replicas, e)
530 		if (!((1 << e->data_type) & typemask)) {
531 			c->replicas_gc.nr++;
532 			c->replicas_gc.entry_size =
533 				max_t(unsigned, c->replicas_gc.entry_size,
534 				      replicas_entry_bytes(e));
535 		}
536 
537 	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
538 					 c->replicas_gc.entry_size,
539 					 GFP_KERNEL);
540 	if (!c->replicas_gc.entries) {
541 		mutex_unlock(&c->sb_lock);
542 		bch_err(c, "error allocating c->replicas_gc");
543 		return -BCH_ERR_ENOMEM_replicas_gc;
544 	}
545 
546 	for_each_cpu_replicas_entry(&c->replicas, e)
547 		if (!((1 << e->data_type) & typemask))
548 			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
549 			       e, c->replicas_gc.entry_size);
550 
551 	bch2_cpu_replicas_sort(&c->replicas_gc);
552 	mutex_unlock(&c->sb_lock);
553 
554 	return 0;
555 }
556 
557 /*
558  * New much simpler mechanism for clearing out unneeded replicas entries - drop
559  * replicas entries that have 0 sectors used.
560  *
561  * However, we don't track sector counts for journal usage, so this doesn't drop
562  * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
563  * is retained for that.
564  */
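/*
 * Note that the copy of the table below is sized and allocated without
 * sb_lock or mark_lock held, so if c->replicas is resized underneath us the
 * allocation is thrown away and we retry.
 */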
565 int bch2_replicas_gc2(struct bch_fs *c)
566 {
567 	struct bch_replicas_cpu new = { 0 };
568 	unsigned i, nr;
569 	int ret = 0;
570 
571 	bch2_journal_meta(&c->journal);
572 retry:
573 	nr		= READ_ONCE(c->replicas.nr);
574 	new.entry_size	= READ_ONCE(c->replicas.entry_size);
575 	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
576 	if (!new.entries) {
577 		bch_err(c, "error allocating c->replicas_gc");
578 		return -BCH_ERR_ENOMEM_replicas_gc;
579 	}
580 
581 	mutex_lock(&c->sb_lock);
582 	percpu_down_write(&c->mark_lock);
583 
584 	if (nr			!= c->replicas.nr ||
585 	    new.entry_size	!= c->replicas.entry_size) {
586 		percpu_up_write(&c->mark_lock);
587 		mutex_unlock(&c->sb_lock);
588 		kfree(new.entries);
589 		goto retry;
590 	}
591 
592 	for (i = 0; i < c->replicas.nr; i++) {
593 		struct bch_replicas_entry *e =
594 			cpu_replicas_entry(&c->replicas, i);
595 
596 		if (e->data_type == BCH_DATA_journal ||
597 		    c->usage_base->replicas[i] ||
598 		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
599 		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
600 		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
601 		    percpu_u64_get(&c->usage[3]->replicas[i]))
602 			memcpy(cpu_replicas_entry(&new, new.nr++),
603 			       e, new.entry_size);
604 	}
605 
606 	bch2_cpu_replicas_sort(&new);
607 
608 	ret =   bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
609 		replicas_table_update(c, &new);
610 
611 	kfree(new.entries);
612 
613 	percpu_up_write(&c->mark_lock);
614 
615 	if (!ret)
616 		bch2_write_super(c);
617 
618 	mutex_unlock(&c->sb_lock);
619 
620 	return ret;
621 }
622 
623 int bch2_replicas_set_usage(struct bch_fs *c,
624 			    struct bch_replicas_entry *r,
625 			    u64 sectors)
626 {
627 	int ret, idx = bch2_replicas_entry_idx(c, r);
628 
629 	if (idx < 0) {
630 		struct bch_replicas_cpu n;
631 
632 		n = cpu_replicas_add_entry(c, &c->replicas, r);
633 		if (!n.entries)
634 			return -BCH_ERR_ENOMEM_cpu_replicas;
635 
636 		ret = replicas_table_update(c, &n);
637 		if (ret)
638 			return ret;
639 
640 		kfree(n.entries);
641 
642 		idx = bch2_replicas_entry_idx(c, r);
643 		BUG_ON(idx < 0);
644 	}
645 
646 	c->usage_base->replicas[idx] = sectors;
647 
648 	return 0;
649 }
650 
651 /* Replicas tracking - superblock: */
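/*
 * Two on disk encodings exist: bch_sb_field_replicas_v0, whose entries have
 * no nr_required field (it is implicitly 1), and bch_sb_field_replicas,
 * whose entries do.  Whichever field is present is parsed into the common in
 * memory form; when writing, the v0 field is used unless some entry actually
 * needs nr_required != 1.
 */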
652 
653 static int
654 __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
655 				   struct bch_replicas_cpu *cpu_r)
656 {
657 	struct bch_replicas_entry *e, *dst;
658 	unsigned nr = 0, entry_size = 0, idx = 0;
659 
660 	for_each_replicas_entry(sb_r, e) {
661 		entry_size = max_t(unsigned, entry_size,
662 				   replicas_entry_bytes(e));
663 		nr++;
664 	}
665 
666 	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
667 	if (!cpu_r->entries)
668 		return -BCH_ERR_ENOMEM_cpu_replicas;
669 
670 	cpu_r->nr		= nr;
671 	cpu_r->entry_size	= entry_size;
672 
673 	for_each_replicas_entry(sb_r, e) {
674 		dst = cpu_replicas_entry(cpu_r, idx++);
675 		memcpy(dst, e, replicas_entry_bytes(e));
676 		bch2_replicas_entry_sort(dst);
677 	}
678 
679 	return 0;
680 }
681 
682 static int
683 __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
684 				      struct bch_replicas_cpu *cpu_r)
685 {
686 	struct bch_replicas_entry_v0 *e;
687 	unsigned nr = 0, entry_size = 0, idx = 0;
688 
689 	for_each_replicas_entry(sb_r, e) {
690 		entry_size = max_t(unsigned, entry_size,
691 				   replicas_entry_bytes(e));
692 		nr++;
693 	}
694 
695 	entry_size += sizeof(struct bch_replicas_entry) -
696 		sizeof(struct bch_replicas_entry_v0);
697 
698 	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
699 	if (!cpu_r->entries)
700 		return -BCH_ERR_ENOMEM_cpu_replicas;
701 
702 	cpu_r->nr		= nr;
703 	cpu_r->entry_size	= entry_size;
704 
705 	for_each_replicas_entry(sb_r, e) {
706 		struct bch_replicas_entry *dst =
707 			cpu_replicas_entry(cpu_r, idx++);
708 
709 		dst->data_type	= e->data_type;
710 		dst->nr_devs	= e->nr_devs;
711 		dst->nr_required = 1;
712 		memcpy(dst->devs, e->devs, e->nr_devs);
713 		bch2_replicas_entry_sort(dst);
714 	}
715 
716 	return 0;
717 }
718 
719 int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
720 {
721 	struct bch_sb_field_replicas *sb_v1;
722 	struct bch_sb_field_replicas_v0 *sb_v0;
723 	struct bch_replicas_cpu new_r = { 0, 0, NULL };
724 	int ret = 0;
725 
726 	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
727 		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
728 	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
729 		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
730 	if (ret)
731 		return ret;
732 
733 	bch2_cpu_replicas_sort(&new_r);
734 
735 	percpu_down_write(&c->mark_lock);
736 
737 	ret = replicas_table_update(c, &new_r);
738 	percpu_up_write(&c->mark_lock);
739 
740 	kfree(new_r.entries);
741 
742 	return ret;
743 }
744 
745 static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
746 					       struct bch_replicas_cpu *r)
747 {
748 	struct bch_sb_field_replicas_v0 *sb_r;
749 	struct bch_replicas_entry_v0 *dst;
750 	struct bch_replicas_entry *src;
751 	size_t bytes;
752 
753 	bytes = sizeof(struct bch_sb_field_replicas);
754 
755 	for_each_cpu_replicas_entry(r, src)
756 		bytes += replicas_entry_bytes(src) - 1;
757 
758 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
759 			DIV_ROUND_UP(bytes, sizeof(u64)));
760 	if (!sb_r)
761 		return -BCH_ERR_ENOSPC_sb_replicas;
762 
763 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
764 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
765 
766 	memset(&sb_r->entries, 0,
767 	       vstruct_end(&sb_r->field) -
768 	       (void *) &sb_r->entries);
769 
770 	dst = sb_r->entries;
771 	for_each_cpu_replicas_entry(r, src) {
772 		dst->data_type	= src->data_type;
773 		dst->nr_devs	= src->nr_devs;
774 		memcpy(dst->devs, src->devs, src->nr_devs);
775 
776 		dst = replicas_entry_next(dst);
777 
778 		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
779 	}
780 
781 	return 0;
782 }
783 
784 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
785 					    struct bch_replicas_cpu *r)
786 {
787 	struct bch_sb_field_replicas *sb_r;
788 	struct bch_replicas_entry *dst, *src;
789 	bool need_v1 = false;
790 	size_t bytes;
791 
792 	bytes = sizeof(struct bch_sb_field_replicas);
793 
794 	for_each_cpu_replicas_entry(r, src) {
795 		bytes += replicas_entry_bytes(src);
796 		if (src->nr_required != 1)
797 			need_v1 = true;
798 	}
799 
800 	if (!need_v1)
801 		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
802 
803 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
804 			DIV_ROUND_UP(bytes, sizeof(u64)));
805 	if (!sb_r)
806 		return -BCH_ERR_ENOSPC_sb_replicas;
807 
808 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
809 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
810 
811 	memset(&sb_r->entries, 0,
812 	       vstruct_end(&sb_r->field) -
813 	       (void *) &sb_r->entries);
814 
815 	dst = sb_r->entries;
816 	for_each_cpu_replicas_entry(r, src) {
817 		memcpy(dst, src, replicas_entry_bytes(src));
818 
819 		dst = replicas_entry_next(dst);
820 
821 		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
822 	}
823 
824 	return 0;
825 }
826 
827 static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
828 				      struct bch_sb *sb,
829 				      struct printbuf *err)
830 {
831 	unsigned i;
832 
833 	sort_cmp_size(cpu_r->entries,
834 		      cpu_r->nr,
835 		      cpu_r->entry_size,
836 		      memcmp, NULL);
837 
838 	for (i = 0; i < cpu_r->nr; i++) {
839 		struct bch_replicas_entry *e =
840 			cpu_replicas_entry(cpu_r, i);
841 
842 		int ret = bch2_replicas_entry_validate(e, sb, err);
843 		if (ret)
844 			return ret;
845 
846 		if (i + 1 < cpu_r->nr) {
847 			struct bch_replicas_entry *n =
848 				cpu_replicas_entry(cpu_r, i + 1);
849 
850 			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
851 
852 			if (!memcmp(e, n, cpu_r->entry_size)) {
853 				prt_printf(err, "duplicate replicas entry ");
854 				bch2_replicas_entry_to_text(err, e);
855 				return -BCH_ERR_invalid_sb_replicas;
856 			}
857 		}
858 	}
859 
860 	return 0;
861 }
862 
863 static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
864 				     struct printbuf *err)
865 {
866 	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
867 	struct bch_replicas_cpu cpu_r;
868 	int ret;
869 
870 	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
871 	if (ret)
872 		return ret;
873 
874 	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
875 	kfree(cpu_r.entries);
876 	return ret;
877 }
878 
879 static void bch2_sb_replicas_to_text(struct printbuf *out,
880 				     struct bch_sb *sb,
881 				     struct bch_sb_field *f)
882 {
883 	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
884 	struct bch_replicas_entry *e;
885 	bool first = true;
886 
887 	for_each_replicas_entry(r, e) {
888 		if (!first)
889 			prt_printf(out, " ");
890 		first = false;
891 
892 		bch2_replicas_entry_to_text(out, e);
893 	}
894 	prt_newline(out);
895 }
896 
897 const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
898 	.validate	= bch2_sb_replicas_validate,
899 	.to_text	= bch2_sb_replicas_to_text,
900 };
901 
902 static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
903 					struct printbuf *err)
904 {
905 	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
906 	struct bch_replicas_cpu cpu_r;
907 	int ret;
908 
909 	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
910 	if (ret)
911 		return ret;
912 
913 	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
914 	kfree(cpu_r.entries);
915 	return ret;
916 }
917 
918 static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
919 					struct bch_sb *sb,
920 					struct bch_sb_field *f)
921 {
922 	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
923 	struct bch_replicas_entry_v0 *e;
924 	bool first = true;
925 
926 	for_each_replicas_entry(sb_r, e) {
927 		if (!first)
928 			prt_printf(out, " ");
929 		first = false;
930 
931 		bch2_replicas_entry_v0_to_text(out, e);
932 	}
933 	prt_newline(out);
934 }
935 
936 const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
937 	.validate	= bch2_sb_replicas_v0_validate,
938 	.to_text	= bch2_sb_replicas_v0_to_text,
939 };
940 
941 /* Query replicas: */
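/*
 * bch2_have_enough_devs(): given the set of online devices in @devs, check
 * every replicas entry; if data would be lost (fewer than nr_required
 * devices online) or degraded (fewer than nr_devs online) and the
 * corresponding BCH_FORCE_IF_* bit isn't set in @flags, report that we don't
 * have enough devices.
 */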
942 
943 bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
944 			   unsigned flags, bool print)
945 {
946 	struct bch_replicas_entry *e;
947 	bool ret = true;
948 
949 	percpu_down_read(&c->mark_lock);
950 	for_each_cpu_replicas_entry(&c->replicas, e) {
951 		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
952 		bool metadata = e->data_type < BCH_DATA_user;
953 
954 		if (e->data_type == BCH_DATA_cached)
955 			continue;
956 
957 		for (i = 0; i < e->nr_devs; i++) {
958 			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
959 
960 			nr_online += test_bit(e->devs[i], devs.d);
961 			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
962 		}
963 
964 		if (nr_failed == e->nr_devs)
965 			continue;
966 
967 		if (nr_online < e->nr_required)
968 			dflags |= metadata
969 				? BCH_FORCE_IF_METADATA_LOST
970 				: BCH_FORCE_IF_DATA_LOST;
971 
972 		if (nr_online < e->nr_devs)
973 			dflags |= metadata
974 				? BCH_FORCE_IF_METADATA_DEGRADED
975 				: BCH_FORCE_IF_DATA_DEGRADED;
976 
977 		if (dflags & ~flags) {
978 			if (print) {
979 				struct printbuf buf = PRINTBUF;
980 
981 				bch2_replicas_entry_to_text(&buf, e);
982 				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
983 					nr_online, buf.buf);
984 				printbuf_exit(&buf);
985 			}
986 			ret = false;
987 			break;
988 		}
989 
990 	}
991 	percpu_up_read(&c->mark_lock);
992 
993 	return ret;
994 }
995 
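/*
 * Returns a bitmask of data types (1 << BCH_DATA_*) that the superblock's
 * replicas entries say are present on device @dev.
 */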
996 unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
997 {
998 	struct bch_sb_field_replicas *replicas;
999 	struct bch_sb_field_replicas_v0 *replicas_v0;
1000 	unsigned i, data_has = 0;
1001 
1002 	replicas = bch2_sb_field_get(sb, replicas);
1003 	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
1004 
1005 	if (replicas) {
1006 		struct bch_replicas_entry *r;
1007 
1008 		for_each_replicas_entry(replicas, r)
1009 			for (i = 0; i < r->nr_devs; i++)
1010 				if (r->devs[i] == dev)
1011 					data_has |= 1 << r->data_type;
1012 	} else if (replicas_v0) {
1013 		struct bch_replicas_entry_v0 *r;
1014 
1015 		for_each_replicas_entry_v0(replicas_v0, r)
1016 			for (i = 0; i < r->nr_devs; i++)
1017 				if (r->devs[i] == dev)
1018 					data_has |= 1 << r->data_type;
1019 	}
1020 
1021 
1022 	return data_has;
1023 }
1024 
1025 unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
1026 {
1027 	unsigned ret;
1028 
1029 	mutex_lock(&c->sb_lock);
1030 	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
1031 	mutex_unlock(&c->sb_lock);
1032 
1033 	return ret;
1034 }
1035 
1036 void bch2_fs_replicas_exit(struct bch_fs *c)
1037 {
1038 	unsigned i;
1039 
1040 	kfree(c->usage_scratch);
1041 	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
1042 		free_percpu(c->usage[i]);
1043 	kfree(c->usage_base);
1044 	kfree(c->replicas.entries);
1045 	kfree(c->replicas_gc.entries);
1046 
1047 	mempool_exit(&c->replicas_delta_pool);
1048 }
1049 
1050 int bch2_fs_replicas_init(struct bch_fs *c)
1051 {
1052 	bch2_journal_entry_res_resize(&c->journal,
1053 			&c->replicas_journal_res,
1054 			reserve_journal_replicas(c, &c->replicas));
1055 
1056 	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
1057 					 REPLICAS_DELTA_LIST_MAX) ?:
1058 		replicas_table_update(c, &c->replicas);
1059 }
1060