xref: /linux/fs/bcachefs/replicas.c (revision 08df80a3c51674ab73ae770885a383ca553fbbbf)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "buckets.h"
5 #include "journal.h"
6 #include "replicas.h"
7 #include "super-io.h"
8 
9 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
10 					    struct bch_replicas_cpu *);
11 
12 /* Replicas tracking - in memory: */
13 
/*
 * Debug-only sanity checks on an in-memory replicas entry: known data type,
 * at least one device, sane nr_required, and a sorted, duplicate-free devs[]
 * list. Compiles to a no-op unless CONFIG_BCACHEFS_DEBUG is set.
 */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	unsigned i;

	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	/* devs[] must be strictly increasing - sorted with no duplicates: */
	for (i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}
28 
/*
 * Sort an entry's device list into canonical (ascending) order so entries
 * can be compared bytewise with memcmp(). Lists are tiny (<= replication
 * factor), so bubble sort is fine.
 */
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
33 
/*
 * Sort the whole in-memory replicas table into eytzinger (cache-friendly
 * breadth-first) order, keyed by raw memcmp of the fixed-size entries -
 * required before lookups via eytzinger0_find() in __replicas_entry_idx().
 */
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
38 
39 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
40 					   struct bch_replicas_entry_v0 *e)
41 {
42 	unsigned i;
43 
44 	if (e->data_type < BCH_DATA_NR)
45 		prt_printf(out, "%s", bch2_data_types[e->data_type]);
46 	else
47 		prt_printf(out, "(invalid data type %u)", e->data_type);
48 
49 	prt_printf(out, ": %u [", e->nr_devs);
50 	for (i = 0; i < e->nr_devs; i++)
51 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
52 	prt_printf(out, "]");
53 }
54 
55 void bch2_replicas_entry_to_text(struct printbuf *out,
56 				 struct bch_replicas_entry_v1 *e)
57 {
58 	unsigned i;
59 
60 	if (e->data_type < BCH_DATA_NR)
61 		prt_printf(out, "%s", bch2_data_types[e->data_type]);
62 	else
63 		prt_printf(out, "(invalid data type %u)", e->data_type);
64 
65 	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
66 	for (i = 0; i < e->nr_devs; i++)
67 		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
68 	prt_printf(out, "]");
69 }
70 
/*
 * Validate a replicas entry against a superblock: non-empty device list,
 * sane nr_required, and every listed device exists in @sb.
 *
 * On failure appends a description (including the entry itself) to @err and
 * returns -BCH_ERR_invalid_replicas_entry; returns 0 on success.
 */
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_sb *sb,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!bch2_dev_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}
97 
98 void bch2_cpu_replicas_to_text(struct printbuf *out,
99 			       struct bch_replicas_cpu *r)
100 {
101 	struct bch_replicas_entry_v1 *e;
102 	bool first = true;
103 
104 	for_each_cpu_replicas_entry(r, e) {
105 		if (!first)
106 			prt_printf(out, " ");
107 		first = false;
108 
109 		bch2_replicas_entry_to_text(out, e);
110 	}
111 }
112 
/*
 * Build the device list of a replicas entry from an extent's pointers.
 * Cached pointers are skipped - they don't contribute to durability.
 * If any pointer is erasure coded (has_ec), nr_required is cleared to 0;
 * presumably durability is then accounted via the stripe entry instead -
 * confirm against the callers of bch2_bkey_to_replicas().
 * Caller must have set r->nr_devs = 0 beforehand.
 */
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
		else
			r->nr_required = 0;
	}
}
132 
133 static void stripe_to_replicas(struct bkey_s_c k,
134 			       struct bch_replicas_entry_v1 *r)
135 {
136 	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
137 	const struct bch_extent_ptr *ptr;
138 
139 	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
140 
141 	for (ptr = s.v->ptrs;
142 	     ptr < s.v->ptrs + s.v->nr_blocks;
143 	     ptr++)
144 		r->devs[r->nr_devs++] = ptr->dev;
145 }
146 
/*
 * Fill in a replicas entry describing where a given bkey's data lives:
 * data type is picked from the key type (btree node, user data, or parity),
 * the device list from the key's pointers. Keys of other types produce an
 * empty entry (nr_devs == 0). The device list is left in sorted order.
 */
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}
171 
/*
 * Build a replicas entry of type @data_type from an explicit device list,
 * with nr_required = 1. BUGs on data types that are never tracked this way
 * (none/sb/out-of-range). The device list is left in sorted order.
 */
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		e->devs[e->nr_devs++] = *i;

	bch2_replicas_entry_sort(e);
}
189 
/*
 * Return a newly allocated copy of @old with @new_entry appended and the
 * result re-sorted. Entry size grows if the new entry is larger than the
 * table's current fixed entry size. On allocation failure the returned
 * struct has .entries == NULL - callers must check. The caller owns (and
 * eventually kfree()s) the returned entries array; @old is not modified.
 */
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	unsigned i;
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	for (i = 0; i < new_entry->nr_devs; i++)
		BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));

	BUG_ON(!new_entry->data_type);
	verify_replicas_entry(new_entry);

	/* kcalloc zeroes, so the padding past each entry is zeroed too */
	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}
224 
/*
 * Look up @search in the eytzinger-sorted table @r; returns its index or -1
 * if absent. Comparison uses only replicas_entry_bytes(search) bytes - this
 * relies on entries being zero-padded out to r->entry_size, so a shorter
 * search key still compares equal to its padded table entry.
 */
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

	verify_replicas_entry(search);

#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	/* eytzinger0_find() returns nr when not found */
	return idx < r->nr ? idx : -1;
}
242 
/*
 * Public lookup wrapper: canonicalizes @search (sorts its device list,
 * mutating the caller's entry) then searches the filesystem's replicas
 * table. Returns the index, or -1 if not present.
 */
int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}
250 
/* True iff @search is present in table @r. */
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}
256 
/*
 * Is @search already recorded in the filesystem replicas table (and, while
 * a replicas GC is running, also in the GC table)? Empty entries are
 * trivially marked. Takes mark_lock for a consistent read of both tables.
 */
bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry(search);

	percpu_down_read(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(likely((!c->replicas_gc.entries)) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read(&c->mark_lock);

	return marked;
}
275 
/*
 * Copy usage counters from @src (indexed by @src_r's entry order) into @dst
 * (indexed by @dst_r's order), remapping each nonzero counter to its entry's
 * new index. Every entry of @src_r with a nonzero counter must exist in
 * @dst_r - BUGs otherwise.
 */
static void __replicas_table_update(struct bch_fs_usage *dst,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage *src,
				    struct bch_replicas_cpu *src_r)
{
	int src_idx, dst_idx;

	/* copies the fixed header; replicas[] remap follows */
	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}
296 
/*
 * Percpu variant: sums @src_p's percpu counters into one cpu's slot via
 * bch2_acc_percpu_u64s(), then remaps them into one cpu's slot of @dst_p.
 *
 * NOTE(review): dst is a this_cpu_ptr() taken under preempt_disable() but
 * dereferenced after preempt_enable(); presumably safe because callers hold
 * mark_lock for write while these tables are being replaced - confirm.
 */
static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage __percpu *src_p,
				    struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	__replicas_table_update(dst, dst_r, src, src_r);
}
312 
313 /*
314  * Resize filesystem accounting:
315  */
316 static int replicas_table_update(struct bch_fs *c,
317 				 struct bch_replicas_cpu *new_r)
318 {
319 	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
320 	struct bch_fs_usage_online *new_scratch = NULL;
321 	struct bch_fs_usage __percpu *new_gc = NULL;
322 	struct bch_fs_usage *new_base = NULL;
323 	unsigned i, bytes = sizeof(struct bch_fs_usage) +
324 		sizeof(u64) * new_r->nr;
325 	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
326 		sizeof(u64) * new_r->nr;
327 	int ret = 0;
328 
329 	memset(new_usage, 0, sizeof(new_usage));
330 
331 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
332 		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
333 					sizeof(u64), GFP_KERNEL)))
334 			goto err;
335 
336 	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
337 	    !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
338 	    (c->usage_gc &&
339 	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
340 		goto err;
341 
342 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
343 		if (c->usage[i])
344 			__replicas_table_update_pcpu(new_usage[i], new_r,
345 						     c->usage[i], &c->replicas);
346 	if (c->usage_base)
347 		__replicas_table_update(new_base,		new_r,
348 					c->usage_base,		&c->replicas);
349 	if (c->usage_gc)
350 		__replicas_table_update_pcpu(new_gc,		new_r,
351 					     c->usage_gc,	&c->replicas);
352 
353 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
354 		swap(c->usage[i],	new_usage[i]);
355 	swap(c->usage_base,	new_base);
356 	swap(c->usage_scratch,	new_scratch);
357 	swap(c->usage_gc,	new_gc);
358 	swap(c->replicas,	*new_r);
359 out:
360 	free_percpu(new_gc);
361 	kfree(new_scratch);
362 	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
363 		free_percpu(new_usage[i]);
364 	kfree(new_base);
365 	return ret;
366 err:
367 	bch_err(c, "error updating replicas table: memory allocation failure");
368 	ret = -BCH_ERR_ENOMEM_replicas_table;
369 	goto out;
370 }
371 
/*
 * Compute, in u64s, the journal reservation needed to write out usage info
 * for every entry in @r: fixed jset_entry_usage slots for nr_inodes,
 * key_version and the persistent_reserved array, plus one variable-sized
 * jset_entry_data_usage per replicas entry.
 */
static unsigned reserve_journal_replicas(struct bch_fs *c,
				     struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	/* one data_usage entry per replicas entry, devs[] included: */
	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}
397 
/*
 * Slowpath for bch2_mark_replicas(): add @new_entry to the in-memory
 * replicas table (and the gc table if a gc is running), persist the new
 * table to the superblock, and resize the journal reservation.
 *
 * Ordering matters: all allocations and the superblock write happen before
 * the in-memory tables are updated under mark_lock, so the on-disk replicas
 * section never lags the tables in memory.
 */
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	/* entry already present everywhere - nothing to do: */
	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	/* after the swaps above these hold the old tables (or NULL): */
	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}
464 
/*
 * Ensure @r is present in the replicas table; fast path is a lockless-ish
 * lookup, falling back to the slowpath that allocates and persists.
 */
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}
470 
471 /* replicas delta list: */
472 
473 int bch2_replicas_delta_list_mark(struct bch_fs *c,
474 				  struct replicas_delta_list *r)
475 {
476 	struct replicas_delta *d = r->d;
477 	struct replicas_delta *top = (void *) r->d + r->used;
478 	int ret = 0;
479 
480 	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
481 		ret = bch2_mark_replicas(c, &d->r);
482 	return ret;
483 }
484 
485 /*
486  * Old replicas_gc mechanism: only used for journal replicas entries now, should
487  * die at some point:
488  */
489 
/*
 * Finish a replicas gc started by bch2_replicas_gc_start(): install the gc
 * table as the live table (in memory and in the superblock), then free it.
 * If @ret is already an error, the install is skipped but the gc table is
 * still torn down. Superblock is only written if everything succeeded.
 */
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret =   ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
		replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}
513 
/*
 * Begin a replicas gc: seed c->replicas_gc with every current entry whose
 * data type is NOT in @typemask. Entries of the masked types are expected
 * to be re-marked while the gc runs; anything not re-marked is dropped when
 * bch2_replicas_gc_end() installs the gc table.
 */
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	/* first pass: count entries to keep and find the max entry size */
	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	/* second pass: copy the kept entries over */
	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}
554 
555 /*
556  * New much simpler mechanism for clearing out unneeded replicas entries - drop
557  * replicas entries that have 0 sectors used.
558  *
559  * However, we don't track sector counts for journal usage, so this doesn't drop
560  * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
561  * is retained for that.
562  */
563 int bch2_replicas_gc2(struct bch_fs *c)
564 {
565 	struct bch_replicas_cpu new = { 0 };
566 	unsigned i, nr;
567 	int ret = 0;
568 
569 	bch2_journal_meta(&c->journal);
570 retry:
571 	nr		= READ_ONCE(c->replicas.nr);
572 	new.entry_size	= READ_ONCE(c->replicas.entry_size);
573 	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
574 	if (!new.entries) {
575 		bch_err(c, "error allocating c->replicas_gc");
576 		return -BCH_ERR_ENOMEM_replicas_gc;
577 	}
578 
579 	mutex_lock(&c->sb_lock);
580 	percpu_down_write(&c->mark_lock);
581 
582 	if (nr			!= c->replicas.nr ||
583 	    new.entry_size	!= c->replicas.entry_size) {
584 		percpu_up_write(&c->mark_lock);
585 		mutex_unlock(&c->sb_lock);
586 		kfree(new.entries);
587 		goto retry;
588 	}
589 
590 	for (i = 0; i < c->replicas.nr; i++) {
591 		struct bch_replicas_entry_v1 *e =
592 			cpu_replicas_entry(&c->replicas, i);
593 
594 		if (e->data_type == BCH_DATA_journal ||
595 		    c->usage_base->replicas[i] ||
596 		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
597 		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
598 		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
599 		    percpu_u64_get(&c->usage[3]->replicas[i]))
600 			memcpy(cpu_replicas_entry(&new, new.nr++),
601 			       e, new.entry_size);
602 	}
603 
604 	bch2_cpu_replicas_sort(&new);
605 
606 	ret =   bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
607 		replicas_table_update(c, &new);
608 
609 	kfree(new.entries);
610 
611 	percpu_up_write(&c->mark_lock);
612 
613 	if (!ret)
614 		bch2_write_super(c);
615 
616 	mutex_unlock(&c->sb_lock);
617 
618 	return ret;
619 }
620 
621 int bch2_replicas_set_usage(struct bch_fs *c,
622 			    struct bch_replicas_entry_v1 *r,
623 			    u64 sectors)
624 {
625 	int ret, idx = bch2_replicas_entry_idx(c, r);
626 
627 	if (idx < 0) {
628 		struct bch_replicas_cpu n;
629 
630 		n = cpu_replicas_add_entry(c, &c->replicas, r);
631 		if (!n.entries)
632 			return -BCH_ERR_ENOMEM_cpu_replicas;
633 
634 		ret = replicas_table_update(c, &n);
635 		if (ret)
636 			return ret;
637 
638 		kfree(n.entries);
639 
640 		idx = bch2_replicas_entry_idx(c, r);
641 		BUG_ON(ret < 0);
642 	}
643 
644 	c->usage_base->replicas[idx] = sectors;
645 
646 	return 0;
647 }
648 
649 /* Replicas tracking - superblock: */
650 
/*
 * Convert the superblock's v1 replicas section into an in-memory table:
 * one pass to count entries and find the max entry size, an allocation,
 * then a copy pass that canonicalizes (sorts) each entry's device list.
 * Fills *cpu_r; returns 0 or -BCH_ERR_ENOMEM_cpu_replicas.
 */
static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}
679 
/*
 * Convert the superblock's legacy v0 replicas section into an in-memory
 * table of v1 entries: entry size is padded to account for v1's extra
 * header bytes, nr_required defaults to 1, and each device list is sorted.
 * Fills *cpu_r; returns 0 or -BCH_ERR_ENOMEM_cpu_replicas.
 */
static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	/* v1 entries are larger than v0; grow the per-entry size: */
	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}
716 
717 int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
718 {
719 	struct bch_sb_field_replicas *sb_v1;
720 	struct bch_sb_field_replicas_v0 *sb_v0;
721 	struct bch_replicas_cpu new_r = { 0, 0, NULL };
722 	int ret = 0;
723 
724 	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
725 		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
726 	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
727 		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
728 	if (ret)
729 		return ret;
730 
731 	bch2_cpu_replicas_sort(&new_r);
732 
733 	percpu_down_write(&c->mark_lock);
734 
735 	ret = replicas_table_update(c, &new_r);
736 	percpu_up_write(&c->mark_lock);
737 
738 	kfree(new_r.entries);
739 
740 	return 0;
741 }
742 
/*
 * Write the in-memory table to the superblock in the compact v0 format
 * (which drops the one-byte nr_required field - hence the "- 1" in the
 * size calculation). Deletes any v1 section so only one format exists
 * on disk. Returns 0 or -BCH_ERR_ENOSPC_sb_replicas.
 *
 * NOTE(review): bytes starts from sizeof(struct bch_sb_field_replicas),
 * not the _v0 struct - presumably the two field headers are the same
 * size; confirm against the on-disk format definitions.
 */
static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	/* zero the whole payload so trailing padding is deterministic */
	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}
781 
/*
 * Write the in-memory table to the superblock. If every entry has
 * nr_required == 1 the information is fully representable in the smaller
 * v0 format, so defer to the v0 writer; otherwise write v1 and delete any
 * v0 section. Returns 0 or -BCH_ERR_ENOSPC_sb_replicas.
 */
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	/* zero the whole payload so trailing padding is deterministic */
	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}
824 
/*
 * Validate a freshly converted in-memory table against @sb: each entry must
 * pass bch2_replicas_entry_validate(), and - after sorting linearly here -
 * no two adjacent entries may be identical. On failure a description is
 * appended to @err. Note this sorts with sort_cmp_size() (flat order), not
 * the eytzinger order used for lookups; the table is validation-only here.
 */
static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_validate(e, sb, err);
		if (ret)
			return ret;

		/* sorted, so duplicates must be adjacent: */
		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}
860 
/*
 * .validate hook for the v1 replicas superblock field: convert to an
 * in-memory table, run the common validation, free the temporary table.
 */
static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}
876 
877 static void bch2_sb_replicas_to_text(struct printbuf *out,
878 				     struct bch_sb *sb,
879 				     struct bch_sb_field *f)
880 {
881 	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
882 	struct bch_replicas_entry_v1 *e;
883 	bool first = true;
884 
885 	for_each_replicas_entry(r, e) {
886 		if (!first)
887 			prt_printf(out, " ");
888 		first = false;
889 
890 		bch2_replicas_entry_to_text(out, e);
891 	}
892 	prt_newline(out);
893 }
894 
/* Ops table for the v1 replicas superblock field. */
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};
899 
/*
 * .validate hook for the legacy v0 replicas superblock field: convert to an
 * in-memory (v1) table, run the common validation, free the temporary table.
 */
static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}
915 
/*
 * .to_text hook for the legacy v0 replicas superblock field: print all
 * entries, space separated, followed by a newline.
 */
static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}
933 
/* Ops table for the legacy v0 replicas superblock field. */
const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};
938 
939 /* Query replicas: */
940 
/*
 * Can the filesystem operate with only the devices in @devs online?
 *
 * For each replicas entry (except cached data, which never blocks
 * operation, and entries whose devices have all failed), work out what
 * degraded-mode flags would be needed: *_LOST if fewer than nr_required
 * copies are online, *_DEGRADED if any copy is offline. If the caller's
 * @flags don't grant what's needed, optionally print the offending entry
 * and return false.
 */
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		for (i = 0; i < e->nr_devs; i++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);

			nr_online += test_bit(e->devs[i], devs.d);
			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
		}

		/* data on all-failed devices is already gone; skip: */
		if (nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		/* needed a degraded-mode flag the caller didn't pass: */
		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}

	}
	percpu_up_read(&c->mark_lock);

	return ret;
}
993 
/*
 * Return a bitmask of data types (1 << BCH_DATA_*) that the superblock's
 * replicas section (v1 preferred, else v0) records as present on @dev.
 * Returns 0 if neither section exists or the device appears in no entry.
 */
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned i, data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	}


	return data_has;
}
1022 
1023 unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
1024 {
1025 	unsigned ret;
1026 
1027 	mutex_lock(&c->sb_lock);
1028 	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
1029 	mutex_unlock(&c->sb_lock);
1030 
1031 	return ret;
1032 }
1033 
/*
 * Free everything owned by the replicas subsystem: usage tables (scratch,
 * percpu journal-buf tables, base), both replicas tables, and the delta
 * list mempool. Called on filesystem teardown.
 */
void bch2_fs_replicas_exit(struct bch_fs *c)
{
	unsigned i;

	kfree(c->usage_scratch);
	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		free_percpu(c->usage[i]);
	kfree(c->usage_base);
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);

	mempool_exit(&c->replicas_delta_pool);
}
1047 
/*
 * Initialize the replicas subsystem: size the journal reservation for the
 * current replicas table, create the delta-list mempool, and allocate the
 * usage tables for the (possibly already loaded) table. Returns 0 or a
 * negative bch2 error.
 */
int bch2_fs_replicas_init(struct bch_fs *c)
{
	bch2_journal_entry_res_resize(&c->journal,
			&c->replicas_journal_res,
			reserve_journal_replicas(c, &c->replicas));

	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
					 REPLICAS_DELTA_LIST_MAX) ?:
		replicas_table_update(c, &c->replicas);
}
1058