xref: /linux/fs/bcachefs/alloc_foreground.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set_trans() allocates one or more buckets from different
11  * devices in a given filesystem.
12  */
13 
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33 
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37 
38 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39 					   struct mutex *lock)
40 {
41 	if (!mutex_trylock(lock)) {
42 		bch2_trans_unlock(trans);
43 		mutex_lock(lock);
44 	}
45 }
46 
47 const char * const bch2_watermarks[] = {
48 #define x(t) #t,
49 	BCH_WATERMARKS()
50 #undef x
51 	NULL
52 };
53 
54 /*
55  * Open buckets represent a bucket that's currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
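
/*
 * Illustrative sketch of that rule, using only functions defined or referenced
 * in this file (not called anywhere; error handling and the actual btree
 * update are elided):
 */
#if 0
static void example_single_bucket_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	/* NULL closure: return an error instead of waiting for free space */
	struct open_bucket *ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, NULL);

	if (IS_ERR(ob))
		return;

	/* bch2_ob_ptr() gives the position where the next write should land */
	struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);

	/* ... write to @ptr, then do the btree update that references it ... */

	/* only after the index update is it safe to drop our reference: */
	bch2_open_bucket_put(c, ob);
}
#endif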
69 
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72 	rcu_read_lock();
73 	for_each_member_device_rcu(c, ca, NULL)
74 		ca->alloc_cursor = 0;
75 	rcu_read_unlock();
76 }
77 
78 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
79 {
80 	open_bucket_idx_t idx = ob - c->open_buckets;
81 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
82 
83 	ob->hash = *slot;
84 	*slot = idx;
85 }
86 
87 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
88 {
89 	open_bucket_idx_t idx = ob - c->open_buckets;
90 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
91 
92 	while (*slot != idx) {
93 		BUG_ON(!*slot);
94 		slot = &c->open_buckets[*slot].hash;
95 	}
96 
97 	*slot = ob->hash;
98 	ob->hash = 0;
99 }
100 
101 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
102 {
103 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
104 
105 	if (ob->ec) {
106 		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
107 		return;
108 	}
109 
110 	percpu_down_read(&c->mark_lock);
111 	spin_lock(&ob->lock);
112 
113 	ob->valid = false;
114 	ob->data_type = 0;
115 
116 	spin_unlock(&ob->lock);
117 	percpu_up_read(&c->mark_lock);
118 
119 	spin_lock(&c->freelist_lock);
120 	bch2_open_bucket_hash_remove(c, ob);
121 
122 	ob->freelist = c->open_buckets_freelist;
123 	c->open_buckets_freelist = ob - c->open_buckets;
124 
125 	c->open_buckets_nr_free++;
126 	ca->nr_open_buckets--;
127 	spin_unlock(&c->freelist_lock);
128 
129 	closure_wake_up(&c->open_buckets_wait);
130 }
131 
132 void bch2_open_bucket_write_error(struct bch_fs *c,
133 				  struct open_buckets *obs,
134 				  unsigned dev)
135 {
136 	struct open_bucket *ob;
137 	unsigned i;
138 
139 	open_bucket_for_each(c, obs, ob, i)
140 		if (ob->dev == dev && ob->ec)
141 			bch2_ec_bucket_cancel(c, ob);
142 }
143 
144 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
145 {
146 	struct open_bucket *ob;
147 
148 	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
149 
150 	ob = c->open_buckets + c->open_buckets_freelist;
151 	c->open_buckets_freelist = ob->freelist;
152 	atomic_set(&ob->pin, 1);
153 	ob->data_type = 0;
154 
155 	c->open_buckets_nr_free--;
156 	return ob;
157 }
158 
159 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
160 {
161 	BUG_ON(c->open_buckets_partial_nr >=
162 	       ARRAY_SIZE(c->open_buckets_partial));
163 
164 	spin_lock(&c->freelist_lock);
165 	ob->on_partial_list = true;
166 	c->open_buckets_partial[c->open_buckets_partial_nr++] =
167 		ob - c->open_buckets;
168 	spin_unlock(&c->freelist_lock);
169 
170 	closure_wake_up(&c->open_buckets_wait);
171 	closure_wake_up(&c->freelist_wait);
172 }
173 
174 /* _only_ for allocating the journal on a new device: */
175 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
176 {
177 	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
178 		u64 b = ca->new_fs_bucket_idx++;
179 
180 		if (!is_superblock_bucket(ca, b) &&
181 		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
182 			return b;
183 	}
184 
185 	return -1;
186 }
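
/*
 * Sketch of how a format-time caller might use the above to pick journal
 * buckets; example_pick_journal_buckets(), @dst and @nr_wanted are
 * hypothetical stand-ins for whatever the real journal allocation code uses:
 */
#if 0
static int example_pick_journal_buckets(struct bch_dev *ca, u64 *dst,
					unsigned nr_wanted)
{
	for (unsigned i = 0; i < nr_wanted; i++) {
		long b = bch2_bucket_alloc_new_fs(ca);

		if (b < 0)
			return -BCH_ERR_ENOSPC_bucket_alloc;
		dst[i] = b;
	}
	return 0;
}
#endif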
187 
188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189 {
190 	switch (watermark) {
191 	case BCH_WATERMARK_reclaim:
192 		return 0;
193 	case BCH_WATERMARK_btree:
194 	case BCH_WATERMARK_btree_copygc:
195 		return OPEN_BUCKETS_COUNT / 4;
196 	case BCH_WATERMARK_copygc:
197 		return OPEN_BUCKETS_COUNT / 3;
198 	default:
199 		return OPEN_BUCKETS_COUNT / 2;
200 	}
201 }
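
/*
 * For example, if OPEN_BUCKETS_COUNT were 1024 (see alloc_types.h for the real
 * value), a normal-watermark allocation would fail with
 * -BCH_ERR_open_buckets_empty once 512 or fewer open buckets remain free,
 * copygc once 341 or fewer remain, and btree allocations at 256, while
 * BCH_WATERMARK_reclaim may drain the pool completely.
 */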
202 
203 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
204 					      u64 bucket,
205 					      enum bch_watermark watermark,
206 					      const struct bch_alloc_v4 *a,
207 					      struct bucket_alloc_state *s,
208 					      struct closure *cl)
209 {
210 	struct open_bucket *ob;
211 
212 	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
213 		s->skipped_nouse++;
214 		return NULL;
215 	}
216 
217 	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
218 		s->skipped_open++;
219 		return NULL;
220 	}
221 
222 	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
223 			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
224 		s->skipped_need_journal_commit++;
225 		return NULL;
226 	}
227 
228 	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
229 		s->skipped_nocow++;
230 		return NULL;
231 	}
232 
233 	spin_lock(&c->freelist_lock);
234 
235 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
236 		if (cl)
237 			closure_wait(&c->open_buckets_wait, cl);
238 
239 		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
240 		spin_unlock(&c->freelist_lock);
241 		return ERR_PTR(-BCH_ERR_open_buckets_empty);
242 	}
243 
244 	/* Recheck under lock: */
245 	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
246 		spin_unlock(&c->freelist_lock);
247 		s->skipped_open++;
248 		return NULL;
249 	}
250 
251 	ob = bch2_open_bucket_alloc(c);
252 
253 	spin_lock(&ob->lock);
254 
255 	ob->valid	= true;
256 	ob->sectors_free = ca->mi.bucket_size;
257 	ob->dev		= ca->dev_idx;
258 	ob->gen		= a->gen;
259 	ob->bucket	= bucket;
260 	spin_unlock(&ob->lock);
261 
262 	ca->nr_open_buckets++;
263 	bch2_open_bucket_hash_add(c, ob);
264 
265 	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
266 	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
267 
268 	spin_unlock(&c->freelist_lock);
269 	return ob;
270 }
271 
272 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
273 					    enum bch_watermark watermark, u64 free_entry,
274 					    struct bucket_alloc_state *s,
275 					    struct bkey_s_c freespace_k,
276 					    struct closure *cl)
277 {
278 	struct bch_fs *c = trans->c;
279 	struct btree_iter iter = { NULL };
280 	struct bkey_s_c k;
281 	struct open_bucket *ob;
282 	struct bch_alloc_v4 a_convert;
283 	const struct bch_alloc_v4 *a;
284 	u64 b = free_entry & ~(~0ULL << 56);
285 	unsigned genbits = free_entry >> 56;
286 	struct printbuf buf = PRINTBUF;
287 	int ret;
288 
289 	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
290 		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
291 		       "  freespace key ",
292 			ca->mi.first_bucket, ca->mi.nbuckets);
293 		bch2_bkey_val_to_text(&buf, c, freespace_k);
294 		bch2_trans_inconsistent(trans, "%s", buf.buf);
295 		ob = ERR_PTR(-EIO);
296 		goto err;
297 	}
298 
299 	k = bch2_bkey_get_iter(trans, &iter,
300 			       BTREE_ID_alloc, POS(ca->dev_idx, b),
301 			       BTREE_ITER_CACHED);
302 	ret = bkey_err(k);
303 	if (ret) {
304 		ob = ERR_PTR(ret);
305 		goto err;
306 	}
307 
308 	a = bch2_alloc_to_v4(k, &a_convert);
309 
310 	if (a->data_type != BCH_DATA_free) {
311 		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
312 			ob = NULL;
313 			goto err;
314 		}
315 
316 		prt_printf(&buf, "non free bucket in freespace btree\n"
317 		       "  freespace key ");
318 		bch2_bkey_val_to_text(&buf, c, freespace_k);
319 		prt_printf(&buf, "\n  ");
320 		bch2_bkey_val_to_text(&buf, c, k);
321 		bch2_trans_inconsistent(trans, "%s", buf.buf);
322 		ob = ERR_PTR(-EIO);
323 		goto err;
324 	}
325 
326 	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
327 	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
328 		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
329 		       "  freespace key ",
330 		       genbits, alloc_freespace_genbits(*a) >> 56);
331 		bch2_bkey_val_to_text(&buf, c, freespace_k);
332 		prt_printf(&buf, "\n  ");
333 		bch2_bkey_val_to_text(&buf, c, k);
334 		bch2_trans_inconsistent(trans, "%s", buf.buf);
335 		ob = ERR_PTR(-EIO);
336 		goto err;
337 	}
338 
339 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
340 		struct bch_backpointer bp;
341 		struct bpos bp_pos = POS_MIN;
342 
343 		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
344 						&bp_pos, &bp,
345 						BTREE_ITER_NOPRESERVE);
346 		if (ret) {
347 			ob = ERR_PTR(ret);
348 			goto err;
349 		}
350 
351 		if (!bkey_eq(bp_pos, POS_MAX)) {
352 			/*
353 			 * Bucket may have data in it - we don't call
354 			 * bch2_trans_inconsistent() because fsck hasn't
355 			 * finished yet
356 			 */
357 			ob = NULL;
358 			goto err;
359 		}
360 	}
361 
362 	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
363 	if (!ob)
364 		set_btree_iter_dontneed(&iter);
365 err:
366 	if (iter.path)
367 		set_btree_iter_dontneed(&iter);
368 	bch2_trans_iter_exit(trans, &iter);
369 	printbuf_exit(&buf);
370 	return ob;
371 }
372 
373 /*
374  * This path is used before the freespace btree is initialized:
375  *
376  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
377  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
378  */
379 static noinline struct open_bucket *
380 bch2_bucket_alloc_early(struct btree_trans *trans,
381 			struct bch_dev *ca,
382 			enum bch_watermark watermark,
383 			struct bucket_alloc_state *s,
384 			struct closure *cl)
385 {
386 	struct btree_iter iter, citer;
387 	struct bkey_s_c k, ck;
388 	struct open_bucket *ob = NULL;
389 	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
390 	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
391 	u64 alloc_cursor = alloc_start;
392 	int ret;
393 
394 	/*
395 	 * Scan with an uncached iterator to avoid polluting the key cache. An
396 	 * uncached iter will return a cached key if one exists, but if not
397 	 * there is no other underlying protection for the associated key cache
398 	 * slot. To avoid racing bucket allocations, look up the cached key slot
399 	 * of any likely allocation candidate before attempting to proceed with
400 	 * the allocation. This provides proper exclusion on the associated
401 	 * bucket.
402 	 */
403 again:
404 	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
405 			   BTREE_ITER_SLOTS, k, ret) {
406 		struct bch_alloc_v4 a_convert;
407 		const struct bch_alloc_v4 *a;
408 
409 		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
410 			break;
411 
412 		if (ca->new_fs_bucket_idx &&
413 		    is_superblock_bucket(ca, k.k->p.offset))
414 			continue;
415 
416 		a = bch2_alloc_to_v4(k, &a_convert);
417 		if (a->data_type != BCH_DATA_free)
418 			continue;
419 
420 		/* now check the cached key to serialize concurrent allocs of the bucket */
421 		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
422 		ret = bkey_err(ck);
423 		if (ret)
424 			break;
425 
426 		a = bch2_alloc_to_v4(ck, &a_convert);
427 		if (a->data_type != BCH_DATA_free)
428 			goto next;
429 
430 		s->buckets_seen++;
431 
432 		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
433 next:
434 		set_btree_iter_dontneed(&citer);
435 		bch2_trans_iter_exit(trans, &citer);
436 		if (ob)
437 			break;
438 	}
439 	bch2_trans_iter_exit(trans, &iter);
440 
441 	alloc_cursor = iter.pos.offset;
442 	ca->alloc_cursor = alloc_cursor;
443 
444 	if (!ob && ret)
445 		ob = ERR_PTR(ret);
446 
447 	if (!ob && alloc_start > first_bucket) {
448 		alloc_cursor = alloc_start = first_bucket;
449 		goto again;
450 	}
451 
452 	return ob;
453 }
454 
455 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
456 						   struct bch_dev *ca,
457 						   enum bch_watermark watermark,
458 						   struct bucket_alloc_state *s,
459 						   struct closure *cl)
460 {
461 	struct btree_iter iter;
462 	struct bkey_s_c k;
463 	struct open_bucket *ob = NULL;
464 	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
465 	u64 alloc_cursor = alloc_start;
466 	int ret;
467 
468 	BUG_ON(ca->new_fs_bucket_idx);
469 again:
470 	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
471 				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
472 		if (k.k->p.inode != ca->dev_idx)
473 			break;
474 
475 		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
476 		     alloc_cursor < k.k->p.offset;
477 		     alloc_cursor++) {
478 			ret = btree_trans_too_many_iters(trans);
479 			if (ret) {
480 				ob = ERR_PTR(ret);
481 				break;
482 			}
483 
484 			s->buckets_seen++;
485 
486 			ob = try_alloc_bucket(trans, ca, watermark,
487 					      alloc_cursor, s, k, cl);
488 			if (ob) {
489 				set_btree_iter_dontneed(&iter);
490 				break;
491 			}
492 		}
493 
494 		if (ob || ret)
495 			break;
496 	}
497 	bch2_trans_iter_exit(trans, &iter);
498 
499 	ca->alloc_cursor = alloc_cursor;
500 
501 	if (!ob && ret)
502 		ob = ERR_PTR(ret);
503 
504 	if (!ob && alloc_start > ca->mi.first_bucket) {
505 		alloc_cursor = alloc_start = ca->mi.first_bucket;
506 		goto again;
507 	}
508 
509 	return ob;
510 }
511 
512 /**
513  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
514  * @trans:	transaction object
515  * @ca:		device to allocate from
516  * @watermark:	how important is this allocation?
517  * @cl:		if not NULL, closure to be used to wait if buckets not available
518  * @usage:	filled in with the current device usage
519  *
520  * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
521  */
522 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
523 				      struct bch_dev *ca,
524 				      enum bch_watermark watermark,
525 				      struct closure *cl,
526 				      struct bch_dev_usage *usage)
527 {
528 	struct bch_fs *c = trans->c;
529 	struct open_bucket *ob = NULL;
530 	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
531 	u64 avail;
532 	struct bucket_alloc_state s = { 0 };
533 	bool waiting = false;
534 again:
535 	bch2_dev_usage_read_fast(ca, usage);
536 	avail = dev_buckets_free(ca, *usage, watermark);
537 
538 	if (usage->d[BCH_DATA_need_discard].buckets > avail)
539 		bch2_do_discards(c);
540 
541 	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
542 		bch2_do_gc_gens(c);
543 
544 	if (should_invalidate_buckets(ca, *usage))
545 		bch2_do_invalidates(c);
546 
547 	if (!avail) {
548 		if (cl && !waiting) {
549 			closure_wait(&c->freelist_wait, cl);
550 			waiting = true;
551 			goto again;
552 		}
553 
554 		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
555 
556 		ob = ERR_PTR(-BCH_ERR_freelist_empty);
557 		goto err;
558 	}
559 
560 	if (waiting)
561 		closure_wake_up(&c->freelist_wait);
562 alloc:
563 	ob = likely(freespace)
564 		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
565 		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
566 
567 	if (s.skipped_need_journal_commit * 2 > avail)
568 		bch2_journal_flush_async(&c->journal, NULL);
569 
570 	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
571 		freespace = false;
572 		goto alloc;
573 	}
574 err:
575 	if (!ob)
576 		ob = ERR_PTR(-BCH_ERR_no_buckets_found);
577 
578 	if (!IS_ERR(ob))
579 		trace_and_count(c, bucket_alloc, ca,
580 				bch2_watermarks[watermark],
581 				ob->bucket,
582 				usage->d[BCH_DATA_free].buckets,
583 				avail,
584 				bch2_copygc_wait_amount(c),
585 				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
586 				&s,
587 				cl == NULL,
588 				"");
589 	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
590 		trace_and_count(c, bucket_alloc_fail, ca,
591 				bch2_watermarks[watermark],
592 				0,
593 				usage->d[BCH_DATA_free].buckets,
594 				avail,
595 				bch2_copygc_wait_amount(c),
596 				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
597 				&s,
598 				cl == NULL,
599 				bch2_err_str(PTR_ERR(ob)));
600 
601 	return ob;
602 }
603 
604 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
605 				      enum bch_watermark watermark,
606 				      struct closure *cl)
607 {
608 	struct bch_dev_usage usage;
609 	struct open_bucket *ob;
610 
611 	bch2_trans_do(c, NULL, NULL, 0,
612 		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
613 							cl, &usage)));
614 	return ob;
615 }
616 
617 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
618 			    unsigned l, unsigned r)
619 {
620 	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
621 		(stripe->next_alloc[l] < stripe->next_alloc[r]));
622 }
623 
624 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
625 
626 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
627 					  struct dev_stripe_state *stripe,
628 					  struct bch_devs_mask *devs)
629 {
630 	struct dev_alloc_list ret = { .nr = 0 };
631 	unsigned i;
632 
633 	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
634 		ret.devs[ret.nr++] = i;
635 
636 	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
637 	return ret;
638 }
639 
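/*
 * Device striping: next_alloc[] is a running cost per device. Each successful
 * allocation adds an increment inversely proportional to the device's free
 * space (2^48 / free buckets), and bch2_dev_alloc_list() above sorts devices
 * by ascending cost - so, roughly, a device with twice the free space gets
 * picked twice as often. Subtracting a quarter of the chosen device's cost
 * from every counter keeps the values from growing without bound.
 */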
640 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
641 			       struct dev_stripe_state *stripe,
642 			       struct bch_dev_usage *usage)
643 {
644 	u64 *v = stripe->next_alloc + ca->dev_idx;
645 	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
646 	u64 free_space_inv = free_space
647 		? div64_u64(1ULL << 48, free_space)
648 		: 1ULL << 48;
649 	u64 scale = *v / 4;
650 
651 	if (*v + free_space_inv >= *v)
652 		*v += free_space_inv;
653 	else
654 		*v = U64_MAX;
655 
656 	for (v = stripe->next_alloc;
657 	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
658 		*v = *v < scale ? 0 : *v - scale;
659 }
660 
661 void bch2_dev_stripe_increment(struct bch_dev *ca,
662 			       struct dev_stripe_state *stripe)
663 {
664 	struct bch_dev_usage usage;
665 
666 	bch2_dev_usage_read_fast(ca, &usage);
667 	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
668 }
669 
670 static int add_new_bucket(struct bch_fs *c,
671 			   struct open_buckets *ptrs,
672 			   struct bch_devs_mask *devs_may_alloc,
673 			   unsigned nr_replicas,
674 			   unsigned *nr_effective,
675 			   bool *have_cache,
676 			   unsigned flags,
677 			   struct open_bucket *ob)
678 {
679 	unsigned durability =
680 		bch_dev_bkey_exists(c, ob->dev)->mi.durability;
681 
682 	BUG_ON(*nr_effective >= nr_replicas);
683 
684 	__clear_bit(ob->dev, devs_may_alloc->d);
685 	*nr_effective	+= durability;
686 	*have_cache	|= !durability;
687 
688 	ob_push(c, ptrs, ob);
689 
690 	if (*nr_effective >= nr_replicas)
691 		return 1;
692 	if (ob->ec)
693 		return 1;
694 	return 0;
695 }
696 
697 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
698 		      struct open_buckets *ptrs,
699 		      struct dev_stripe_state *stripe,
700 		      struct bch_devs_mask *devs_may_alloc,
701 		      unsigned nr_replicas,
702 		      unsigned *nr_effective,
703 		      bool *have_cache,
704 		      unsigned flags,
705 		      enum bch_data_type data_type,
706 		      enum bch_watermark watermark,
707 		      struct closure *cl)
708 {
709 	struct bch_fs *c = trans->c;
710 	struct dev_alloc_list devs_sorted =
711 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
712 	unsigned dev;
713 	struct bch_dev *ca;
714 	int ret = -BCH_ERR_insufficient_devices;
715 	unsigned i;
716 
717 	BUG_ON(*nr_effective >= nr_replicas);
718 
719 	for (i = 0; i < devs_sorted.nr; i++) {
720 		struct bch_dev_usage usage;
721 		struct open_bucket *ob;
722 
723 		dev = devs_sorted.devs[i];
724 
725 		rcu_read_lock();
726 		ca = rcu_dereference(c->devs[dev]);
727 		if (ca)
728 			percpu_ref_get(&ca->ref);
729 		rcu_read_unlock();
730 
731 		if (!ca)
732 			continue;
733 
734 		if (!ca->mi.durability && *have_cache) {
735 			percpu_ref_put(&ca->ref);
736 			continue;
737 		}
738 
739 		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
740 		if (!IS_ERR(ob))
741 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
742 		percpu_ref_put(&ca->ref);
743 
744 		if (IS_ERR(ob)) {
745 			ret = PTR_ERR(ob);
746 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
747 				break;
748 			continue;
749 		}
750 
751 		ob->data_type = data_type;
752 
753 		if (add_new_bucket(c, ptrs, devs_may_alloc,
754 				   nr_replicas, nr_effective,
755 				   have_cache, flags, ob)) {
756 			ret = 0;
757 			break;
758 		}
759 	}
760 
761 	return ret;
762 }
763 
764 /* Allocate from stripes: */
765 
766 /*
767  * if we can't allocate a new stripe because there are already too many
768  * partially filled stripes, force allocating from an existing stripe even when
769  * it's to a device we don't want:
770  */
771 
772 static int bucket_alloc_from_stripe(struct btree_trans *trans,
773 			 struct open_buckets *ptrs,
774 			 struct write_point *wp,
775 			 struct bch_devs_mask *devs_may_alloc,
776 			 u16 target,
777 			 unsigned nr_replicas,
778 			 unsigned *nr_effective,
779 			 bool *have_cache,
780 			 enum bch_watermark watermark,
781 			 unsigned flags,
782 			 struct closure *cl)
783 {
784 	struct bch_fs *c = trans->c;
785 	struct dev_alloc_list devs_sorted;
786 	struct ec_stripe_head *h;
787 	struct open_bucket *ob;
788 	unsigned i, ec_idx;
789 	int ret = 0;
790 
791 	if (nr_replicas < 2)
792 		return 0;
793 
794 	if (ec_open_bucket(c, ptrs))
795 		return 0;
796 
797 	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
798 	if (IS_ERR(h))
799 		return PTR_ERR(h);
800 	if (!h)
801 		return 0;
802 
803 	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
804 
805 	for (i = 0; i < devs_sorted.nr; i++)
806 		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
807 			if (!h->s->blocks[ec_idx])
808 				continue;
809 
810 			ob = c->open_buckets + h->s->blocks[ec_idx];
811 			if (ob->dev == devs_sorted.devs[i] &&
812 			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
813 				goto got_bucket;
814 		}
815 	goto out_put_head;
816 got_bucket:
817 	ob->ec_idx	= ec_idx;
818 	ob->ec		= h->s;
819 	ec_stripe_new_get(h->s, STRIPE_REF_io);
820 
821 	ret = add_new_bucket(c, ptrs, devs_may_alloc,
822 			     nr_replicas, nr_effective,
823 			     have_cache, flags, ob);
824 out_put_head:
825 	bch2_ec_stripe_head_put(c, h);
826 	return ret;
827 }
828 
829 /* Sector allocator */
830 
831 static bool want_bucket(struct bch_fs *c,
832 			struct write_point *wp,
833 			struct bch_devs_mask *devs_may_alloc,
834 			bool *have_cache, bool ec,
835 			struct open_bucket *ob)
836 {
837 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
838 
839 	if (!test_bit(ob->dev, devs_may_alloc->d))
840 		return false;
841 
842 	if (ob->data_type != wp->data_type)
843 		return false;
844 
845 	if (!ca->mi.durability &&
846 	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
847 		return false;
848 
849 	if (ec != (ob->ec != NULL))
850 		return false;
851 
852 	return true;
853 }
854 
855 static int bucket_alloc_set_writepoint(struct bch_fs *c,
856 				       struct open_buckets *ptrs,
857 				       struct write_point *wp,
858 				       struct bch_devs_mask *devs_may_alloc,
859 				       unsigned nr_replicas,
860 				       unsigned *nr_effective,
861 				       bool *have_cache,
862 				       bool ec, unsigned flags)
863 {
864 	struct open_buckets ptrs_skip = { .nr = 0 };
865 	struct open_bucket *ob;
866 	unsigned i;
867 	int ret = 0;
868 
869 	open_bucket_for_each(c, &wp->ptrs, ob, i) {
870 		if (!ret && want_bucket(c, wp, devs_may_alloc,
871 					have_cache, ec, ob))
872 			ret = add_new_bucket(c, ptrs, devs_may_alloc,
873 				       nr_replicas, nr_effective,
874 				       have_cache, flags, ob);
875 		else
876 			ob_push(c, &ptrs_skip, ob);
877 	}
878 	wp->ptrs = ptrs_skip;
879 
880 	return ret;
881 }
882 
883 static int bucket_alloc_set_partial(struct bch_fs *c,
884 				    struct open_buckets *ptrs,
885 				    struct write_point *wp,
886 				    struct bch_devs_mask *devs_may_alloc,
887 				    unsigned nr_replicas,
888 				    unsigned *nr_effective,
889 				    bool *have_cache, bool ec,
890 				    enum bch_watermark watermark,
891 				    unsigned flags)
892 {
893 	int i, ret = 0;
894 
895 	if (!c->open_buckets_partial_nr)
896 		return 0;
897 
898 	spin_lock(&c->freelist_lock);
899 
900 	if (!c->open_buckets_partial_nr)
901 		goto unlock;
902 
903 	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
904 		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
905 
906 		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
907 			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
908 			struct bch_dev_usage usage;
909 			u64 avail;
910 
911 			bch2_dev_usage_read_fast(ca, &usage);
912 			avail = dev_buckets_free(ca, usage, watermark);
913 			if (!avail)
914 				continue;
915 
916 			array_remove_item(c->open_buckets_partial,
917 					  c->open_buckets_partial_nr,
918 					  i);
919 			ob->on_partial_list = false;
920 
921 			ret = add_new_bucket(c, ptrs, devs_may_alloc,
922 					     nr_replicas, nr_effective,
923 					     have_cache, flags, ob);
924 			if (ret)
925 				break;
926 		}
927 	}
928 unlock:
929 	spin_unlock(&c->freelist_lock);
930 	return ret;
931 }
932 
933 static int __open_bucket_add_buckets(struct btree_trans *trans,
934 			struct open_buckets *ptrs,
935 			struct write_point *wp,
936 			struct bch_devs_list *devs_have,
937 			u16 target,
938 			bool erasure_code,
939 			unsigned nr_replicas,
940 			unsigned *nr_effective,
941 			bool *have_cache,
942 			enum bch_watermark watermark,
943 			unsigned flags,
944 			struct closure *_cl)
945 {
946 	struct bch_fs *c = trans->c;
947 	struct bch_devs_mask devs;
948 	struct open_bucket *ob;
949 	struct closure *cl = NULL;
950 	unsigned i;
951 	int ret;
952 
953 	devs = target_rw_devs(c, wp->data_type, target);
954 
955 	/* Don't allocate from devices we already have pointers to: */
956 	darray_for_each(*devs_have, i)
957 		__clear_bit(*i, devs.d);
958 
959 	open_bucket_for_each(c, ptrs, ob, i)
960 		__clear_bit(ob->dev, devs.d);
961 
962 	if (erasure_code && ec_open_bucket(c, ptrs))
963 		return 0;
964 
965 	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
966 				 nr_replicas, nr_effective,
967 				 have_cache, erasure_code, flags);
968 	if (ret)
969 		return ret;
970 
971 	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
972 				 nr_replicas, nr_effective,
973 				 have_cache, erasure_code, watermark, flags);
974 	if (ret)
975 		return ret;
976 
977 	if (erasure_code) {
978 		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
979 					 target,
980 					 nr_replicas, nr_effective,
981 					 have_cache,
982 					 watermark, flags, _cl);
983 	} else {
984 retry_blocking:
985 		/*
986 		 * Try nonblocking first, so that if one device is full we'll try from
987 		 * other devices:
988 		 */
989 		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
990 					nr_replicas, nr_effective, have_cache,
991 					flags, wp->data_type, watermark, cl);
992 		if (ret &&
993 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
994 		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
995 		    !cl && _cl) {
996 			cl = _cl;
997 			goto retry_blocking;
998 		}
999 	}
1000 
1001 	return ret;
1002 }
1003 
1004 static int open_bucket_add_buckets(struct btree_trans *trans,
1005 			struct open_buckets *ptrs,
1006 			struct write_point *wp,
1007 			struct bch_devs_list *devs_have,
1008 			u16 target,
1009 			unsigned erasure_code,
1010 			unsigned nr_replicas,
1011 			unsigned *nr_effective,
1012 			bool *have_cache,
1013 			enum bch_watermark watermark,
1014 			unsigned flags,
1015 			struct closure *cl)
1016 {
1017 	int ret;
1018 
1019 	if (erasure_code) {
1020 		ret = __open_bucket_add_buckets(trans, ptrs, wp,
1021 				devs_have, target, erasure_code,
1022 				nr_replicas, nr_effective, have_cache,
1023 				watermark, flags, cl);
1024 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1025 		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1026 		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1027 		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1028 			return ret;
1029 		if (*nr_effective >= nr_replicas)
1030 			return 0;
1031 	}
1032 
1033 	ret = __open_bucket_add_buckets(trans, ptrs, wp,
1034 			devs_have, target, false,
1035 			nr_replicas, nr_effective, have_cache,
1036 			watermark, flags, cl);
1037 	return ret < 0 ? ret : 0;
1038 }
1039 
1040 /**
1041  * should_drop_bucket - check if this open_bucket should go away
1042  * @ob:		open_bucket to predicate on
1043  * @c:		filesystem handle
1044  * @ca:		if set, we're killing buckets for a particular device
1045  * @ec:		if true, we're shutting down erasure coding and killing all ec
1046  *		open_buckets
1047  *		(if neither @ca nor @ec is set, every open_bucket is dropped)
1048  * Returns: true if we should kill this open_bucket
1049  *
1050  * We're killing open_buckets because we're shutting down a device, erasure
1051  * coding, or the entire filesystem - check if this open_bucket matches:
1052  */
1053 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1054 			       struct bch_dev *ca, bool ec)
1055 {
1056 	if (ec) {
1057 		return ob->ec != NULL;
1058 	} else if (ca) {
1059 		bool drop = ob->dev == ca->dev_idx;
1060 		struct open_bucket *ob2;
1061 		unsigned i;
1062 
1063 		if (!drop && ob->ec) {
1064 			unsigned nr_blocks;
1065 
1066 			mutex_lock(&ob->ec->lock);
1067 			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1068 
1069 			for (i = 0; i < nr_blocks; i++) {
1070 				if (!ob->ec->blocks[i])
1071 					continue;
1072 
1073 				ob2 = c->open_buckets + ob->ec->blocks[i];
1074 				drop |= ob2->dev == ca->dev_idx;
1075 			}
1076 			mutex_unlock(&ob->ec->lock);
1077 		}
1078 
1079 		return drop;
1080 	} else {
1081 		return true;
1082 	}
1083 }
1084 
1085 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1086 				 bool ec, struct write_point *wp)
1087 {
1088 	struct open_buckets ptrs = { .nr = 0 };
1089 	struct open_bucket *ob;
1090 	unsigned i;
1091 
1092 	mutex_lock(&wp->lock);
1093 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1094 		if (should_drop_bucket(ob, c, ca, ec))
1095 			bch2_open_bucket_put(c, ob);
1096 		else
1097 			ob_push(c, &ptrs, ob);
1098 	wp->ptrs = ptrs;
1099 	mutex_unlock(&wp->lock);
1100 }
1101 
1102 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1103 			    bool ec)
1104 {
1105 	unsigned i;
1106 
1107 	/* Next, close write points that point to this device... */
1108 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1109 		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1110 
1111 	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1112 	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1113 	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1114 
1115 	mutex_lock(&c->btree_reserve_cache_lock);
1116 	while (c->btree_reserve_cache_nr) {
1117 		struct btree_alloc *a =
1118 			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1119 
1120 		bch2_open_buckets_put(c, &a->ob);
1121 	}
1122 	mutex_unlock(&c->btree_reserve_cache_lock);
1123 
1124 	spin_lock(&c->freelist_lock);
1125 	i = 0;
1126 	while (i < c->open_buckets_partial_nr) {
1127 		struct open_bucket *ob =
1128 			c->open_buckets + c->open_buckets_partial[i];
1129 
1130 		if (should_drop_bucket(ob, c, ca, ec)) {
1131 			--c->open_buckets_partial_nr;
1132 			swap(c->open_buckets_partial[i],
1133 			     c->open_buckets_partial[c->open_buckets_partial_nr]);
1134 			ob->on_partial_list = false;
1135 			spin_unlock(&c->freelist_lock);
1136 			bch2_open_bucket_put(c, ob);
1137 			spin_lock(&c->freelist_lock);
1138 		} else {
1139 			i++;
1140 		}
1141 	}
1142 	spin_unlock(&c->freelist_lock);
1143 
1144 	bch2_ec_stop_dev(c, ca);
1145 }
1146 
1147 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1148 						 unsigned long write_point)
1149 {
1150 	unsigned hash =
1151 		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1152 
1153 	return &c->write_points_hash[hash];
1154 }
1155 
1156 static struct write_point *__writepoint_find(struct hlist_head *head,
1157 					     unsigned long write_point)
1158 {
1159 	struct write_point *wp;
1160 
1161 	rcu_read_lock();
1162 	hlist_for_each_entry_rcu(wp, head, node)
1163 		if (wp->write_point == write_point)
1164 			goto out;
1165 	wp = NULL;
1166 out:
1167 	rcu_read_unlock();
1168 	return wp;
1169 }
1170 
1171 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1172 {
1173 	u64 stranded	= c->write_points_nr * c->bucket_size_max;
1174 	u64 free	= bch2_fs_usage_read_short(c).free;
1175 
1176 	return stranded * factor > free;
1177 }
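
/*
 * Worked example: with 32 write points and a 1MB max bucket size, up to ~32MB
 * can sit stranded in partially filled buckets. try_increase_writepoints()
 * below only adds a write point while that stays under 1/32 of free space;
 * try_decrease_writepoints() starts reclaiming write points once it exceeds
 * 1/8.
 */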
1178 
1179 static bool try_increase_writepoints(struct bch_fs *c)
1180 {
1181 	struct write_point *wp;
1182 
1183 	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1184 	    too_many_writepoints(c, 32))
1185 		return false;
1186 
1187 	wp = c->write_points + c->write_points_nr++;
1188 	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1189 	return true;
1190 }
1191 
1192 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1193 {
1194 	struct bch_fs *c = trans->c;
1195 	struct write_point *wp;
1196 	struct open_bucket *ob;
1197 	unsigned i;
1198 
1199 	mutex_lock(&c->write_points_hash_lock);
1200 	if (c->write_points_nr < old_nr) {
1201 		mutex_unlock(&c->write_points_hash_lock);
1202 		return true;
1203 	}
1204 
1205 	if (c->write_points_nr == 1 ||
1206 	    !too_many_writepoints(c, 8)) {
1207 		mutex_unlock(&c->write_points_hash_lock);
1208 		return false;
1209 	}
1210 
1211 	wp = c->write_points + --c->write_points_nr;
1212 
1213 	hlist_del_rcu(&wp->node);
1214 	mutex_unlock(&c->write_points_hash_lock);
1215 
1216 	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1217 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1218 		open_bucket_free_unused(c, ob);
1219 	wp->ptrs.nr = 0;
1220 	mutex_unlock(&wp->lock);
1221 	return true;
1222 }
1223 
1224 static struct write_point *writepoint_find(struct btree_trans *trans,
1225 					   unsigned long write_point)
1226 {
1227 	struct bch_fs *c = trans->c;
1228 	struct write_point *wp, *oldest;
1229 	struct hlist_head *head;
1230 
1231 	if (!(write_point & 1UL)) {
1232 		wp = (struct write_point *) write_point;
1233 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1234 		return wp;
1235 	}
1236 
1237 	head = writepoint_hash(c, write_point);
1238 restart_find:
1239 	wp = __writepoint_find(head, write_point);
1240 	if (wp) {
1241 lock_wp:
1242 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1243 		if (wp->write_point == write_point)
1244 			goto out;
1245 		mutex_unlock(&wp->lock);
1246 		goto restart_find;
1247 	}
1248 restart_find_oldest:
1249 	oldest = NULL;
1250 	for (wp = c->write_points;
1251 	     wp < c->write_points + c->write_points_nr; wp++)
1252 		if (!oldest || time_before64(wp->last_used, oldest->last_used))
1253 			oldest = wp;
1254 
1255 	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1256 	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1257 	if (oldest >= c->write_points + c->write_points_nr ||
1258 	    try_increase_writepoints(c)) {
1259 		mutex_unlock(&c->write_points_hash_lock);
1260 		mutex_unlock(&oldest->lock);
1261 		goto restart_find_oldest;
1262 	}
1263 
1264 	wp = __writepoint_find(head, write_point);
1265 	if (wp && wp != oldest) {
1266 		mutex_unlock(&c->write_points_hash_lock);
1267 		mutex_unlock(&oldest->lock);
1268 		goto lock_wp;
1269 	}
1270 
1271 	wp = oldest;
1272 	hlist_del_rcu(&wp->node);
1273 	wp->write_point = write_point;
1274 	hlist_add_head_rcu(&wp->node, head);
1275 	mutex_unlock(&c->write_points_hash_lock);
1276 out:
1277 	wp->last_used = local_clock();
1278 	return wp;
1279 }
1280 
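/*
 * E.g. if the caller wanted nr_replicas = 2 but we allocated one bucket with
 * durability 2 and one with durability 1 (nr_effective = 3), the durability-1
 * bucket is handed back via @ptrs_no_use instead of being written to.
 */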
1281 static noinline void
1282 deallocate_extra_replicas(struct bch_fs *c,
1283 			  struct open_buckets *ptrs,
1284 			  struct open_buckets *ptrs_no_use,
1285 			  unsigned extra_replicas)
1286 {
1287 	struct open_buckets ptrs2 = { 0 };
1288 	struct open_bucket *ob;
1289 	unsigned i;
1290 
1291 	open_bucket_for_each(c, ptrs, ob, i) {
1292 		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1293 
1294 		if (d && d <= extra_replicas) {
1295 			extra_replicas -= d;
1296 			ob_push(c, ptrs_no_use, ob);
1297 		} else {
1298 			ob_push(c, &ptrs2, ob);
1299 		}
1300 	}
1301 
1302 	*ptrs = ptrs2;
1303 }
1304 
1305 /*
1306  * Get us a write point we can allocate from, returned with wp->lock held:
1307  */
1308 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1309 			     unsigned target,
1310 			     unsigned erasure_code,
1311 			     struct write_point_specifier write_point,
1312 			     struct bch_devs_list *devs_have,
1313 			     unsigned nr_replicas,
1314 			     unsigned nr_replicas_required,
1315 			     enum bch_watermark watermark,
1316 			     unsigned flags,
1317 			     struct closure *cl,
1318 			     struct write_point **wp_ret)
1319 {
1320 	struct bch_fs *c = trans->c;
1321 	struct write_point *wp;
1322 	struct open_bucket *ob;
1323 	struct open_buckets ptrs;
1324 	unsigned nr_effective, write_points_nr;
1325 	bool have_cache;
1326 	int ret;
1327 	int i;
1328 
1329 	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1330 		erasure_code = false;
1331 
1332 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1333 
1334 	BUG_ON(!nr_replicas || !nr_replicas_required);
1335 retry:
1336 	ptrs.nr		= 0;
1337 	nr_effective	= 0;
1338 	write_points_nr = c->write_points_nr;
1339 	have_cache	= false;
1340 
1341 	*wp_ret = wp = writepoint_find(trans, write_point.v);
1342 
1343 	/* metadata may not allocate on cache devices: */
1344 	if (wp->data_type != BCH_DATA_user)
1345 		have_cache = true;
1346 
1347 	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1348 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1349 					      target, erasure_code,
1350 					      nr_replicas, &nr_effective,
1351 					      &have_cache, watermark,
1352 					      flags, NULL);
1353 		if (!ret ||
1354 		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
1355 			goto alloc_done;
1356 
1357 		/* Don't retry from all devices if we're out of open buckets: */
1358 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1359 			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1360 					      target, erasure_code,
1361 					      nr_replicas, &nr_effective,
1362 					      &have_cache, watermark,
1363 					      flags, cl);
1364 			if (!ret2 ||
1365 			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
1366 			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
1367 				ret = ret2;
1368 				goto alloc_done;
1369 			}
1370 		}
1371 
1372 		/*
1373 		 * Only try to allocate cache (durability = 0 devices) from the
1374 		 * specified target:
1375 		 */
1376 		have_cache = true;
1377 
1378 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1379 					      0, erasure_code,
1380 					      nr_replicas, &nr_effective,
1381 					      &have_cache, watermark,
1382 					      flags, cl);
1383 	} else {
1384 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1385 					      target, erasure_code,
1386 					      nr_replicas, &nr_effective,
1387 					      &have_cache, watermark,
1388 					      flags, cl);
1389 	}
1390 alloc_done:
1391 	BUG_ON(!ret && nr_effective < nr_replicas);
1392 
1393 	if (erasure_code && !ec_open_bucket(c, &ptrs))
1394 		pr_debug("failed to get ec bucket: ret %u", ret);
1395 
1396 	if (ret == -BCH_ERR_insufficient_devices &&
1397 	    nr_effective >= nr_replicas_required)
1398 		ret = 0;
1399 
1400 	if (ret)
1401 		goto err;
1402 
1403 	if (nr_effective > nr_replicas)
1404 		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1405 
1406 	/* Free buckets we didn't use: */
1407 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1408 		open_bucket_free_unused(c, ob);
1409 
1410 	wp->ptrs = ptrs;
1411 
1412 	wp->sectors_free = UINT_MAX;
1413 
1414 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1415 		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1416 
1417 	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1418 
1419 	return 0;
1420 err:
1421 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1422 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1423 			ob_push(c, &ptrs, ob);
1424 		else
1425 			open_bucket_free_unused(c, ob);
1426 	wp->ptrs = ptrs;
1427 
1428 	mutex_unlock(&wp->lock);
1429 
1430 	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1431 	    try_decrease_writepoints(trans, write_points_nr))
1432 		goto retry;
1433 
1434 	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1435 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
1436 		return cl
1437 			? -BCH_ERR_bucket_alloc_blocked
1438 			: -BCH_ERR_ENOSPC_bucket_alloc;
1439 
1440 	return ret;
1441 }
1442 
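/*
 * Worked example: with a 1024 sector bucket and ob->sectors_free == 896, the
 * pointer returned below points 128 sectors into the bucket - just past the
 * space handed out by previous allocations from this open_bucket.
 */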
1443 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1444 {
1445 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1446 
1447 	return (struct bch_extent_ptr) {
1448 		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
1449 		.gen	= ob->gen,
1450 		.dev	= ob->dev,
1451 		.offset	= bucket_to_sector(ca, ob->bucket) +
1452 			ca->mi.bucket_size -
1453 			ob->sectors_free,
1454 	};
1455 }
1456 
1457 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1458 				    struct bkey_i *k, unsigned sectors,
1459 				    bool cached)
1460 {
1461 	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1462 }
1463 
1464 /*
1465  * bch2_alloc_sectors_append_ptrs() above appends pointers to the space we just
1466  * allocated to @k; bch2_alloc_sectors_done() releases @wp once that's done
1467  */
1468 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1469 {
1470 	bch2_alloc_sectors_done_inlined(c, wp);
1471 }
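
/*
 * Minimal sketch of the allocate/append/done sequence (not called anywhere;
 * real callers also take their own open_bucket refs before
 * bch2_alloc_sectors_done(), handle transaction restarts, and loop until all
 * the data is written). writepoint_hashed() is assumed from
 * alloc_foreground.h; the replica counts and watermark are arbitrary example
 * values:
 */
#if 0
static int example_alloc_some_sectors(struct btree_trans *trans,
				      struct bkey_i *k, unsigned sectors)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_list devs_have = { .nr = 0 };
	struct write_point *wp;
	int ret;

	ret = bch2_alloc_sectors_start_trans(trans,
				0,		/* target: no device restriction */
				false,		/* erasure_code */
				writepoint_hashed((unsigned long) current),
				&devs_have,
				2,		/* nr_replicas */
				1,		/* nr_replicas_required */
				BCH_WATERMARK_normal,
				0,		/* flags */
				NULL,		/* cl: fail instead of blocking */
				&wp);
	if (ret)
		return ret;

	/* wp is returned locked; we may only append up to wp->sectors_free: */
	if (sectors > wp->sectors_free)
		sectors = wp->sectors_free;

	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
	bch2_alloc_sectors_done(c, wp);		/* unlocks wp */
	return 0;
}
#endif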
1472 
1473 static inline void writepoint_init(struct write_point *wp,
1474 				   enum bch_data_type type)
1475 {
1476 	mutex_init(&wp->lock);
1477 	wp->data_type = type;
1478 
1479 	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1480 	INIT_LIST_HEAD(&wp->writes);
1481 	spin_lock_init(&wp->writes_lock);
1482 }
1483 
1484 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1485 {
1486 	struct open_bucket *ob;
1487 	struct write_point *wp;
1488 
1489 	mutex_init(&c->write_points_hash_lock);
1490 	c->write_points_nr = ARRAY_SIZE(c->write_points);
1491 
1492 	/* open bucket 0 is a sentinel NULL: */
1493 	spin_lock_init(&c->open_buckets[0].lock);
1494 
1495 	for (ob = c->open_buckets + 1;
1496 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1497 		spin_lock_init(&ob->lock);
1498 		c->open_buckets_nr_free++;
1499 
1500 		ob->freelist = c->open_buckets_freelist;
1501 		c->open_buckets_freelist = ob - c->open_buckets;
1502 	}
1503 
1504 	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
1505 	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
1506 	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);
1507 
1508 	for (wp = c->write_points;
1509 	     wp < c->write_points + c->write_points_nr; wp++) {
1510 		writepoint_init(wp, BCH_DATA_user);
1511 
1512 		wp->last_used	= local_clock();
1513 		wp->write_point	= (unsigned long) wp;
1514 		hlist_add_head_rcu(&wp->node,
1515 				   writepoint_hash(c, wp->write_point));
1516 	}
1517 }
1518 
1519 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1520 {
1521 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1522 	unsigned data_type = ob->data_type;
1523 	barrier(); /* READ_ONCE() doesn't work on bitfields */
1524 
1525 	prt_printf(out, "%zu ref %u ",
1526 		   ob - c->open_buckets,
1527 		   atomic_read(&ob->pin));
1528 	bch2_prt_data_type(out, data_type);
1529 	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1530 		   ob->dev, ob->bucket, ob->gen,
1531 		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1532 	if (ob->ec)
1533 		prt_printf(out, " ec idx %llu", ob->ec->idx);
1534 	if (ob->on_partial_list)
1535 		prt_str(out, " partial");
1536 	prt_newline(out);
1537 }
1538 
1539 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1540 {
1541 	struct open_bucket *ob;
1542 
1543 	out->atomic++;
1544 
1545 	for (ob = c->open_buckets;
1546 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1547 	     ob++) {
1548 		spin_lock(&ob->lock);
1549 		if (ob->valid && !ob->on_partial_list)
1550 			bch2_open_bucket_to_text(out, c, ob);
1551 		spin_unlock(&ob->lock);
1552 	}
1553 
1554 	--out->atomic;
1555 }
1556 
1557 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1558 {
1559 	unsigned i;
1560 
1561 	out->atomic++;
1562 	spin_lock(&c->freelist_lock);
1563 
1564 	for (i = 0; i < c->open_buckets_partial_nr; i++)
1565 		bch2_open_bucket_to_text(out, c,
1566 				c->open_buckets + c->open_buckets_partial[i]);
1567 
1568 	spin_unlock(&c->freelist_lock);
1569 	--out->atomic;
1570 }
1571 
1572 static const char * const bch2_write_point_states[] = {
1573 #define x(n)	#n,
1574 	WRITE_POINT_STATES()
1575 #undef x
1576 	NULL
1577 };
1578 
1579 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1580 				     struct write_point *wp)
1581 {
1582 	struct open_bucket *ob;
1583 	unsigned i;
1584 
1585 	prt_printf(out, "%lu: ", wp->write_point);
1586 	prt_human_readable_u64(out, wp->sectors_allocated);
1587 
1588 	prt_printf(out, " last wrote: ");
1589 	bch2_pr_time_units(out, sched_clock() - wp->last_used);
1590 
1591 	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1592 		prt_printf(out, " %s: ", bch2_write_point_states[i]);
1593 		bch2_pr_time_units(out, wp->time[i]);
1594 	}
1595 
1596 	prt_newline(out);
1597 
1598 	printbuf_indent_add(out, 2);
1599 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1600 		bch2_open_bucket_to_text(out, c, ob);
1601 	printbuf_indent_sub(out, 2);
1602 }
1603 
1604 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1605 {
1606 	struct write_point *wp;
1607 
1608 	prt_str(out, "Foreground write points\n");
1609 	for (wp = c->write_points;
1610 	     wp < c->write_points + ARRAY_SIZE(c->write_points);
1611 	     wp++)
1612 		bch2_write_point_to_text(out, c, wp);
1613 
1614 	prt_str(out, "Copygc write point\n");
1615 	bch2_write_point_to_text(out, c, &c->copygc_write_point);
1616 
1617 	prt_str(out, "Rebalance write point\n");
1618 	bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1619 
1620 	prt_str(out, "Btree write point\n");
1621 	bch2_write_point_to_text(out, c, &c->btree_write_point);
1622 }
1623