xref: /linux/fs/bcachefs/alloc_foreground.c (revision 4e73826089ce899357580bbf6e0afe4e6f9900b7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13 
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33 
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37 
/*
 * Take @lock from within a btree transaction.
 *
 * The uncontended case is handled with a trylock.  If that fails, the
 * transaction is unlocked with bch2_trans_unlock() before blocking in
 * mutex_lock(); nothing is relocked here.
 */
static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	/* Fast path: got the mutex without having to sleep */
	if (mutex_trylock(lock))
		return;

	bch2_trans_unlock(trans);
	mutex_lock(lock);
}
46 
/*
 * String names of the allocation watermarks, generated by stringifying each
 * entry of the BCH_WATERMARKS() x-macro.  The table is NULL terminated.
 */
const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};
53 
54 /*
55  * Open buckets represent a bucket that's currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
69 
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72 	rcu_read_lock();
73 	for_each_member_device_rcu(c, ca, NULL)
74 		ca->alloc_cursor = 0;
75 	rcu_read_unlock();
76 }
77 
78 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
79 {
80 	open_bucket_idx_t idx = ob - c->open_buckets;
81 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
82 
83 	ob->hash = *slot;
84 	*slot = idx;
85 }
86 
87 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
88 {
89 	open_bucket_idx_t idx = ob - c->open_buckets;
90 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
91 
92 	while (*slot != idx) {
93 		BUG_ON(!*slot);
94 		slot = &c->open_buckets[*slot].hash;
95 	}
96 
97 	*slot = ob->hash;
98 	ob->hash = 0;
99 }
100 
/*
 * Release an open_bucket.
 *
 * If the bucket is attached to an erasure coding stripe (ob->ec is set), only
 * the stripe's STRIPE_REF_io reference is dropped and the open_bucket itself is
 * left as is.  Otherwise the open_bucket is marked invalid, removed from the
 * open bucket hash table and pushed back onto c->open_buckets_freelist, and
 * waiters on c->open_buckets_wait are woken.
 */
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	/* Invalidate under ob->lock, with mark_lock held for read */
	percpu_down_read(&c->mark_lock);
	spin_lock(&ob->lock);

	ob->valid = false;
	ob->data_type = 0;

	spin_unlock(&ob->lock);
	percpu_up_read(&c->mark_lock);

	/* Hash table, freelist and both counters are updated under freelist_lock */
	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	/* Push onto the head of the open_bucket freelist */
	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	/* An open_bucket just became free: wake anyone waiting for one */
	closure_wake_up(&c->open_buckets_wait);
}
131 
132 void bch2_open_bucket_write_error(struct bch_fs *c,
133 				  struct open_buckets *obs,
134 				  unsigned dev)
135 {
136 	struct open_bucket *ob;
137 	unsigned i;
138 
139 	open_bucket_for_each(c, obs, ob, i)
140 		if (ob->dev == dev && ob->ec)
141 			bch2_ec_bucket_cancel(c, ob);
142 }
143 
144 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
145 {
146 	struct open_bucket *ob;
147 
148 	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
149 
150 	ob = c->open_buckets + c->open_buckets_freelist;
151 	c->open_buckets_freelist = ob->freelist;
152 	atomic_set(&ob->pin, 1);
153 	ob->data_type = 0;
154 
155 	c->open_buckets_nr_free--;
156 	return ob;
157 }
158 
159 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
160 {
161 	BUG_ON(c->open_buckets_partial_nr >=
162 	       ARRAY_SIZE(c->open_buckets_partial));
163 
164 	spin_lock(&c->freelist_lock);
165 	ob->on_partial_list = true;
166 	c->open_buckets_partial[c->open_buckets_partial_nr++] =
167 		ob - c->open_buckets;
168 	spin_unlock(&c->freelist_lock);
169 
170 	closure_wake_up(&c->open_buckets_wait);
171 	closure_wake_up(&c->freelist_wait);
172 }
173 
174 /* _only_ for allocating the journal on a new device: */
175 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
176 {
177 	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
178 		u64 b = ca->new_fs_bucket_idx++;
179 
180 		if (!is_superblock_bucket(ca, b) &&
181 		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
182 			return b;
183 	}
184 
185 	return -1;
186 }
187 
188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189 {
190 	switch (watermark) {
191 	case BCH_WATERMARK_reclaim:
192 		return 0;
193 	case BCH_WATERMARK_btree:
194 	case BCH_WATERMARK_btree_copygc:
195 		return OPEN_BUCKETS_COUNT / 4;
196 	case BCH_WATERMARK_copygc:
197 		return OPEN_BUCKETS_COUNT / 3;
198 	default:
199 		return OPEN_BUCKETS_COUNT / 2;
200 	}
201 }
202 
/*
 * Try to turn @bucket on device @ca into an open_bucket.
 *
 * Returns the new open_bucket on success, NULL if the bucket was skipped (the
 * matching skipped_* counter in @s is bumped), or
 * ERR_PTR(-BCH_ERR_open_buckets_empty) when the number of free open_buckets is
 * at or below the reserve for @watermark.  In the latter case @cl, if not NULL,
 * is added to c->open_buckets_wait.
 *
 * @a supplies the generation number recorded in the open_bucket.
 */
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket,
					      enum bch_watermark watermark,
					      const struct bch_alloc_v4 *a,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	struct open_bucket *ob;

	/* The checks below are done without freelist_lock held */
	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		s->skipped_open++;
		return NULL;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
		s->skipped_need_journal_commit++;
		return NULL;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
		s->skipped_nocow++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	/* Not enough free open_buckets left for this watermark: fail, optionally waiting */
	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
				   &c->blocked_allocate_open_bucket, true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	ob = bch2_open_bucket_alloc(c);

	/* Initialize the new open_bucket under its own lock */
	spin_lock(&ob->lock);

	ob->valid	= true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev		= ca->dev_idx;
	ob->gen		= a->gen;
	ob->bucket	= bucket;
	spin_unlock(&ob->lock);

	/* Still under freelist_lock: account for it and make it findable via the hash */
	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	/* The allocation succeeded, so neither "blocked" state applies any more */
	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
			   &c->blocked_allocate_open_bucket, false);

	track_event_change(&c->times[BCH_TIME_blocked_allocate],
			   &c->blocked_allocate, false);

	spin_unlock(&c->freelist_lock);
	return ob;
}
275 
276 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
277 					    enum bch_watermark watermark, u64 free_entry,
278 					    struct bucket_alloc_state *s,
279 					    struct bkey_s_c freespace_k,
280 					    struct closure *cl)
281 {
282 	struct bch_fs *c = trans->c;
283 	struct btree_iter iter = { NULL };
284 	struct bkey_s_c k;
285 	struct open_bucket *ob;
286 	struct bch_alloc_v4 a_convert;
287 	const struct bch_alloc_v4 *a;
288 	u64 b = free_entry & ~(~0ULL << 56);
289 	unsigned genbits = free_entry >> 56;
290 	struct printbuf buf = PRINTBUF;
291 	int ret;
292 
293 	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
294 		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
295 		       "  freespace key ",
296 			ca->mi.first_bucket, ca->mi.nbuckets);
297 		bch2_bkey_val_to_text(&buf, c, freespace_k);
298 		bch2_trans_inconsistent(trans, "%s", buf.buf);
299 		ob = ERR_PTR(-EIO);
300 		goto err;
301 	}
302 
303 	k = bch2_bkey_get_iter(trans, &iter,
304 			       BTREE_ID_alloc, POS(ca->dev_idx, b),
305 			       BTREE_ITER_CACHED);
306 	ret = bkey_err(k);
307 	if (ret) {
308 		ob = ERR_PTR(ret);
309 		goto err;
310 	}
311 
312 	a = bch2_alloc_to_v4(k, &a_convert);
313 
314 	if (a->data_type != BCH_DATA_free) {
315 		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
316 			ob = NULL;
317 			goto err;
318 		}
319 
320 		prt_printf(&buf, "non free bucket in freespace btree\n"
321 		       "  freespace key ");
322 		bch2_bkey_val_to_text(&buf, c, freespace_k);
323 		prt_printf(&buf, "\n  ");
324 		bch2_bkey_val_to_text(&buf, c, k);
325 		bch2_trans_inconsistent(trans, "%s", buf.buf);
326 		ob = ERR_PTR(-EIO);
327 		goto err;
328 	}
329 
330 	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
331 	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
332 		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
333 		       "  freespace key ",
334 		       genbits, alloc_freespace_genbits(*a) >> 56);
335 		bch2_bkey_val_to_text(&buf, c, freespace_k);
336 		prt_printf(&buf, "\n  ");
337 		bch2_bkey_val_to_text(&buf, c, k);
338 		bch2_trans_inconsistent(trans, "%s", buf.buf);
339 		ob = ERR_PTR(-EIO);
340 		goto err;
341 	}
342 
343 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
344 		struct bch_backpointer bp;
345 		struct bpos bp_pos = POS_MIN;
346 
347 		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
348 						&bp_pos, &bp,
349 						BTREE_ITER_NOPRESERVE);
350 		if (ret) {
351 			ob = ERR_PTR(ret);
352 			goto err;
353 		}
354 
355 		if (!bkey_eq(bp_pos, POS_MAX)) {
356 			/*
357 			 * Bucket may have data in it - we don't call
358 			 * bc2h_trans_inconnsistent() because fsck hasn't
359 			 * finished yet
360 			 */
361 			ob = NULL;
362 			goto err;
363 		}
364 	}
365 
366 	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
367 	if (!ob)
368 		set_btree_iter_dontneed(&iter);
369 err:
370 	if (iter.path)
371 		set_btree_iter_dontneed(&iter);
372 	bch2_trans_iter_exit(trans, &iter);
373 	printbuf_exit(&buf);
374 	return ob;
375 }
376 
/*
 * This path is for before the freespace btree is initialized:
 *
 * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
 * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
 *
 * The alloc btree is scanned directly, starting at the device's alloc_cursor
 * (but never below first_bucket); if nothing is found and the scan did not
 * start at first_bucket, it is repeated once from first_bucket.
 *
 * Returns an open_bucket, NULL if no bucket could be allocated, or an
 * ERR_PTR().
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bch_dev *ca,
			enum bch_watermark watermark,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
			   BTREE_ITER_SLOTS, k, ret) {
		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a;

		/* Ran off the end of this device */
		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		/* Superblock buckets may not be marked yet, so filter them out explicitly */
		if (ca->new_fs_bucket_idx &&
		    is_superblock_bucket(ca, k.k->p.offset))
			continue;

		a = bch2_alloc_to_v4(k, &a_convert);
		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
		ret = bkey_err(ck);
		if (ret)
			break;

		/* Re-evaluate using the cached key; a_convert is reused as scratch space */
		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		s->buckets_seen++;

		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
		set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
		/* Stop on success and on error alike: ob is non-NULL in both cases */
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	/* Remember where the scan stopped so the next allocation resumes from there */
	alloc_cursor = iter.pos.offset;
	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	/* Nothing found and we didn't start at the beginning: wrap around once */
	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	return ob;
}
458 
/*
 * Allocate a bucket on @ca by walking the freespace btree, starting at the
 * device's alloc_cursor (but never below ca->mi.first_bucket).  If nothing is
 * found and the walk did not start at ca->mi.first_bucket, it is repeated once
 * from there.
 *
 * Must not be called while ca->new_fs_bucket_idx is nonzero.
 *
 * Returns an open_bucket, NULL if no bucket could be allocated, or an
 * ERR_PTR().
 */
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						   struct bch_dev *ca,
						   enum bch_watermark watermark,
						   struct bucket_alloc_state *s,
						   struct closure *cl)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	BUG_ON(ca->new_fs_bucket_idx);
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
		/* Freespace keys are indexed by device; stop once we leave ours */
		if (k.k->p.inode != ca->dev_idx)
			break;

		/* Try every entry covered by this key, from the cursor onwards */
		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
		     alloc_cursor < k.k->p.offset;
		     alloc_cursor++) {
			ret = btree_trans_too_many_iters(trans);
			if (ret) {
				ob = ERR_PTR(ret);
				break;
			}

			s->buckets_seen++;

			ob = try_alloc_bucket(trans, ca, watermark,
					      alloc_cursor, s, k, cl);
			/* Non-NULL covers both success and ERR_PTR: either way we're done */
			if (ob) {
				set_btree_iter_dontneed(&iter);
				break;
			}
		}

		if (ob || ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	/* Remember where we stopped so the next allocation resumes from there */
	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	/* Nothing found and we didn't start at the beginning: wrap around once */
	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}
515 
/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans:	transaction object
 * @ca:		device to allocate from
 * @watermark:	how important is this allocation?
 * @cl:		if not NULL, closure to be used to wait if buckets not available
 * @usage:	for secondarily also returning the current device usage
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure:
 *		-BCH_ERR_freelist_empty if the device has no buckets available
 *		at @watermark, -BCH_ERR_no_buckets_found if the scan came up
 *		empty, or the error propagated from the underlying allocation
 *		path.  NULL is never returned.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
				      struct bch_dev *ca,
				      enum bch_watermark watermark,
				      struct closure *cl,
				      struct bch_dev_usage *usage)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	struct bucket_alloc_state s = { 0 };
	bool waiting = false;
again:
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	/* Kick background work that can turn other buckets into free ones */
	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_do_discards(c);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_do_gc_gens(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_do_invalidates(c);

	if (!avail) {
		/*
		 * Put @cl on the waitlist first, then re-read the usage once
		 * more before giving up:
		 */
		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate],
				   &c->blocked_allocate, true);

		ob = ERR_PTR(-BCH_ERR_freelist_empty);
		goto err;
	}

	/* Buckets are available after all; wake the waitlist we put @cl on above */
	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

	/* Many candidates were skipped pending a journal commit: ask for a flush */
	if (s.skipped_need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	/*
	 * Up to and including the check_alloc_info recovery pass, fall back to
	 * the scan of the alloc btree if the freespace btree yielded nothing:
	 */
	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	/* Callers never see NULL: convert it to an error */
	if (!ob)
		ob = ERR_PTR(-BCH_ERR_no_buckets_found);

	/* Trace success, and every failure except transaction restarts */
	if (!IS_ERR(ob))
		trace_and_count(c, bucket_alloc, ca,
				bch2_watermarks[watermark],
				ob->bucket,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				"");
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		trace_and_count(c, bucket_alloc_fail, ca,
				bch2_watermarks[watermark],
				0,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				bch2_err_str(PTR_ERR(ob)));

	return ob;
}
608 
/*
 * Allocate a single bucket from @ca for callers that do not have a btree
 * transaction of their own: bch2_bucket_alloc_trans() is run through
 * bch2_trans_do() and its result is handed back as is.  The device usage
 * filled in along the way is not used.
 *
 * Returns an open_bucket on success, or an ERR_PTR() on failure.
 */
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      struct closure *cl)
{
	struct bch_dev_usage usage;
	struct open_bucket *ob;

	/* `trans` below is supplied by the bch2_trans_do() macro */
	bch2_trans_do(c, NULL, NULL, 0,
		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
							cl, &usage)));
	return ob;
}
621 
622 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
623 			    unsigned l, unsigned r)
624 {
625 	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
626 		(stripe->next_alloc[l] < stripe->next_alloc[r]));
627 }
628 
629 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
630 
631 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
632 					  struct dev_stripe_state *stripe,
633 					  struct bch_devs_mask *devs)
634 {
635 	struct dev_alloc_list ret = { .nr = 0 };
636 	unsigned i;
637 
638 	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
639 		ret.devs[ret.nr++] = i;
640 
641 	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
642 	return ret;
643 }
644 
645 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
646 			       struct dev_stripe_state *stripe,
647 			       struct bch_dev_usage *usage)
648 {
649 	u64 *v = stripe->next_alloc + ca->dev_idx;
650 	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
651 	u64 free_space_inv = free_space
652 		? div64_u64(1ULL << 48, free_space)
653 		: 1ULL << 48;
654 	u64 scale = *v / 4;
655 
656 	if (*v + free_space_inv >= *v)
657 		*v += free_space_inv;
658 	else
659 		*v = U64_MAX;
660 
661 	for (v = stripe->next_alloc;
662 	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
663 		*v = *v < scale ? 0 : *v - scale;
664 }
665 
666 void bch2_dev_stripe_increment(struct bch_dev *ca,
667 			       struct dev_stripe_state *stripe)
668 {
669 	struct bch_dev_usage usage;
670 
671 	bch2_dev_usage_read_fast(ca, &usage);
672 	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
673 }
674 
675 static int add_new_bucket(struct bch_fs *c,
676 			   struct open_buckets *ptrs,
677 			   struct bch_devs_mask *devs_may_alloc,
678 			   unsigned nr_replicas,
679 			   unsigned *nr_effective,
680 			   bool *have_cache,
681 			   unsigned flags,
682 			   struct open_bucket *ob)
683 {
684 	unsigned durability =
685 		bch_dev_bkey_exists(c, ob->dev)->mi.durability;
686 
687 	BUG_ON(*nr_effective >= nr_replicas);
688 
689 	__clear_bit(ob->dev, devs_may_alloc->d);
690 	*nr_effective	+= durability;
691 	*have_cache	|= !durability;
692 
693 	ob_push(c, ptrs, ob);
694 
695 	if (*nr_effective >= nr_replicas)
696 		return 1;
697 	if (ob->ec)
698 		return 1;
699 	return 0;
700 }
701 
702 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
703 		      struct open_buckets *ptrs,
704 		      struct dev_stripe_state *stripe,
705 		      struct bch_devs_mask *devs_may_alloc,
706 		      unsigned nr_replicas,
707 		      unsigned *nr_effective,
708 		      bool *have_cache,
709 		      unsigned flags,
710 		      enum bch_data_type data_type,
711 		      enum bch_watermark watermark,
712 		      struct closure *cl)
713 {
714 	struct bch_fs *c = trans->c;
715 	struct dev_alloc_list devs_sorted =
716 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
717 	unsigned dev;
718 	struct bch_dev *ca;
719 	int ret = -BCH_ERR_insufficient_devices;
720 	unsigned i;
721 
722 	BUG_ON(*nr_effective >= nr_replicas);
723 
724 	for (i = 0; i < devs_sorted.nr; i++) {
725 		struct bch_dev_usage usage;
726 		struct open_bucket *ob;
727 
728 		dev = devs_sorted.devs[i];
729 
730 		rcu_read_lock();
731 		ca = rcu_dereference(c->devs[dev]);
732 		if (ca)
733 			percpu_ref_get(&ca->ref);
734 		rcu_read_unlock();
735 
736 		if (!ca)
737 			continue;
738 
739 		if (!ca->mi.durability && *have_cache) {
740 			percpu_ref_put(&ca->ref);
741 			continue;
742 		}
743 
744 		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
745 		if (!IS_ERR(ob))
746 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
747 		percpu_ref_put(&ca->ref);
748 
749 		if (IS_ERR(ob)) {
750 			ret = PTR_ERR(ob);
751 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
752 				break;
753 			continue;
754 		}
755 
756 		ob->data_type = data_type;
757 
758 		if (add_new_bucket(c, ptrs, devs_may_alloc,
759 				   nr_replicas, nr_effective,
760 				   have_cache, flags, ob)) {
761 			ret = 0;
762 			break;
763 		}
764 	}
765 
766 	return ret;
767 }
768 
769 /* Allocate from stripes: */
770 
771 /*
772  * if we can't allocate a new stripe because there are already too many
773  * partially filled stripes, force allocating from an existing stripe even when
774  * it's to a device we don't want:
775  */
776 
/*
 * Try to add one data bucket of an erasure coding stripe to @ptrs.
 *
 * Nothing is done (0 is returned) when fewer than two replicas are wanted,
 * when ec_open_bucket() reports true for @ptrs, or when
 * bch2_ec_stripe_head_get() returns no stripe head.  Otherwise devices are
 * considered in the order produced by bch2_dev_alloc_list(), and the first
 * data block of the stripe that lives on one of them and whose bit in
 * blocks_allocated was not yet set is claimed.
 *
 * Returns the result of add_new_bucket() if a bucket was added, 0 if none was,
 * or the error from bch2_ec_stripe_head_get().
 */
static int bucket_alloc_from_stripe(struct btree_trans *trans,
			 struct open_buckets *ptrs,
			 struct write_point *wp,
			 struct bch_devs_mask *devs_may_alloc,
			 u16 target,
			 unsigned nr_replicas,
			 unsigned *nr_effective,
			 bool *have_cache,
			 enum bch_watermark watermark,
			 unsigned flags,
			 struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct dev_alloc_list devs_sorted;
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i, ec_idx;
	int ret = 0;

	if (nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, ptrs))
		return 0;

	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);

	/* Devices in sorted order on the outside, the stripe's data blocks on the inside */
	for (i = 0; i < devs_sorted.nr; i++)
		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			/* No open_bucket recorded for this block */
			if (!h->s->blocks[ec_idx])
				continue;

			ob = c->open_buckets + h->s->blocks[ec_idx];
			/* test_and_set_bit() claims the block atomically */
			if (ob->dev == devs_sorted.devs[i] &&
			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
				goto got_bucket;
		}
	goto out_put_head;
got_bucket:
	ob->ec_idx	= ec_idx;
	ob->ec		= h->s;
	/* Matches the STRIPE_REF_io put done in __bch2_open_bucket_put() */
	ec_stripe_new_get(h->s, STRIPE_REF_io);

	ret = add_new_bucket(c, ptrs, devs_may_alloc,
			     nr_replicas, nr_effective,
			     have_cache, flags, ob);
out_put_head:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}
833 
834 /* Sector allocator */
835 
836 static bool want_bucket(struct bch_fs *c,
837 			struct write_point *wp,
838 			struct bch_devs_mask *devs_may_alloc,
839 			bool *have_cache, bool ec,
840 			struct open_bucket *ob)
841 {
842 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
843 
844 	if (!test_bit(ob->dev, devs_may_alloc->d))
845 		return false;
846 
847 	if (ob->data_type != wp->data_type)
848 		return false;
849 
850 	if (!ca->mi.durability &&
851 	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
852 		return false;
853 
854 	if (ec != (ob->ec != NULL))
855 		return false;
856 
857 	return true;
858 }
859 
860 static int bucket_alloc_set_writepoint(struct bch_fs *c,
861 				       struct open_buckets *ptrs,
862 				       struct write_point *wp,
863 				       struct bch_devs_mask *devs_may_alloc,
864 				       unsigned nr_replicas,
865 				       unsigned *nr_effective,
866 				       bool *have_cache,
867 				       bool ec, unsigned flags)
868 {
869 	struct open_buckets ptrs_skip = { .nr = 0 };
870 	struct open_bucket *ob;
871 	unsigned i;
872 	int ret = 0;
873 
874 	open_bucket_for_each(c, &wp->ptrs, ob, i) {
875 		if (!ret && want_bucket(c, wp, devs_may_alloc,
876 					have_cache, ec, ob))
877 			ret = add_new_bucket(c, ptrs, devs_may_alloc,
878 				       nr_replicas, nr_effective,
879 				       have_cache, flags, ob);
880 		else
881 			ob_push(c, &ptrs_skip, ob);
882 	}
883 	wp->ptrs = ptrs_skip;
884 
885 	return ret;
886 }
887 
/*
 * Move suitable open_buckets from the filesystem wide partial list
 * (c->open_buckets_partial) into @ptrs.
 *
 * A bucket is taken if want_bucket() accepts it and dev_buckets_free() for its
 * device at @watermark is nonzero.
 *
 * Returns the last value returned by add_new_bucket(), or 0 if nothing was
 * added.
 */
static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache, bool ec,
				    enum bch_watermark watermark,
				    unsigned flags)
{
	int i, ret = 0;

	/* Unlocked peek, so an empty list does not cost us the spinlock */
	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	/* Recheck now that we hold the lock */
	if (!c->open_buckets_partial_nr)
		goto unlock;

	/* Walk backwards so that removing entry i does not disturb the ones still to visit */
	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
			struct bch_dev_usage usage;
			u64 avail;

			bch2_dev_usage_read_fast(ca, &usage);
			avail = dev_buckets_free(ca, usage, watermark);
			/* Device has nothing free at this watermark: leave the bucket on the list */
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, flags, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}
937 
/*
 * Add open_buckets to @ptrs, trying the sources in this order:
 *
 *  1. buckets the write point already holds
 *  2. the filesystem wide partial list
 *  3. either an erasure coding stripe (if @erasure_code) or freshly allocated
 *     buckets
 *
 * Devices listed in @devs_have, and devices already represented in @ptrs, are
 * excluded up front.
 *
 * Returns nonzero as soon as one of the steps does: a positive value from
 * add_new_bucket() means "done", a negative value is an error.
 */
static int __open_bucket_add_buckets(struct btree_trans *trans,
			struct open_buckets *ptrs,
			struct write_point *wp,
			struct bch_devs_list *devs_have,
			u16 target,
			bool erasure_code,
			unsigned nr_replicas,
			unsigned *nr_effective,
			bool *have_cache,
			enum bch_watermark watermark,
			unsigned flags,
			struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs;
	struct open_bucket *ob;
	/* Starts out NULL so the first allocation attempt below is nonblocking */
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	devs = target_rw_devs(c, wp->data_type, target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*devs_have, i)
		__clear_bit(*i, devs.d);

	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);

	if (erasure_code && ec_open_bucket(c, ptrs))
		return 0;

	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
				 nr_replicas, nr_effective,
				 have_cache, erasure_code, flags);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
				 nr_replicas, nr_effective,
				 have_cache, erasure_code, watermark, flags);
	if (ret)
		return ret;

	if (erasure_code) {
		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
					 target,
					 nr_replicas, nr_effective,
					 have_cache,
					 watermark, flags, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
					nr_replicas, nr_effective, have_cache,
					flags, wp->data_type, watermark, cl);
		/*
		 * Failed for a reason other than a transaction restart or
		 * running out of devices, and the caller gave us a closure we
		 * haven't used yet: retry once, this time with the closure.
		 */
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}
1008 
1009 static int open_bucket_add_buckets(struct btree_trans *trans,
1010 			struct open_buckets *ptrs,
1011 			struct write_point *wp,
1012 			struct bch_devs_list *devs_have,
1013 			u16 target,
1014 			unsigned erasure_code,
1015 			unsigned nr_replicas,
1016 			unsigned *nr_effective,
1017 			bool *have_cache,
1018 			enum bch_watermark watermark,
1019 			unsigned flags,
1020 			struct closure *cl)
1021 {
1022 	int ret;
1023 
1024 	if (erasure_code) {
1025 		ret = __open_bucket_add_buckets(trans, ptrs, wp,
1026 				devs_have, target, erasure_code,
1027 				nr_replicas, nr_effective, have_cache,
1028 				watermark, flags, cl);
1029 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1030 		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1031 		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1032 		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1033 			return ret;
1034 		if (*nr_effective >= nr_replicas)
1035 			return 0;
1036 	}
1037 
1038 	ret = __open_bucket_add_buckets(trans, ptrs, wp,
1039 			devs_have, target, false,
1040 			nr_replicas, nr_effective, have_cache,
1041 			watermark, flags, cl);
1042 	return ret < 0 ? ret : 0;
1043 }
1044 
1045 /**
1046  * should_drop_bucket - check if this is open_bucket should go away
1047  * @ob:		open_bucket to predicate on
1048  * @c:		filesystem handle
1049  * @ca:		if set, we're killing buckets for a particular device
1050  * @ec:		if true, we're shutting down erasure coding and killing all ec
1051  *		open_buckets
1052  *		otherwise, return true
1053  * Returns: true if we should kill this open_bucket
1054  *
1055  * We're killing open_buckets because we're shutting down a device, erasure
1056  * coding, or the entire filesystem - check if this open_bucket matches:
1057  */
1058 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1059 			       struct bch_dev *ca, bool ec)
1060 {
1061 	if (ec) {
1062 		return ob->ec != NULL;
1063 	} else if (ca) {
1064 		bool drop = ob->dev == ca->dev_idx;
1065 		struct open_bucket *ob2;
1066 		unsigned i;
1067 
1068 		if (!drop && ob->ec) {
1069 			unsigned nr_blocks;
1070 
1071 			mutex_lock(&ob->ec->lock);
1072 			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1073 
1074 			for (i = 0; i < nr_blocks; i++) {
1075 				if (!ob->ec->blocks[i])
1076 					continue;
1077 
1078 				ob2 = c->open_buckets + ob->ec->blocks[i];
1079 				drop |= ob2->dev == ca->dev_idx;
1080 			}
1081 			mutex_unlock(&ob->ec->lock);
1082 		}
1083 
1084 		return drop;
1085 	} else {
1086 		return true;
1087 	}
1088 }
1089 
1090 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1091 				 bool ec, struct write_point *wp)
1092 {
1093 	struct open_buckets ptrs = { .nr = 0 };
1094 	struct open_bucket *ob;
1095 	unsigned i;
1096 
1097 	mutex_lock(&wp->lock);
1098 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1099 		if (should_drop_bucket(ob, c, ca, ec))
1100 			bch2_open_bucket_put(c, ob);
1101 		else
1102 			ob_push(c, &ptrs, ob);
1103 	wp->ptrs = ptrs;
1104 	mutex_unlock(&wp->lock);
1105 }
1106 
1107 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1108 			    bool ec)
1109 {
1110 	unsigned i;
1111 
1112 	/* Next, close write points that point to this device... */
1113 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1114 		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1115 
1116 	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1117 	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1118 	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1119 
1120 	mutex_lock(&c->btree_reserve_cache_lock);
1121 	while (c->btree_reserve_cache_nr) {
1122 		struct btree_alloc *a =
1123 			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1124 
1125 		bch2_open_buckets_put(c, &a->ob);
1126 	}
1127 	mutex_unlock(&c->btree_reserve_cache_lock);
1128 
1129 	spin_lock(&c->freelist_lock);
1130 	i = 0;
1131 	while (i < c->open_buckets_partial_nr) {
1132 		struct open_bucket *ob =
1133 			c->open_buckets + c->open_buckets_partial[i];
1134 
1135 		if (should_drop_bucket(ob, c, ca, ec)) {
1136 			--c->open_buckets_partial_nr;
1137 			swap(c->open_buckets_partial[i],
1138 			     c->open_buckets_partial[c->open_buckets_partial_nr]);
1139 			ob->on_partial_list = false;
1140 			spin_unlock(&c->freelist_lock);
1141 			bch2_open_bucket_put(c, ob);
1142 			spin_lock(&c->freelist_lock);
1143 		} else {
1144 			i++;
1145 		}
1146 	}
1147 	spin_unlock(&c->freelist_lock);
1148 
1149 	bch2_ec_stop_dev(c, ca);
1150 }
1151 
1152 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1153 						 unsigned long write_point)
1154 {
1155 	unsigned hash =
1156 		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1157 
1158 	return &c->write_points_hash[hash];
1159 }
1160 
1161 static struct write_point *__writepoint_find(struct hlist_head *head,
1162 					     unsigned long write_point)
1163 {
1164 	struct write_point *wp;
1165 
1166 	rcu_read_lock();
1167 	hlist_for_each_entry_rcu(wp, head, node)
1168 		if (wp->write_point == write_point)
1169 			goto out;
1170 	wp = NULL;
1171 out:
1172 	rcu_read_unlock();
1173 	return wp;
1174 }
1175 
1176 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1177 {
1178 	u64 stranded	= c->write_points_nr * c->bucket_size_max;
1179 	u64 free	= bch2_fs_usage_read_short(c).free;
1180 
1181 	return stranded * factor > free;
1182 }
1183 
1184 static bool try_increase_writepoints(struct bch_fs *c)
1185 {
1186 	struct write_point *wp;
1187 
1188 	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1189 	    too_many_writepoints(c, 32))
1190 		return false;
1191 
1192 	wp = c->write_points + c->write_points_nr++;
1193 	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1194 	return true;
1195 }
1196 
1197 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1198 {
1199 	struct bch_fs *c = trans->c;
1200 	struct write_point *wp;
1201 	struct open_bucket *ob;
1202 	unsigned i;
1203 
1204 	mutex_lock(&c->write_points_hash_lock);
1205 	if (c->write_points_nr < old_nr) {
1206 		mutex_unlock(&c->write_points_hash_lock);
1207 		return true;
1208 	}
1209 
1210 	if (c->write_points_nr == 1 ||
1211 	    !too_many_writepoints(c, 8)) {
1212 		mutex_unlock(&c->write_points_hash_lock);
1213 		return false;
1214 	}
1215 
1216 	wp = c->write_points + --c->write_points_nr;
1217 
1218 	hlist_del_rcu(&wp->node);
1219 	mutex_unlock(&c->write_points_hash_lock);
1220 
1221 	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1222 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1223 		open_bucket_free_unused(c, ob);
1224 	wp->ptrs.nr = 0;
1225 	mutex_unlock(&wp->lock);
1226 	return true;
1227 }
1228 
1229 static struct write_point *writepoint_find(struct btree_trans *trans,
1230 					   unsigned long write_point)
1231 {
1232 	struct bch_fs *c = trans->c;
1233 	struct write_point *wp, *oldest;
1234 	struct hlist_head *head;
1235 
1236 	if (!(write_point & 1UL)) {
1237 		wp = (struct write_point *) write_point;
1238 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1239 		return wp;
1240 	}
1241 
1242 	head = writepoint_hash(c, write_point);
1243 restart_find:
1244 	wp = __writepoint_find(head, write_point);
1245 	if (wp) {
1246 lock_wp:
1247 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1248 		if (wp->write_point == write_point)
1249 			goto out;
1250 		mutex_unlock(&wp->lock);
1251 		goto restart_find;
1252 	}
1253 restart_find_oldest:
1254 	oldest = NULL;
1255 	for (wp = c->write_points;
1256 	     wp < c->write_points + c->write_points_nr; wp++)
1257 		if (!oldest || time_before64(wp->last_used, oldest->last_used))
1258 			oldest = wp;
1259 
1260 	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1261 	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1262 	if (oldest >= c->write_points + c->write_points_nr ||
1263 	    try_increase_writepoints(c)) {
1264 		mutex_unlock(&c->write_points_hash_lock);
1265 		mutex_unlock(&oldest->lock);
1266 		goto restart_find_oldest;
1267 	}
1268 
1269 	wp = __writepoint_find(head, write_point);
1270 	if (wp && wp != oldest) {
1271 		mutex_unlock(&c->write_points_hash_lock);
1272 		mutex_unlock(&oldest->lock);
1273 		goto lock_wp;
1274 	}
1275 
1276 	wp = oldest;
1277 	hlist_del_rcu(&wp->node);
1278 	wp->write_point = write_point;
1279 	hlist_add_head_rcu(&wp->node, head);
1280 	mutex_unlock(&c->write_points_hash_lock);
1281 out:
1282 	wp->last_used = local_clock();
1283 	return wp;
1284 }
1285 
1286 static noinline void
1287 deallocate_extra_replicas(struct bch_fs *c,
1288 			  struct open_buckets *ptrs,
1289 			  struct open_buckets *ptrs_no_use,
1290 			  unsigned extra_replicas)
1291 {
1292 	struct open_buckets ptrs2 = { 0 };
1293 	struct open_bucket *ob;
1294 	unsigned i;
1295 
1296 	open_bucket_for_each(c, ptrs, ob, i) {
1297 		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1298 
1299 		if (d && d <= extra_replicas) {
1300 			extra_replicas -= d;
1301 			ob_push(c, ptrs_no_use, ob);
1302 		} else {
1303 			ob_push(c, &ptrs2, ob);
1304 		}
1305 	}
1306 
1307 	*ptrs = ptrs2;
1308 }
1309 
1310 /*
1311  * Get us an open_bucket we can allocate from, return with it locked:
1312  */
1313 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1314 			     unsigned target,
1315 			     unsigned erasure_code,
1316 			     struct write_point_specifier write_point,
1317 			     struct bch_devs_list *devs_have,
1318 			     unsigned nr_replicas,
1319 			     unsigned nr_replicas_required,
1320 			     enum bch_watermark watermark,
1321 			     unsigned flags,
1322 			     struct closure *cl,
1323 			     struct write_point **wp_ret)
1324 {
1325 	struct bch_fs *c = trans->c;
1326 	struct write_point *wp;
1327 	struct open_bucket *ob;
1328 	struct open_buckets ptrs;
1329 	unsigned nr_effective, write_points_nr;
1330 	bool have_cache;
1331 	int ret;
1332 	int i;
1333 
1334 	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1335 		erasure_code = false;
1336 
1337 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1338 
1339 	BUG_ON(!nr_replicas || !nr_replicas_required);
1340 retry:
1341 	ptrs.nr		= 0;
1342 	nr_effective	= 0;
1343 	write_points_nr = c->write_points_nr;
1344 	have_cache	= false;
1345 
1346 	*wp_ret = wp = writepoint_find(trans, write_point.v);
1347 
1348 	/* metadata may not allocate on cache devices: */
1349 	if (wp->data_type != BCH_DATA_user)
1350 		have_cache = true;
1351 
1352 	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1353 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1354 					      target, erasure_code,
1355 					      nr_replicas, &nr_effective,
1356 					      &have_cache, watermark,
1357 					      flags, NULL);
1358 		if (!ret ||
1359 		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
1360 			goto alloc_done;
1361 
1362 		/* Don't retry from all devices if we're out of open buckets: */
1363 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1364 			int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1365 					      target, erasure_code,
1366 					      nr_replicas, &nr_effective,
1367 					      &have_cache, watermark,
1368 					      flags, cl);
1369 			if (!ret ||
1370 			    bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1371 			    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1372 				goto alloc_done;
1373 		}
1374 
1375 		/*
1376 		 * Only try to allocate cache (durability = 0 devices) from the
1377 		 * specified target:
1378 		 */
1379 		have_cache = true;
1380 
1381 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1382 					      0, erasure_code,
1383 					      nr_replicas, &nr_effective,
1384 					      &have_cache, watermark,
1385 					      flags, cl);
1386 	} else {
1387 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1388 					      target, erasure_code,
1389 					      nr_replicas, &nr_effective,
1390 					      &have_cache, watermark,
1391 					      flags, cl);
1392 	}
1393 alloc_done:
1394 	BUG_ON(!ret && nr_effective < nr_replicas);
1395 
1396 	if (erasure_code && !ec_open_bucket(c, &ptrs))
1397 		pr_debug("failed to get ec bucket: ret %u", ret);
1398 
1399 	if (ret == -BCH_ERR_insufficient_devices &&
1400 	    nr_effective >= nr_replicas_required)
1401 		ret = 0;
1402 
1403 	if (ret)
1404 		goto err;
1405 
1406 	if (nr_effective > nr_replicas)
1407 		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1408 
1409 	/* Free buckets we didn't use: */
1410 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1411 		open_bucket_free_unused(c, ob);
1412 
1413 	wp->ptrs = ptrs;
1414 
1415 	wp->sectors_free = UINT_MAX;
1416 
1417 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1418 		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1419 
1420 	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1421 
1422 	return 0;
1423 err:
1424 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1425 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1426 			ob_push(c, &ptrs, ob);
1427 		else
1428 			open_bucket_free_unused(c, ob);
1429 	wp->ptrs = ptrs;
1430 
1431 	mutex_unlock(&wp->lock);
1432 
1433 	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1434 	    try_decrease_writepoints(trans, write_points_nr))
1435 		goto retry;
1436 
1437 	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1438 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
1439 		return cl
1440 			? -BCH_ERR_bucket_alloc_blocked
1441 			: -BCH_ERR_ENOSPC_bucket_alloc;
1442 
1443 	return ret;
1444 }
1445 
1446 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1447 {
1448 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1449 
1450 	return (struct bch_extent_ptr) {
1451 		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
1452 		.gen	= ob->gen,
1453 		.dev	= ob->dev,
1454 		.offset	= bucket_to_sector(ca, ob->bucket) +
1455 			ca->mi.bucket_size -
1456 			ob->sectors_free,
1457 	};
1458 }
1459 
/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated.
 *
 * Out-of-line wrapper around bch2_alloc_sectors_append_ptrs_inlined().
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c,
				    struct write_point *wp,
				    struct bkey_i *k,
				    unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

/*
 * Out-of-line wrapper around bch2_alloc_sectors_done_inlined(); see that
 * helper for what finishing an allocation on @wp involves.
 */
void bch2_alloc_sectors_done(struct bch_fs *c,
			     struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}
1475 
1476 static inline void writepoint_init(struct write_point *wp,
1477 				   enum bch_data_type type)
1478 {
1479 	mutex_init(&wp->lock);
1480 	wp->data_type = type;
1481 
1482 	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1483 	INIT_LIST_HEAD(&wp->writes);
1484 	spin_lock_init(&wp->writes_lock);
1485 }
1486 
1487 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1488 {
1489 	struct open_bucket *ob;
1490 	struct write_point *wp;
1491 
1492 	mutex_init(&c->write_points_hash_lock);
1493 	c->write_points_nr = ARRAY_SIZE(c->write_points);
1494 
1495 	/* open bucket 0 is a sentinal NULL: */
1496 	spin_lock_init(&c->open_buckets[0].lock);
1497 
1498 	for (ob = c->open_buckets + 1;
1499 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1500 		spin_lock_init(&ob->lock);
1501 		c->open_buckets_nr_free++;
1502 
1503 		ob->freelist = c->open_buckets_freelist;
1504 		c->open_buckets_freelist = ob - c->open_buckets;
1505 	}
1506 
1507 	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
1508 	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
1509 	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);
1510 
1511 	for (wp = c->write_points;
1512 	     wp < c->write_points + c->write_points_nr; wp++) {
1513 		writepoint_init(wp, BCH_DATA_user);
1514 
1515 		wp->last_used	= local_clock();
1516 		wp->write_point	= (unsigned long) wp;
1517 		hlist_add_head_rcu(&wp->node,
1518 				   writepoint_hash(c, wp->write_point));
1519 	}
1520 }
1521 
1522 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1523 {
1524 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1525 	unsigned data_type = ob->data_type;
1526 	barrier(); /* READ_ONCE() doesn't work on bitfields */
1527 
1528 	prt_printf(out, "%zu ref %u ",
1529 		   ob - c->open_buckets,
1530 		   atomic_read(&ob->pin));
1531 	bch2_prt_data_type(out, data_type);
1532 	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1533 		   ob->dev, ob->bucket, ob->gen,
1534 		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1535 	if (ob->ec)
1536 		prt_printf(out, " ec idx %llu", ob->ec->idx);
1537 	if (ob->on_partial_list)
1538 		prt_str(out, " partial");
1539 	prt_newline(out);
1540 }
1541 
1542 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1543 {
1544 	struct open_bucket *ob;
1545 
1546 	out->atomic++;
1547 
1548 	for (ob = c->open_buckets;
1549 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1550 	     ob++) {
1551 		spin_lock(&ob->lock);
1552 		if (ob->valid && !ob->on_partial_list)
1553 			bch2_open_bucket_to_text(out, c, ob);
1554 		spin_unlock(&ob->lock);
1555 	}
1556 
1557 	--out->atomic;
1558 }
1559 
1560 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1561 {
1562 	unsigned i;
1563 
1564 	out->atomic++;
1565 	spin_lock(&c->freelist_lock);
1566 
1567 	for (i = 0; i < c->open_buckets_partial_nr; i++)
1568 		bch2_open_bucket_to_text(out, c,
1569 				c->open_buckets + c->open_buckets_partial[i]);
1570 
1571 	spin_unlock(&c->freelist_lock);
1572 	--out->atomic;
1573 }
1574 
1575 static const char * const bch2_write_point_states[] = {
1576 #define x(n)	#n,
1577 	WRITE_POINT_STATES()
1578 #undef x
1579 	NULL
1580 };
1581 
1582 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1583 				     struct write_point *wp)
1584 {
1585 	struct open_bucket *ob;
1586 	unsigned i;
1587 
1588 	prt_printf(out, "%lu: ", wp->write_point);
1589 	prt_human_readable_u64(out, wp->sectors_allocated);
1590 
1591 	prt_printf(out, " last wrote: ");
1592 	bch2_pr_time_units(out, sched_clock() - wp->last_used);
1593 
1594 	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1595 		prt_printf(out, " %s: ", bch2_write_point_states[i]);
1596 		bch2_pr_time_units(out, wp->time[i]);
1597 	}
1598 
1599 	prt_newline(out);
1600 
1601 	printbuf_indent_add(out, 2);
1602 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1603 		bch2_open_bucket_to_text(out, c, ob);
1604 	printbuf_indent_sub(out, 2);
1605 }
1606 
1607 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1608 {
1609 	struct write_point *wp;
1610 
1611 	prt_str(out, "Foreground write points\n");
1612 	for (wp = c->write_points;
1613 	     wp < c->write_points + ARRAY_SIZE(c->write_points);
1614 	     wp++)
1615 		bch2_write_point_to_text(out, c, wp);
1616 
1617 	prt_str(out, "Copygc write point\n");
1618 	bch2_write_point_to_text(out, c, &c->copygc_write_point);
1619 
1620 	prt_str(out, "Rebalance write point\n");
1621 	bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1622 
1623 	prt_str(out, "Btree write point\n");
1624 	bch2_write_point_to_text(out, c, &c->btree_write_point);
1625 }
1626