xref: /linux/fs/bcachefs/alloc_foreground.c (revision 881f1bb5e25c8982ed963b2d319fc0fc732e55db)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13 
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33 
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37 
38 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39 					   struct mutex *lock)
40 {
41 	if (!mutex_trylock(lock)) {
42 		bch2_trans_unlock(trans);
43 		mutex_lock(lock);
44 	}
45 }
46 
47 const char * const bch2_watermarks[] = {
48 #define x(t) #t,
49 	BCH_WATERMARKS()
50 #undef x
51 	NULL
52 };
53 
54 /*
55  * Open buckets represent a bucket that's currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
69 
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72 	rcu_read_lock();
73 	for_each_member_device_rcu(c, ca, NULL)
74 		ca->alloc_cursor = 0;
75 	rcu_read_unlock();
76 }
77 
78 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
79 {
80 	open_bucket_idx_t idx = ob - c->open_buckets;
81 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
82 
83 	ob->hash = *slot;
84 	*slot = idx;
85 }
86 
87 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
88 {
89 	open_bucket_idx_t idx = ob - c->open_buckets;
90 	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
91 
92 	while (*slot != idx) {
93 		BUG_ON(!*slot);
94 		slot = &c->open_buckets[*slot].hash;
95 	}
96 
97 	*slot = ob->hash;
98 	ob->hash = 0;
99 }
100 
101 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
102 {
103 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
104 
105 	if (ob->ec) {
106 		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
107 		return;
108 	}
109 
110 	percpu_down_read(&c->mark_lock);
111 	spin_lock(&ob->lock);
112 
113 	ob->valid = false;
114 	ob->data_type = 0;
115 
116 	spin_unlock(&ob->lock);
117 	percpu_up_read(&c->mark_lock);
118 
119 	spin_lock(&c->freelist_lock);
120 	bch2_open_bucket_hash_remove(c, ob);
121 
122 	ob->freelist = c->open_buckets_freelist;
123 	c->open_buckets_freelist = ob - c->open_buckets;
124 
125 	c->open_buckets_nr_free++;
126 	ca->nr_open_buckets--;
127 	spin_unlock(&c->freelist_lock);
128 
129 	closure_wake_up(&c->open_buckets_wait);
130 }
131 
132 void bch2_open_bucket_write_error(struct bch_fs *c,
133 				  struct open_buckets *obs,
134 				  unsigned dev)
135 {
136 	struct open_bucket *ob;
137 	unsigned i;
138 
139 	open_bucket_for_each(c, obs, ob, i)
140 		if (ob->dev == dev && ob->ec)
141 			bch2_ec_bucket_cancel(c, ob);
142 }
143 
144 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
145 {
146 	struct open_bucket *ob;
147 
148 	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
149 
150 	ob = c->open_buckets + c->open_buckets_freelist;
151 	c->open_buckets_freelist = ob->freelist;
152 	atomic_set(&ob->pin, 1);
153 	ob->data_type = 0;
154 
155 	c->open_buckets_nr_free--;
156 	return ob;
157 }
158 
159 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
160 {
161 	BUG_ON(c->open_buckets_partial_nr >=
162 	       ARRAY_SIZE(c->open_buckets_partial));
163 
164 	spin_lock(&c->freelist_lock);
165 	ob->on_partial_list = true;
166 	c->open_buckets_partial[c->open_buckets_partial_nr++] =
167 		ob - c->open_buckets;
168 	spin_unlock(&c->freelist_lock);
169 
170 	closure_wake_up(&c->open_buckets_wait);
171 	closure_wake_up(&c->freelist_wait);
172 }
173 
174 /* _only_ for allocating the journal on a new device: */
175 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
176 {
177 	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
178 		u64 b = ca->new_fs_bucket_idx++;
179 
180 		if (!is_superblock_bucket(ca, b) &&
181 		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
182 			return b;
183 	}
184 
185 	return -1;
186 }
187 
188 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
189 {
190 	switch (watermark) {
191 	case BCH_WATERMARK_interior_updates:
192 		return 0;
193 	case BCH_WATERMARK_reclaim:
194 		return OPEN_BUCKETS_COUNT / 6;
195 	case BCH_WATERMARK_btree:
196 	case BCH_WATERMARK_btree_copygc:
197 		return OPEN_BUCKETS_COUNT / 4;
198 	case BCH_WATERMARK_copygc:
199 		return OPEN_BUCKETS_COUNT / 3;
200 	default:
201 		return OPEN_BUCKETS_COUNT / 2;
202 	}
203 }
204 
205 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
206 					      u64 bucket,
207 					      enum bch_watermark watermark,
208 					      const struct bch_alloc_v4 *a,
209 					      struct bucket_alloc_state *s,
210 					      struct closure *cl)
211 {
212 	struct open_bucket *ob;
213 
214 	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
215 		s->skipped_nouse++;
216 		return NULL;
217 	}
218 
219 	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
220 		s->skipped_open++;
221 		return NULL;
222 	}
223 
224 	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
225 			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
226 		s->skipped_need_journal_commit++;
227 		return NULL;
228 	}
229 
230 	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
231 		s->skipped_nocow++;
232 		return NULL;
233 	}
234 
235 	spin_lock(&c->freelist_lock);
236 
237 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
238 		if (cl)
239 			closure_wait(&c->open_buckets_wait, cl);
240 
241 		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
242 		spin_unlock(&c->freelist_lock);
243 		return ERR_PTR(-BCH_ERR_open_buckets_empty);
244 	}
245 
246 	/* Recheck under lock: */
247 	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
248 		spin_unlock(&c->freelist_lock);
249 		s->skipped_open++;
250 		return NULL;
251 	}
252 
253 	ob = bch2_open_bucket_alloc(c);
254 
255 	spin_lock(&ob->lock);
256 
257 	ob->valid	= true;
258 	ob->sectors_free = ca->mi.bucket_size;
259 	ob->dev		= ca->dev_idx;
260 	ob->gen		= a->gen;
261 	ob->bucket	= bucket;
262 	spin_unlock(&ob->lock);
263 
264 	ca->nr_open_buckets++;
265 	bch2_open_bucket_hash_add(c, ob);
266 
267 	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
268 	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
269 
270 	spin_unlock(&c->freelist_lock);
271 	return ob;
272 }
273 
274 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
275 					    enum bch_watermark watermark, u64 free_entry,
276 					    struct bucket_alloc_state *s,
277 					    struct bkey_s_c freespace_k,
278 					    struct closure *cl)
279 {
280 	struct bch_fs *c = trans->c;
281 	struct btree_iter iter = { NULL };
282 	struct bkey_s_c k;
283 	struct open_bucket *ob;
284 	struct bch_alloc_v4 a_convert;
285 	const struct bch_alloc_v4 *a;
286 	u64 b = free_entry & ~(~0ULL << 56);
287 	unsigned genbits = free_entry >> 56;
288 	struct printbuf buf = PRINTBUF;
289 	int ret;
290 
291 	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
292 		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
293 		       "  freespace key ",
294 			ca->mi.first_bucket, ca->mi.nbuckets);
295 		bch2_bkey_val_to_text(&buf, c, freespace_k);
296 		bch2_trans_inconsistent(trans, "%s", buf.buf);
297 		ob = ERR_PTR(-EIO);
298 		goto err;
299 	}
300 
301 	k = bch2_bkey_get_iter(trans, &iter,
302 			       BTREE_ID_alloc, POS(ca->dev_idx, b),
303 			       BTREE_ITER_CACHED);
304 	ret = bkey_err(k);
305 	if (ret) {
306 		ob = ERR_PTR(ret);
307 		goto err;
308 	}
309 
310 	a = bch2_alloc_to_v4(k, &a_convert);
311 
312 	if (a->data_type != BCH_DATA_free) {
313 		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
314 			ob = NULL;
315 			goto err;
316 		}
317 
318 		prt_printf(&buf, "non free bucket in freespace btree\n"
319 		       "  freespace key ");
320 		bch2_bkey_val_to_text(&buf, c, freespace_k);
321 		prt_printf(&buf, "\n  ");
322 		bch2_bkey_val_to_text(&buf, c, k);
323 		bch2_trans_inconsistent(trans, "%s", buf.buf);
324 		ob = ERR_PTR(-EIO);
325 		goto err;
326 	}
327 
328 	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
329 	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
330 		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
331 		       "  freespace key ",
332 		       genbits, alloc_freespace_genbits(*a) >> 56);
333 		bch2_bkey_val_to_text(&buf, c, freespace_k);
334 		prt_printf(&buf, "\n  ");
335 		bch2_bkey_val_to_text(&buf, c, k);
336 		bch2_trans_inconsistent(trans, "%s", buf.buf);
337 		ob = ERR_PTR(-EIO);
338 		goto err;
339 	}
340 
341 	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
342 		struct bch_backpointer bp;
343 		struct bpos bp_pos = POS_MIN;
344 
345 		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
346 						&bp_pos, &bp,
347 						BTREE_ITER_NOPRESERVE);
348 		if (ret) {
349 			ob = ERR_PTR(ret);
350 			goto err;
351 		}
352 
353 		if (!bkey_eq(bp_pos, POS_MAX)) {
354 			/*
355 			 * Bucket may have data in it - we don't call
356 			 * bc2h_trans_inconnsistent() because fsck hasn't
357 			 * finished yet
358 			 */
359 			ob = NULL;
360 			goto err;
361 		}
362 	}
363 
364 	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
365 	if (!ob)
366 		set_btree_iter_dontneed(&iter);
367 err:
368 	if (iter.path)
369 		set_btree_iter_dontneed(&iter);
370 	bch2_trans_iter_exit(trans, &iter);
371 	printbuf_exit(&buf);
372 	return ob;
373 }
374 
375 /*
376  * This path is for before the freespace btree is initialized:
377  *
378  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
379  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
380  */
381 static noinline struct open_bucket *
382 bch2_bucket_alloc_early(struct btree_trans *trans,
383 			struct bch_dev *ca,
384 			enum bch_watermark watermark,
385 			struct bucket_alloc_state *s,
386 			struct closure *cl)
387 {
388 	struct btree_iter iter, citer;
389 	struct bkey_s_c k, ck;
390 	struct open_bucket *ob = NULL;
391 	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
392 	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
393 	u64 alloc_cursor = alloc_start;
394 	int ret;
395 
396 	/*
397 	 * Scan with an uncached iterator to avoid polluting the key cache. An
398 	 * uncached iter will return a cached key if one exists, but if not
399 	 * there is no other underlying protection for the associated key cache
400 	 * slot. To avoid racing bucket allocations, look up the cached key slot
401 	 * of any likely allocation candidate before attempting to proceed with
402 	 * the allocation. This provides proper exclusion on the associated
403 	 * bucket.
404 	 */
405 again:
406 	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
407 			   BTREE_ITER_SLOTS, k, ret) {
408 		struct bch_alloc_v4 a_convert;
409 		const struct bch_alloc_v4 *a;
410 
411 		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
412 			break;
413 
414 		if (ca->new_fs_bucket_idx &&
415 		    is_superblock_bucket(ca, k.k->p.offset))
416 			continue;
417 
418 		a = bch2_alloc_to_v4(k, &a_convert);
419 		if (a->data_type != BCH_DATA_free)
420 			continue;
421 
422 		/* now check the cached key to serialize concurrent allocs of the bucket */
423 		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
424 		ret = bkey_err(ck);
425 		if (ret)
426 			break;
427 
428 		a = bch2_alloc_to_v4(ck, &a_convert);
429 		if (a->data_type != BCH_DATA_free)
430 			goto next;
431 
432 		s->buckets_seen++;
433 
434 		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
435 next:
436 		set_btree_iter_dontneed(&citer);
437 		bch2_trans_iter_exit(trans, &citer);
438 		if (ob)
439 			break;
440 	}
441 	bch2_trans_iter_exit(trans, &iter);
442 
443 	alloc_cursor = iter.pos.offset;
444 	ca->alloc_cursor = alloc_cursor;
445 
446 	if (!ob && ret)
447 		ob = ERR_PTR(ret);
448 
449 	if (!ob && alloc_start > first_bucket) {
450 		alloc_cursor = alloc_start = first_bucket;
451 		goto again;
452 	}
453 
454 	return ob;
455 }
456 
457 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
458 						   struct bch_dev *ca,
459 						   enum bch_watermark watermark,
460 						   struct bucket_alloc_state *s,
461 						   struct closure *cl)
462 {
463 	struct btree_iter iter;
464 	struct bkey_s_c k;
465 	struct open_bucket *ob = NULL;
466 	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
467 	u64 alloc_cursor = alloc_start;
468 	int ret;
469 
470 	BUG_ON(ca->new_fs_bucket_idx);
471 again:
472 	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
473 				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
474 		if (k.k->p.inode != ca->dev_idx)
475 			break;
476 
477 		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
478 		     alloc_cursor < k.k->p.offset;
479 		     alloc_cursor++) {
480 			ret = btree_trans_too_many_iters(trans);
481 			if (ret) {
482 				ob = ERR_PTR(ret);
483 				break;
484 			}
485 
486 			s->buckets_seen++;
487 
488 			ob = try_alloc_bucket(trans, ca, watermark,
489 					      alloc_cursor, s, k, cl);
490 			if (ob) {
491 				set_btree_iter_dontneed(&iter);
492 				break;
493 			}
494 		}
495 
496 		if (ob || ret)
497 			break;
498 	}
499 	bch2_trans_iter_exit(trans, &iter);
500 
501 	ca->alloc_cursor = alloc_cursor;
502 
503 	if (!ob && ret)
504 		ob = ERR_PTR(ret);
505 
506 	if (!ob && alloc_start > ca->mi.first_bucket) {
507 		alloc_cursor = alloc_start = ca->mi.first_bucket;
508 		goto again;
509 	}
510 
511 	return ob;
512 }
513 
514 /**
515  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
516  * @trans:	transaction object
517  * @ca:		device to allocate from
518  * @watermark:	how important is this allocation?
519  * @cl:		if not NULL, closure to be used to wait if buckets not available
520  * @usage:	for secondarily also returning the current device usage
521  *
522  * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
523  */
524 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
525 				      struct bch_dev *ca,
526 				      enum bch_watermark watermark,
527 				      struct closure *cl,
528 				      struct bch_dev_usage *usage)
529 {
530 	struct bch_fs *c = trans->c;
531 	struct open_bucket *ob = NULL;
532 	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
533 	u64 avail;
534 	struct bucket_alloc_state s = { 0 };
535 	bool waiting = false;
536 again:
537 	bch2_dev_usage_read_fast(ca, usage);
538 	avail = dev_buckets_free(ca, *usage, watermark);
539 
540 	if (usage->d[BCH_DATA_need_discard].buckets > avail)
541 		bch2_do_discards(c);
542 
543 	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
544 		bch2_do_gc_gens(c);
545 
546 	if (should_invalidate_buckets(ca, *usage))
547 		bch2_do_invalidates(c);
548 
549 	if (!avail) {
550 		if (cl && !waiting) {
551 			closure_wait(&c->freelist_wait, cl);
552 			waiting = true;
553 			goto again;
554 		}
555 
556 		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
557 
558 		ob = ERR_PTR(-BCH_ERR_freelist_empty);
559 		goto err;
560 	}
561 
562 	if (waiting)
563 		closure_wake_up(&c->freelist_wait);
564 alloc:
565 	ob = likely(freespace)
566 		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
567 		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
568 
569 	if (s.skipped_need_journal_commit * 2 > avail)
570 		bch2_journal_flush_async(&c->journal, NULL);
571 
572 	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
573 		freespace = false;
574 		goto alloc;
575 	}
576 err:
577 	if (!ob)
578 		ob = ERR_PTR(-BCH_ERR_no_buckets_found);
579 
580 	if (!IS_ERR(ob))
581 		trace_and_count(c, bucket_alloc, ca,
582 				bch2_watermarks[watermark],
583 				ob->bucket,
584 				usage->d[BCH_DATA_free].buckets,
585 				avail,
586 				bch2_copygc_wait_amount(c),
587 				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
588 				&s,
589 				cl == NULL,
590 				"");
591 	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
592 		trace_and_count(c, bucket_alloc_fail, ca,
593 				bch2_watermarks[watermark],
594 				0,
595 				usage->d[BCH_DATA_free].buckets,
596 				avail,
597 				bch2_copygc_wait_amount(c),
598 				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
599 				&s,
600 				cl == NULL,
601 				bch2_err_str(PTR_ERR(ob)));
602 
603 	return ob;
604 }
605 
606 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
607 				      enum bch_watermark watermark,
608 				      struct closure *cl)
609 {
610 	struct bch_dev_usage usage;
611 	struct open_bucket *ob;
612 
613 	bch2_trans_do(c, NULL, NULL, 0,
614 		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
615 							cl, &usage)));
616 	return ob;
617 }
618 
619 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
620 			    unsigned l, unsigned r)
621 {
622 	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
623 		(stripe->next_alloc[l] < stripe->next_alloc[r]));
624 }
625 
626 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
627 
628 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
629 					  struct dev_stripe_state *stripe,
630 					  struct bch_devs_mask *devs)
631 {
632 	struct dev_alloc_list ret = { .nr = 0 };
633 	unsigned i;
634 
635 	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
636 		ret.devs[ret.nr++] = i;
637 
638 	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
639 	return ret;
640 }
641 
642 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
643 			       struct dev_stripe_state *stripe,
644 			       struct bch_dev_usage *usage)
645 {
646 	u64 *v = stripe->next_alloc + ca->dev_idx;
647 	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
648 	u64 free_space_inv = free_space
649 		? div64_u64(1ULL << 48, free_space)
650 		: 1ULL << 48;
651 	u64 scale = *v / 4;
652 
653 	if (*v + free_space_inv >= *v)
654 		*v += free_space_inv;
655 	else
656 		*v = U64_MAX;
657 
658 	for (v = stripe->next_alloc;
659 	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
660 		*v = *v < scale ? 0 : *v - scale;
661 }
662 
663 void bch2_dev_stripe_increment(struct bch_dev *ca,
664 			       struct dev_stripe_state *stripe)
665 {
666 	struct bch_dev_usage usage;
667 
668 	bch2_dev_usage_read_fast(ca, &usage);
669 	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
670 }
671 
672 static int add_new_bucket(struct bch_fs *c,
673 			   struct open_buckets *ptrs,
674 			   struct bch_devs_mask *devs_may_alloc,
675 			   unsigned nr_replicas,
676 			   unsigned *nr_effective,
677 			   bool *have_cache,
678 			   unsigned flags,
679 			   struct open_bucket *ob)
680 {
681 	unsigned durability =
682 		bch_dev_bkey_exists(c, ob->dev)->mi.durability;
683 
684 	BUG_ON(*nr_effective >= nr_replicas);
685 
686 	__clear_bit(ob->dev, devs_may_alloc->d);
687 	*nr_effective	+= durability;
688 	*have_cache	|= !durability;
689 
690 	ob_push(c, ptrs, ob);
691 
692 	if (*nr_effective >= nr_replicas)
693 		return 1;
694 	if (ob->ec)
695 		return 1;
696 	return 0;
697 }
698 
699 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
700 		      struct open_buckets *ptrs,
701 		      struct dev_stripe_state *stripe,
702 		      struct bch_devs_mask *devs_may_alloc,
703 		      unsigned nr_replicas,
704 		      unsigned *nr_effective,
705 		      bool *have_cache,
706 		      unsigned flags,
707 		      enum bch_data_type data_type,
708 		      enum bch_watermark watermark,
709 		      struct closure *cl)
710 {
711 	struct bch_fs *c = trans->c;
712 	struct dev_alloc_list devs_sorted =
713 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
714 	unsigned dev;
715 	struct bch_dev *ca;
716 	int ret = -BCH_ERR_insufficient_devices;
717 	unsigned i;
718 
719 	BUG_ON(*nr_effective >= nr_replicas);
720 
721 	for (i = 0; i < devs_sorted.nr; i++) {
722 		struct bch_dev_usage usage;
723 		struct open_bucket *ob;
724 
725 		dev = devs_sorted.devs[i];
726 
727 		rcu_read_lock();
728 		ca = rcu_dereference(c->devs[dev]);
729 		if (ca)
730 			percpu_ref_get(&ca->ref);
731 		rcu_read_unlock();
732 
733 		if (!ca)
734 			continue;
735 
736 		if (!ca->mi.durability && *have_cache) {
737 			percpu_ref_put(&ca->ref);
738 			continue;
739 		}
740 
741 		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
742 		if (!IS_ERR(ob))
743 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
744 		percpu_ref_put(&ca->ref);
745 
746 		if (IS_ERR(ob)) {
747 			ret = PTR_ERR(ob);
748 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
749 				break;
750 			continue;
751 		}
752 
753 		ob->data_type = data_type;
754 
755 		if (add_new_bucket(c, ptrs, devs_may_alloc,
756 				   nr_replicas, nr_effective,
757 				   have_cache, flags, ob)) {
758 			ret = 0;
759 			break;
760 		}
761 	}
762 
763 	return ret;
764 }
765 
766 /* Allocate from stripes: */
767 
768 /*
769  * if we can't allocate a new stripe because there are already too many
770  * partially filled stripes, force allocating from an existing stripe even when
771  * it's to a device we don't want:
772  */
773 
774 static int bucket_alloc_from_stripe(struct btree_trans *trans,
775 			 struct open_buckets *ptrs,
776 			 struct write_point *wp,
777 			 struct bch_devs_mask *devs_may_alloc,
778 			 u16 target,
779 			 unsigned nr_replicas,
780 			 unsigned *nr_effective,
781 			 bool *have_cache,
782 			 enum bch_watermark watermark,
783 			 unsigned flags,
784 			 struct closure *cl)
785 {
786 	struct bch_fs *c = trans->c;
787 	struct dev_alloc_list devs_sorted;
788 	struct ec_stripe_head *h;
789 	struct open_bucket *ob;
790 	unsigned i, ec_idx;
791 	int ret = 0;
792 
793 	if (nr_replicas < 2)
794 		return 0;
795 
796 	if (ec_open_bucket(c, ptrs))
797 		return 0;
798 
799 	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
800 	if (IS_ERR(h))
801 		return PTR_ERR(h);
802 	if (!h)
803 		return 0;
804 
805 	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
806 
807 	for (i = 0; i < devs_sorted.nr; i++)
808 		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
809 			if (!h->s->blocks[ec_idx])
810 				continue;
811 
812 			ob = c->open_buckets + h->s->blocks[ec_idx];
813 			if (ob->dev == devs_sorted.devs[i] &&
814 			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
815 				goto got_bucket;
816 		}
817 	goto out_put_head;
818 got_bucket:
819 	ob->ec_idx	= ec_idx;
820 	ob->ec		= h->s;
821 	ec_stripe_new_get(h->s, STRIPE_REF_io);
822 
823 	ret = add_new_bucket(c, ptrs, devs_may_alloc,
824 			     nr_replicas, nr_effective,
825 			     have_cache, flags, ob);
826 out_put_head:
827 	bch2_ec_stripe_head_put(c, h);
828 	return ret;
829 }
830 
831 /* Sector allocator */
832 
833 static bool want_bucket(struct bch_fs *c,
834 			struct write_point *wp,
835 			struct bch_devs_mask *devs_may_alloc,
836 			bool *have_cache, bool ec,
837 			struct open_bucket *ob)
838 {
839 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
840 
841 	if (!test_bit(ob->dev, devs_may_alloc->d))
842 		return false;
843 
844 	if (ob->data_type != wp->data_type)
845 		return false;
846 
847 	if (!ca->mi.durability &&
848 	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
849 		return false;
850 
851 	if (ec != (ob->ec != NULL))
852 		return false;
853 
854 	return true;
855 }
856 
857 static int bucket_alloc_set_writepoint(struct bch_fs *c,
858 				       struct open_buckets *ptrs,
859 				       struct write_point *wp,
860 				       struct bch_devs_mask *devs_may_alloc,
861 				       unsigned nr_replicas,
862 				       unsigned *nr_effective,
863 				       bool *have_cache,
864 				       bool ec, unsigned flags)
865 {
866 	struct open_buckets ptrs_skip = { .nr = 0 };
867 	struct open_bucket *ob;
868 	unsigned i;
869 	int ret = 0;
870 
871 	open_bucket_for_each(c, &wp->ptrs, ob, i) {
872 		if (!ret && want_bucket(c, wp, devs_may_alloc,
873 					have_cache, ec, ob))
874 			ret = add_new_bucket(c, ptrs, devs_may_alloc,
875 				       nr_replicas, nr_effective,
876 				       have_cache, flags, ob);
877 		else
878 			ob_push(c, &ptrs_skip, ob);
879 	}
880 	wp->ptrs = ptrs_skip;
881 
882 	return ret;
883 }
884 
885 static int bucket_alloc_set_partial(struct bch_fs *c,
886 				    struct open_buckets *ptrs,
887 				    struct write_point *wp,
888 				    struct bch_devs_mask *devs_may_alloc,
889 				    unsigned nr_replicas,
890 				    unsigned *nr_effective,
891 				    bool *have_cache, bool ec,
892 				    enum bch_watermark watermark,
893 				    unsigned flags)
894 {
895 	int i, ret = 0;
896 
897 	if (!c->open_buckets_partial_nr)
898 		return 0;
899 
900 	spin_lock(&c->freelist_lock);
901 
902 	if (!c->open_buckets_partial_nr)
903 		goto unlock;
904 
905 	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
906 		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
907 
908 		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
909 			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
910 			struct bch_dev_usage usage;
911 			u64 avail;
912 
913 			bch2_dev_usage_read_fast(ca, &usage);
914 			avail = dev_buckets_free(ca, usage, watermark);
915 			if (!avail)
916 				continue;
917 
918 			array_remove_item(c->open_buckets_partial,
919 					  c->open_buckets_partial_nr,
920 					  i);
921 			ob->on_partial_list = false;
922 
923 			ret = add_new_bucket(c, ptrs, devs_may_alloc,
924 					     nr_replicas, nr_effective,
925 					     have_cache, flags, ob);
926 			if (ret)
927 				break;
928 		}
929 	}
930 unlock:
931 	spin_unlock(&c->freelist_lock);
932 	return ret;
933 }
934 
935 static int __open_bucket_add_buckets(struct btree_trans *trans,
936 			struct open_buckets *ptrs,
937 			struct write_point *wp,
938 			struct bch_devs_list *devs_have,
939 			u16 target,
940 			bool erasure_code,
941 			unsigned nr_replicas,
942 			unsigned *nr_effective,
943 			bool *have_cache,
944 			enum bch_watermark watermark,
945 			unsigned flags,
946 			struct closure *_cl)
947 {
948 	struct bch_fs *c = trans->c;
949 	struct bch_devs_mask devs;
950 	struct open_bucket *ob;
951 	struct closure *cl = NULL;
952 	unsigned i;
953 	int ret;
954 
955 	devs = target_rw_devs(c, wp->data_type, target);
956 
957 	/* Don't allocate from devices we already have pointers to: */
958 	darray_for_each(*devs_have, i)
959 		__clear_bit(*i, devs.d);
960 
961 	open_bucket_for_each(c, ptrs, ob, i)
962 		__clear_bit(ob->dev, devs.d);
963 
964 	if (erasure_code && ec_open_bucket(c, ptrs))
965 		return 0;
966 
967 	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
968 				 nr_replicas, nr_effective,
969 				 have_cache, erasure_code, flags);
970 	if (ret)
971 		return ret;
972 
973 	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
974 				 nr_replicas, nr_effective,
975 				 have_cache, erasure_code, watermark, flags);
976 	if (ret)
977 		return ret;
978 
979 	if (erasure_code) {
980 		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
981 					 target,
982 					 nr_replicas, nr_effective,
983 					 have_cache,
984 					 watermark, flags, _cl);
985 	} else {
986 retry_blocking:
987 		/*
988 		 * Try nonblocking first, so that if one device is full we'll try from
989 		 * other devices:
990 		 */
991 		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
992 					nr_replicas, nr_effective, have_cache,
993 					flags, wp->data_type, watermark, cl);
994 		if (ret &&
995 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
996 		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
997 		    !cl && _cl) {
998 			cl = _cl;
999 			goto retry_blocking;
1000 		}
1001 	}
1002 
1003 	return ret;
1004 }
1005 
1006 static int open_bucket_add_buckets(struct btree_trans *trans,
1007 			struct open_buckets *ptrs,
1008 			struct write_point *wp,
1009 			struct bch_devs_list *devs_have,
1010 			u16 target,
1011 			unsigned erasure_code,
1012 			unsigned nr_replicas,
1013 			unsigned *nr_effective,
1014 			bool *have_cache,
1015 			enum bch_watermark watermark,
1016 			unsigned flags,
1017 			struct closure *cl)
1018 {
1019 	int ret;
1020 
1021 	if (erasure_code) {
1022 		ret = __open_bucket_add_buckets(trans, ptrs, wp,
1023 				devs_have, target, erasure_code,
1024 				nr_replicas, nr_effective, have_cache,
1025 				watermark, flags, cl);
1026 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1027 		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1028 		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1029 		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1030 			return ret;
1031 		if (*nr_effective >= nr_replicas)
1032 			return 0;
1033 	}
1034 
1035 	ret = __open_bucket_add_buckets(trans, ptrs, wp,
1036 			devs_have, target, false,
1037 			nr_replicas, nr_effective, have_cache,
1038 			watermark, flags, cl);
1039 	return ret < 0 ? ret : 0;
1040 }
1041 
1042 /**
1043  * should_drop_bucket - check if this is open_bucket should go away
1044  * @ob:		open_bucket to predicate on
1045  * @c:		filesystem handle
1046  * @ca:		if set, we're killing buckets for a particular device
1047  * @ec:		if true, we're shutting down erasure coding and killing all ec
1048  *		open_buckets
1049  *		otherwise, return true
1050  * Returns: true if we should kill this open_bucket
1051  *
1052  * We're killing open_buckets because we're shutting down a device, erasure
1053  * coding, or the entire filesystem - check if this open_bucket matches:
1054  */
1055 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1056 			       struct bch_dev *ca, bool ec)
1057 {
1058 	if (ec) {
1059 		return ob->ec != NULL;
1060 	} else if (ca) {
1061 		bool drop = ob->dev == ca->dev_idx;
1062 		struct open_bucket *ob2;
1063 		unsigned i;
1064 
1065 		if (!drop && ob->ec) {
1066 			unsigned nr_blocks;
1067 
1068 			mutex_lock(&ob->ec->lock);
1069 			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1070 
1071 			for (i = 0; i < nr_blocks; i++) {
1072 				if (!ob->ec->blocks[i])
1073 					continue;
1074 
1075 				ob2 = c->open_buckets + ob->ec->blocks[i];
1076 				drop |= ob2->dev == ca->dev_idx;
1077 			}
1078 			mutex_unlock(&ob->ec->lock);
1079 		}
1080 
1081 		return drop;
1082 	} else {
1083 		return true;
1084 	}
1085 }
1086 
1087 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1088 				 bool ec, struct write_point *wp)
1089 {
1090 	struct open_buckets ptrs = { .nr = 0 };
1091 	struct open_bucket *ob;
1092 	unsigned i;
1093 
1094 	mutex_lock(&wp->lock);
1095 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1096 		if (should_drop_bucket(ob, c, ca, ec))
1097 			bch2_open_bucket_put(c, ob);
1098 		else
1099 			ob_push(c, &ptrs, ob);
1100 	wp->ptrs = ptrs;
1101 	mutex_unlock(&wp->lock);
1102 }
1103 
1104 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1105 			    bool ec)
1106 {
1107 	unsigned i;
1108 
1109 	/* Next, close write points that point to this device... */
1110 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1111 		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1112 
1113 	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1114 	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1115 	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1116 
1117 	mutex_lock(&c->btree_reserve_cache_lock);
1118 	while (c->btree_reserve_cache_nr) {
1119 		struct btree_alloc *a =
1120 			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1121 
1122 		bch2_open_buckets_put(c, &a->ob);
1123 	}
1124 	mutex_unlock(&c->btree_reserve_cache_lock);
1125 
1126 	spin_lock(&c->freelist_lock);
1127 	i = 0;
1128 	while (i < c->open_buckets_partial_nr) {
1129 		struct open_bucket *ob =
1130 			c->open_buckets + c->open_buckets_partial[i];
1131 
1132 		if (should_drop_bucket(ob, c, ca, ec)) {
1133 			--c->open_buckets_partial_nr;
1134 			swap(c->open_buckets_partial[i],
1135 			     c->open_buckets_partial[c->open_buckets_partial_nr]);
1136 			ob->on_partial_list = false;
1137 			spin_unlock(&c->freelist_lock);
1138 			bch2_open_bucket_put(c, ob);
1139 			spin_lock(&c->freelist_lock);
1140 		} else {
1141 			i++;
1142 		}
1143 	}
1144 	spin_unlock(&c->freelist_lock);
1145 
1146 	bch2_ec_stop_dev(c, ca);
1147 }
1148 
1149 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1150 						 unsigned long write_point)
1151 {
1152 	unsigned hash =
1153 		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1154 
1155 	return &c->write_points_hash[hash];
1156 }
1157 
1158 static struct write_point *__writepoint_find(struct hlist_head *head,
1159 					     unsigned long write_point)
1160 {
1161 	struct write_point *wp;
1162 
1163 	rcu_read_lock();
1164 	hlist_for_each_entry_rcu(wp, head, node)
1165 		if (wp->write_point == write_point)
1166 			goto out;
1167 	wp = NULL;
1168 out:
1169 	rcu_read_unlock();
1170 	return wp;
1171 }
1172 
1173 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1174 {
1175 	u64 stranded	= c->write_points_nr * c->bucket_size_max;
1176 	u64 free	= bch2_fs_usage_read_short(c).free;
1177 
1178 	return stranded * factor > free;
1179 }
1180 
1181 static bool try_increase_writepoints(struct bch_fs *c)
1182 {
1183 	struct write_point *wp;
1184 
1185 	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1186 	    too_many_writepoints(c, 32))
1187 		return false;
1188 
1189 	wp = c->write_points + c->write_points_nr++;
1190 	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1191 	return true;
1192 }
1193 
1194 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1195 {
1196 	struct bch_fs *c = trans->c;
1197 	struct write_point *wp;
1198 	struct open_bucket *ob;
1199 	unsigned i;
1200 
1201 	mutex_lock(&c->write_points_hash_lock);
1202 	if (c->write_points_nr < old_nr) {
1203 		mutex_unlock(&c->write_points_hash_lock);
1204 		return true;
1205 	}
1206 
1207 	if (c->write_points_nr == 1 ||
1208 	    !too_many_writepoints(c, 8)) {
1209 		mutex_unlock(&c->write_points_hash_lock);
1210 		return false;
1211 	}
1212 
1213 	wp = c->write_points + --c->write_points_nr;
1214 
1215 	hlist_del_rcu(&wp->node);
1216 	mutex_unlock(&c->write_points_hash_lock);
1217 
1218 	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1219 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1220 		open_bucket_free_unused(c, ob);
1221 	wp->ptrs.nr = 0;
1222 	mutex_unlock(&wp->lock);
1223 	return true;
1224 }
1225 
1226 static struct write_point *writepoint_find(struct btree_trans *trans,
1227 					   unsigned long write_point)
1228 {
1229 	struct bch_fs *c = trans->c;
1230 	struct write_point *wp, *oldest;
1231 	struct hlist_head *head;
1232 
1233 	if (!(write_point & 1UL)) {
1234 		wp = (struct write_point *) write_point;
1235 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1236 		return wp;
1237 	}
1238 
1239 	head = writepoint_hash(c, write_point);
1240 restart_find:
1241 	wp = __writepoint_find(head, write_point);
1242 	if (wp) {
1243 lock_wp:
1244 		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1245 		if (wp->write_point == write_point)
1246 			goto out;
1247 		mutex_unlock(&wp->lock);
1248 		goto restart_find;
1249 	}
1250 restart_find_oldest:
1251 	oldest = NULL;
1252 	for (wp = c->write_points;
1253 	     wp < c->write_points + c->write_points_nr; wp++)
1254 		if (!oldest || time_before64(wp->last_used, oldest->last_used))
1255 			oldest = wp;
1256 
1257 	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1258 	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1259 	if (oldest >= c->write_points + c->write_points_nr ||
1260 	    try_increase_writepoints(c)) {
1261 		mutex_unlock(&c->write_points_hash_lock);
1262 		mutex_unlock(&oldest->lock);
1263 		goto restart_find_oldest;
1264 	}
1265 
1266 	wp = __writepoint_find(head, write_point);
1267 	if (wp && wp != oldest) {
1268 		mutex_unlock(&c->write_points_hash_lock);
1269 		mutex_unlock(&oldest->lock);
1270 		goto lock_wp;
1271 	}
1272 
1273 	wp = oldest;
1274 	hlist_del_rcu(&wp->node);
1275 	wp->write_point = write_point;
1276 	hlist_add_head_rcu(&wp->node, head);
1277 	mutex_unlock(&c->write_points_hash_lock);
1278 out:
1279 	wp->last_used = local_clock();
1280 	return wp;
1281 }
1282 
1283 static noinline void
1284 deallocate_extra_replicas(struct bch_fs *c,
1285 			  struct open_buckets *ptrs,
1286 			  struct open_buckets *ptrs_no_use,
1287 			  unsigned extra_replicas)
1288 {
1289 	struct open_buckets ptrs2 = { 0 };
1290 	struct open_bucket *ob;
1291 	unsigned i;
1292 
1293 	open_bucket_for_each(c, ptrs, ob, i) {
1294 		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1295 
1296 		if (d && d <= extra_replicas) {
1297 			extra_replicas -= d;
1298 			ob_push(c, ptrs_no_use, ob);
1299 		} else {
1300 			ob_push(c, &ptrs2, ob);
1301 		}
1302 	}
1303 
1304 	*ptrs = ptrs2;
1305 }
1306 
1307 /*
1308  * Get us an open_bucket we can allocate from, return with it locked:
1309  */
1310 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1311 			     unsigned target,
1312 			     unsigned erasure_code,
1313 			     struct write_point_specifier write_point,
1314 			     struct bch_devs_list *devs_have,
1315 			     unsigned nr_replicas,
1316 			     unsigned nr_replicas_required,
1317 			     enum bch_watermark watermark,
1318 			     unsigned flags,
1319 			     struct closure *cl,
1320 			     struct write_point **wp_ret)
1321 {
1322 	struct bch_fs *c = trans->c;
1323 	struct write_point *wp;
1324 	struct open_bucket *ob;
1325 	struct open_buckets ptrs;
1326 	unsigned nr_effective, write_points_nr;
1327 	bool have_cache;
1328 	int ret;
1329 	int i;
1330 
1331 	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1332 		erasure_code = false;
1333 
1334 	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1335 
1336 	BUG_ON(!nr_replicas || !nr_replicas_required);
1337 retry:
1338 	ptrs.nr		= 0;
1339 	nr_effective	= 0;
1340 	write_points_nr = c->write_points_nr;
1341 	have_cache	= false;
1342 
1343 	*wp_ret = wp = writepoint_find(trans, write_point.v);
1344 
1345 	/* metadata may not allocate on cache devices: */
1346 	if (wp->data_type != BCH_DATA_user)
1347 		have_cache = true;
1348 
1349 	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1350 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1351 					      target, erasure_code,
1352 					      nr_replicas, &nr_effective,
1353 					      &have_cache, watermark,
1354 					      flags, NULL);
1355 		if (!ret ||
1356 		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
1357 			goto alloc_done;
1358 
1359 		/* Don't retry from all devices if we're out of open buckets: */
1360 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
1361 			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1362 					      target, erasure_code,
1363 					      nr_replicas, &nr_effective,
1364 					      &have_cache, watermark,
1365 					      flags, cl);
1366 			if (!ret2 ||
1367 			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
1368 			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
1369 				ret = ret2;
1370 				goto alloc_done;
1371 			}
1372 		}
1373 
1374 		/*
1375 		 * Only try to allocate cache (durability = 0 devices) from the
1376 		 * specified target:
1377 		 */
1378 		have_cache = true;
1379 
1380 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1381 					      0, erasure_code,
1382 					      nr_replicas, &nr_effective,
1383 					      &have_cache, watermark,
1384 					      flags, cl);
1385 	} else {
1386 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1387 					      target, erasure_code,
1388 					      nr_replicas, &nr_effective,
1389 					      &have_cache, watermark,
1390 					      flags, cl);
1391 	}
1392 alloc_done:
1393 	BUG_ON(!ret && nr_effective < nr_replicas);
1394 
1395 	if (erasure_code && !ec_open_bucket(c, &ptrs))
1396 		pr_debug("failed to get ec bucket: ret %u", ret);
1397 
1398 	if (ret == -BCH_ERR_insufficient_devices &&
1399 	    nr_effective >= nr_replicas_required)
1400 		ret = 0;
1401 
1402 	if (ret)
1403 		goto err;
1404 
1405 	if (nr_effective > nr_replicas)
1406 		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1407 
1408 	/* Free buckets we didn't use: */
1409 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1410 		open_bucket_free_unused(c, ob);
1411 
1412 	wp->ptrs = ptrs;
1413 
1414 	wp->sectors_free = UINT_MAX;
1415 
1416 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1417 		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1418 
1419 	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1420 
1421 	return 0;
1422 err:
1423 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1424 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1425 			ob_push(c, &ptrs, ob);
1426 		else
1427 			open_bucket_free_unused(c, ob);
1428 	wp->ptrs = ptrs;
1429 
1430 	mutex_unlock(&wp->lock);
1431 
1432 	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1433 	    try_decrease_writepoints(trans, write_points_nr))
1434 		goto retry;
1435 
1436 	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1437 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
1438 		return cl
1439 			? -BCH_ERR_bucket_alloc_blocked
1440 			: -BCH_ERR_ENOSPC_bucket_alloc;
1441 
1442 	return ret;
1443 }
1444 
1445 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1446 {
1447 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1448 
1449 	return (struct bch_extent_ptr) {
1450 		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
1451 		.gen	= ob->gen,
1452 		.dev	= ob->dev,
1453 		.offset	= bucket_to_sector(ca, ob->bucket) +
1454 			ca->mi.bucket_size -
1455 			ob->sectors_free,
1456 	};
1457 }
1458 
1459 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1460 				    struct bkey_i *k, unsigned sectors,
1461 				    bool cached)
1462 {
1463 	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1464 }
1465 
1466 /*
1467  * Append pointers to the space we just allocated to @k, and mark @sectors space
1468  * as allocated out of @ob
1469  */
1470 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1471 {
1472 	bch2_alloc_sectors_done_inlined(c, wp);
1473 }
1474 
1475 static inline void writepoint_init(struct write_point *wp,
1476 				   enum bch_data_type type)
1477 {
1478 	mutex_init(&wp->lock);
1479 	wp->data_type = type;
1480 
1481 	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1482 	INIT_LIST_HEAD(&wp->writes);
1483 	spin_lock_init(&wp->writes_lock);
1484 }
1485 
1486 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1487 {
1488 	struct open_bucket *ob;
1489 	struct write_point *wp;
1490 
1491 	mutex_init(&c->write_points_hash_lock);
1492 	c->write_points_nr = ARRAY_SIZE(c->write_points);
1493 
1494 	/* open bucket 0 is a sentinal NULL: */
1495 	spin_lock_init(&c->open_buckets[0].lock);
1496 
1497 	for (ob = c->open_buckets + 1;
1498 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1499 		spin_lock_init(&ob->lock);
1500 		c->open_buckets_nr_free++;
1501 
1502 		ob->freelist = c->open_buckets_freelist;
1503 		c->open_buckets_freelist = ob - c->open_buckets;
1504 	}
1505 
1506 	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
1507 	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
1508 	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);
1509 
1510 	for (wp = c->write_points;
1511 	     wp < c->write_points + c->write_points_nr; wp++) {
1512 		writepoint_init(wp, BCH_DATA_user);
1513 
1514 		wp->last_used	= local_clock();
1515 		wp->write_point	= (unsigned long) wp;
1516 		hlist_add_head_rcu(&wp->node,
1517 				   writepoint_hash(c, wp->write_point));
1518 	}
1519 }
1520 
1521 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1522 {
1523 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1524 	unsigned data_type = ob->data_type;
1525 	barrier(); /* READ_ONCE() doesn't work on bitfields */
1526 
1527 	prt_printf(out, "%zu ref %u ",
1528 		   ob - c->open_buckets,
1529 		   atomic_read(&ob->pin));
1530 	bch2_prt_data_type(out, data_type);
1531 	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
1532 		   ob->dev, ob->bucket, ob->gen,
1533 		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1534 	if (ob->ec)
1535 		prt_printf(out, " ec idx %llu", ob->ec->idx);
1536 	if (ob->on_partial_list)
1537 		prt_str(out, " partial");
1538 	prt_newline(out);
1539 }
1540 
1541 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1542 {
1543 	struct open_bucket *ob;
1544 
1545 	out->atomic++;
1546 
1547 	for (ob = c->open_buckets;
1548 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1549 	     ob++) {
1550 		spin_lock(&ob->lock);
1551 		if (ob->valid && !ob->on_partial_list)
1552 			bch2_open_bucket_to_text(out, c, ob);
1553 		spin_unlock(&ob->lock);
1554 	}
1555 
1556 	--out->atomic;
1557 }
1558 
1559 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1560 {
1561 	unsigned i;
1562 
1563 	out->atomic++;
1564 	spin_lock(&c->freelist_lock);
1565 
1566 	for (i = 0; i < c->open_buckets_partial_nr; i++)
1567 		bch2_open_bucket_to_text(out, c,
1568 				c->open_buckets + c->open_buckets_partial[i]);
1569 
1570 	spin_unlock(&c->freelist_lock);
1571 	--out->atomic;
1572 }
1573 
1574 static const char * const bch2_write_point_states[] = {
1575 #define x(n)	#n,
1576 	WRITE_POINT_STATES()
1577 #undef x
1578 	NULL
1579 };
1580 
1581 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1582 				     struct write_point *wp)
1583 {
1584 	struct open_bucket *ob;
1585 	unsigned i;
1586 
1587 	prt_printf(out, "%lu: ", wp->write_point);
1588 	prt_human_readable_u64(out, wp->sectors_allocated);
1589 
1590 	prt_printf(out, " last wrote: ");
1591 	bch2_pr_time_units(out, sched_clock() - wp->last_used);
1592 
1593 	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1594 		prt_printf(out, " %s: ", bch2_write_point_states[i]);
1595 		bch2_pr_time_units(out, wp->time[i]);
1596 	}
1597 
1598 	prt_newline(out);
1599 
1600 	printbuf_indent_add(out, 2);
1601 	open_bucket_for_each(c, &wp->ptrs, ob, i)
1602 		bch2_open_bucket_to_text(out, c, ob);
1603 	printbuf_indent_sub(out, 2);
1604 }
1605 
1606 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1607 {
1608 	struct write_point *wp;
1609 
1610 	prt_str(out, "Foreground write points\n");
1611 	for (wp = c->write_points;
1612 	     wp < c->write_points + ARRAY_SIZE(c->write_points);
1613 	     wp++)
1614 		bch2_write_point_to_text(out, c, wp);
1615 
1616 	prt_str(out, "Copygc write point\n");
1617 	bch2_write_point_to_text(out, c, &c->copygc_write_point);
1618 
1619 	prt_str(out, "Rebalance write point\n");
1620 	bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1621 
1622 	prt_str(out, "Btree write point\n");
1623 	bch2_write_point_to_text(out, c, &c->btree_write_point);
1624 }
1625