xref: /linux/fs/gfs2/rgrp.c (revision f2bf88c4afc8c5ab92b40af24819933e57d0968c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 /*
40  * These routines are used by the resource group routines (rgrp.c)
41  * to keep track of block allocation.  Each block is represented by two
42  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
43  *
44  * 0 = Free
45  * 1 = Used (not metadata)
46  * 2 = Unlinked (still in use) inode
47  * 3 = Used (metadata)
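    *
    * For example, the byte 0xE4 (binary 11 10 01 00; blocks are stored
    * low-order bits first) describes four blocks: block 0 is free (00),
    * block 1 is used data (01), block 2 is an unlinked inode (10), and
    * block 3 is used metadata (11).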
48  */
49 
50 struct gfs2_extent {
51 	struct gfs2_rbm rbm;
52 	u32 len;
53 };
54 
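   /*
    * valid_change[new_state * 4 + cur_state] is nonzero iff changing a
    * block from cur_state to new_state is a legal bitmap transition;
    * for example, freeing an already free block (0 -> 0) is invalid.
    * The n/e/w row labels below spell "new": rows are indexed by the
    * new state, columns by the current state.
    */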
55 static const char valid_change[16] = {
56 	        /* current */
57 	/* n */ 0, 1, 1, 1,
58 	/* e */ 1, 0, 0, 0,
59 	/* w */ 0, 0, 0, 1,
60 	        1, 0, 0, 0
61 };
62 
63 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
64 			 const struct gfs2_inode *ip, bool nowrap);
65 
66 
67 /**
68  * gfs2_setbit - Set a bit in the bitmaps
69  * @rbm: The position of the bit to set
70  * @do_clone: Also set the clone bitmap, if it exists
71  * @new_state: the new state of the block
72  *
73  */
74 
75 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
76 			       unsigned char new_state)
77 {
78 	unsigned char *byte1, *byte2, *end, cur_state;
79 	struct gfs2_bitmap *bi = rbm_bi(rbm);
80 	unsigned int buflen = bi->bi_bytes;
81 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
82 
83 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
84 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
85 
86 	BUG_ON(byte1 >= end);
87 
88 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
89 
90 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
91 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
92 
93 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
94 			rbm->offset, cur_state, new_state);
95 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
96 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
97 			(unsigned long long)bi->bi_bh->b_blocknr);
98 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
99 			bi->bi_offset, bi->bi_bytes,
100 			(unsigned long long)gfs2_rbm_to_block(rbm));
101 		dump_stack();
102 		gfs2_consist_rgrpd(rbm->rgd);
103 		return;
104 	}
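    	/* XOR with (cur_state ^ new_state) flips exactly the bits that
    	 * differ, leaving the other block states in this byte untouched. */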
105 	*byte1 ^= (cur_state ^ new_state) << bit;
106 
107 	if (do_clone && bi->bi_clone) {
108 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
109 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
110 		*byte2 ^= (cur_state ^ new_state) << bit;
111 	}
112 }
113 
114 /**
115  * gfs2_testbit - test a bit in the bitmaps
116  * @rbm: The bit to test
117  * @use_clone: If true, test the clone bitmap, not the official bitmap.
118  *
119  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
120  * not the "real" bitmaps, to avoid allocating recently freed blocks.
121  *
122  * Returns: The two bit block state of the requested bit
123  */
124 
125 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
126 {
127 	struct gfs2_bitmap *bi = rbm_bi(rbm);
128 	const u8 *buffer;
129 	const u8 *byte;
130 	unsigned int bit;
131 
132 	if (use_clone && bi->bi_clone)
133 		buffer = bi->bi_clone;
134 	else
135 		buffer = bi->bi_bh->b_data;
136 	buffer += bi->bi_offset;
137 	byte = buffer + (rbm->offset / GFS2_NBBY);
138 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
139 
140 	return (*byte >> bit) & GFS2_BIT_MASK;
141 }
142 
143 /**
144  * gfs2_bit_search - search bitmap data for blocks in a given state
145  * @ptr: Pointer to bitmap data
146  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
147  * @state: The state we are searching for
148  *
149  * We xor the bitmap data with a pattern which is the bitwise opposite
150  * of what we are looking for; this gives rise to a pattern of ones
151  * wherever there is a match. Since we have two bits per entry, we
152  * take this pattern, shift it down by one place and then AND it with
153  * the original. All the even bit positions (0,2,4, etc) then represent
154  * successful matches, so we mask with 0x55555..... to remove the unwanted
155  * odd bit positions.
156  *
157  * This allows searching of a whole u64 at once (32 blocks) with a
158  * single test (on 64 bit arches).
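     *
     * A worked example on a single byte (the real search operates on
     * whole u64 words): looking for state 1 in the byte 0x55 (four
     * blocks, all in state 01), xoring with the state-1 pattern 0xaa
     * gives 0xff; then 0xff & (0xff >> 1) = 0x7f, and masking with 0x55
     * leaves 0x55.  All four even bit positions are set: four matches.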
159  */
160 
161 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
162 {
163 	u64 tmp;
164 	static const u64 search[] = {
165 		[0] = 0xffffffffffffffffULL,
166 		[1] = 0xaaaaaaaaaaaaaaaaULL,
167 		[2] = 0x5555555555555555ULL,
168 		[3] = 0x0000000000000000ULL,
169 	};
170 	tmp = le64_to_cpu(*ptr) ^ search[state];
171 	tmp &= (tmp >> 1);
172 	tmp &= mask;
173 	return tmp;
174 }
175 
176 /**
177  * rs_cmp - multi-block reservation range compare
178  * @blk: absolute file system block number of the new reservation
179  * @len: number of blocks in the new reservation
180  * @rs: existing reservation to compare against
181  *
182  * returns: 1 if the block range starts after the end of the reservation
183  *         -1 if the block range ends before the start of the reservation
184  *          0 if the block range overlaps with the reservation
185  */
186 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
187 {
188 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
189 
190 	if (blk >= startblk + rs->rs_free)
191 		return 1;
192 	if (blk + len - 1 < startblk)
193 		return -1;
194 	return 0;
195 }
196 
197 /**
198  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
199  *       a block in a given allocation state.
200  * @buf: the buffer that holds the bitmaps
201  * @len: the length (in bytes) of the buffer
202  * @goal: start search at this block's bit-pair (within @buffer)
203  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
204  *
205  * Scope of @goal and returned block number is only within this bitmap buffer,
206  * not the entire rgrp or filesystem.  @buf will be offset from the actual
207  * beginning of a bitmap block buffer, skipping any header structures, but
208  * headers are always a multiple of 64 bits long so that the buffer is
209  * always aligned to a 64 bit boundary.
210  *
211  * The size of the buffer is in bytes, but it is assumed that it is
212  * always OK to read a complete multiple of 64 bits at the end
213  * of the block in case the end is not aligned to a natural boundary.
214  *
215  * Returns: the block number (bitmap buffer scope) that was found
216  */
217 
218 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
219 		       u32 goal, u8 state)
220 {
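    	/* Two bits per block: goal << 1 is the bit position of the goal
    	 * block, and goal >> 5 selects the u64 word (32 blocks) that
    	 * contains it. */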
221 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
222 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
223 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
224 	u64 tmp;
225 	u64 mask = 0x5555555555555555ULL;
226 	u32 bit;
227 
228 	/* Mask off bits we don't care about at the start of the search */
229 	mask <<= spoint;
230 	tmp = gfs2_bit_search(ptr, mask, state);
231 	ptr++;
232 	while (tmp == 0 && ptr < end) {
233 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
234 		ptr++;
235 	}
236 	/* Mask off any bits which are more than len bytes from the start */
237 	if (ptr == end && (len & (sizeof(u64) - 1)))
238 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
239 	/* Didn't find anything, so return */
240 	if (tmp == 0)
241 		return BFITNOENT;
242 	ptr--;
243 	bit = __ffs64(tmp);
244 	bit /= 2;	/* two bits per entry in the bitmap */
245 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
246 }
247 
248 /**
249  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
250  * @rbm: The rbm with rgd already set correctly
251  * @block: The block number (filesystem relative)
252  *
253  * This sets the bi and offset members of an rbm based on a
254  * resource group and a filesystem relative block number. The
255  * resource group must be set in the rbm on entry, the bi and
256  * offset members will be set by this function.
257  *
258  * Returns: 0 on success, or an error code
259  */
260 
261 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
262 {
263 	if (!rgrp_contains_block(rbm->rgd, block))
264 		return -E2BIG;
265 	rbm->bii = 0;
266 	rbm->offset = block - rbm->rgd->rd_data0;
267 	/* Check if the block is within the first bitmap block */
268 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
269 		return 0;
270 
271 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
272 	rbm->offset += (sizeof(struct gfs2_rgrp) -
273 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
274 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
275 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
276 	return 0;
277 }
278 
279 /**
280  * gfs2_rbm_incr - increment an rbm structure
281  * @rbm: The rbm with rgd already set correctly
282  *
283  * This function takes an existing rbm structure and increments it to the next
284  * viable block offset.
285  *
286  * Returns: If incrementing the offset would cause the rbm to go past the
287  *          end of the rgrp, true is returned, otherwise false.
288  *
289  */
290 
291 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
292 {
293 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
294 		rbm->offset++;
295 		return false;
296 	}
297 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
298 		return true;
299 
300 	rbm->offset = 0;
301 	rbm->bii++;
302 	return false;
303 }
304 
305 /**
306  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
307  * @rbm: Position to search (value/result)
308  * @n_unaligned: Number of unaligned blocks to check
309  * @len: Decremented for each block found (terminate on zero)
310  *
311  * Returns: true if a non-free block is encountered, @len reaches zero,
     *          or the end of the rgrp is reached
312  */
313 
314 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
315 {
316 	u32 n;
317 	u8 res;
318 
319 	for (n = 0; n < n_unaligned; n++) {
320 		res = gfs2_testbit(rbm, true);
321 		if (res != GFS2_BLKST_FREE)
322 			return true;
323 		(*len)--;
324 		if (*len == 0)
325 			return true;
326 		if (gfs2_rbm_incr(rbm))
327 			return true;
328 	}
329 
330 	return false;
331 }
332 
333 /**
334  * gfs2_free_extlen - Return extent length of free blocks
335  * @rrbm: Starting position
336  * @len: Max length to check
337  *
338  * Starting at the block specified by the rbm, see how many free blocks
339  * there are, not reading more than len blocks ahead. This can be done
340  * using memchr_inv when the blocks are byte aligned, but has to be done
341  * on a block-by-block basis in case of unaligned blocks. Also this
342  * function can cope with bitmap boundaries (although it must stop on
343  * a resource group boundary)
344  *
345  * Returns: Number of free blocks in the extent
346  */
347 
348 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
349 {
350 	struct gfs2_rbm rbm = *rrbm;
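    	/* GFS2_NBBY == 4 blocks per byte, so offset & 3 is the block's
    	 * position within its bitmap byte */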
351 	u32 n_unaligned = rbm.offset & 3;
352 	u32 size = len;
353 	u32 bytes;
354 	u32 chunk_size;
355 	u8 *ptr, *start, *end;
356 	u64 block;
357 	struct gfs2_bitmap *bi;
358 
359 	if (n_unaligned &&
360 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
361 		goto out;
362 
363 	n_unaligned = len & 3;
364 	/* Start is now byte aligned */
365 	while (len > 3) {
366 		bi = rbm_bi(&rbm);
367 		start = bi->bi_bh->b_data;
368 		if (bi->bi_clone)
369 			start = bi->bi_clone;
370 		start += bi->bi_offset;
371 		end = start + bi->bi_bytes;
372 		BUG_ON(rbm.offset & 3);
373 		start += (rbm.offset / GFS2_NBBY);
374 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
375 		ptr = memchr_inv(start, 0, bytes);
376 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
377 		chunk_size *= GFS2_NBBY;
378 		BUG_ON(len < chunk_size);
379 		len -= chunk_size;
380 		block = gfs2_rbm_to_block(&rbm);
381 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
382 			n_unaligned = 0;
383 			break;
384 		}
385 		if (ptr) {
386 			n_unaligned = 3;
387 			break;
388 		}
389 		n_unaligned = len & 3;
390 	}
391 
392 	/* Deal with any bits left over at the end */
393 	if (n_unaligned)
394 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
395 out:
396 	return size - len;
397 }
398 
399 /**
400  * gfs2_bitcount - count the number of bits in a certain state
401  * @rgd: the resource group descriptor
402  * @buffer: the buffer that holds the bitmaps
403  * @buflen: the length (in bytes) of the buffer
404  * @state: the state of the block we're looking for
405  *
406  * Returns: The number of bits
407  */
408 
409 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
410 			 unsigned int buflen, u8 state)
411 {
412 	const u8 *byte = buffer;
413 	const u8 *end = buffer + buflen;
414 	const u8 state1 = state << 2;
415 	const u8 state2 = state << 4;
416 	const u8 state3 = state << 6;
417 	u32 count = 0;
418 
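    	/* Test all four 2-bit block states packed into each byte */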
419 	for (; byte < end; byte++) {
420 		if (((*byte) & 0x03) == state)
421 			count++;
422 		if (((*byte) & 0x0C) == state1)
423 			count++;
424 		if (((*byte) & 0x30) == state2)
425 			count++;
426 		if (((*byte) & 0xC0) == state3)
427 			count++;
428 	}
429 
430 	return count;
431 }
432 
433 /**
434  * gfs2_rgrp_verify - Verify that a resource group is consistent
435  * @rgd: the rgrp
436  *
437  */
438 
439 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
440 {
441 	struct gfs2_sbd *sdp = rgd->rd_sbd;
442 	struct gfs2_bitmap *bi = NULL;
443 	u32 length = rgd->rd_length;
444 	u32 count[4], tmp;
445 	int buf, x;
446 
447 	memset(count, 0, 4 * sizeof(u32));
448 
449 	/* Count # blocks in each of 4 possible allocation states */
450 	for (buf = 0; buf < length; buf++) {
451 		bi = rgd->rd_bits + buf;
452 		for (x = 0; x < 4; x++)
453 			count[x] += gfs2_bitcount(rgd,
454 						  bi->bi_bh->b_data +
455 						  bi->bi_offset,
456 						  bi->bi_bytes, x);
457 	}
458 
459 	if (count[0] != rgd->rd_free) {
460 		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
461 			count[0], rgd->rd_free);
462 		gfs2_consist_rgrpd(rgd);
463 		return;
464 	}
465 
466 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
467 	if (count[1] != tmp) {
468 		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
469 			count[1], tmp);
470 		gfs2_consist_rgrpd(rgd);
471 		return;
472 	}
473 
474 	if (count[2] + count[3] != rgd->rd_dinodes) {
475 		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
476 			count[2] + count[3], rgd->rd_dinodes);
477 		gfs2_consist_rgrpd(rgd);
478 		return;
479 	}
480 }
481 
482 /**
483  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
484  * @sdp: The GFS2 superblock
485  * @blk: The data block number
486  * @exact: True if this needs to be an exact match
487  *
488  * The @exact argument should be set to true by most callers. The exception
489  * is when we need to match blocks which are not represented by the rgrp
490  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
491  * there for alignment purposes. Another way of looking at it is that @exact
492  * matches only valid data/metadata blocks, but with @exact false, it will
493  * match any block within the extent of the rgrp.
494  *
495  * Returns: The resource group, or NULL if not found
496  */
497 
498 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
499 {
500 	struct rb_node *n, *next;
501 	struct gfs2_rgrpd *cur;
502 
503 	spin_lock(&sdp->sd_rindex_spin);
504 	n = sdp->sd_rindex_tree.rb_node;
505 	while (n) {
506 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
507 		next = NULL;
508 		if (blk < cur->rd_addr)
509 			next = n->rb_left;
510 		else if (blk >= cur->rd_data0 + cur->rd_data)
511 			next = n->rb_right;
512 		if (next == NULL) {
513 			spin_unlock(&sdp->sd_rindex_spin);
514 			if (exact) {
515 				if (blk < cur->rd_addr)
516 					return NULL;
517 				if (blk >= cur->rd_data0 + cur->rd_data)
518 					return NULL;
519 			}
520 			return cur;
521 		}
522 		n = next;
523 	}
524 	spin_unlock(&sdp->sd_rindex_spin);
525 
526 	return NULL;
527 }
528 
529 /**
530  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
531  * @sdp: The GFS2 superblock
532  *
533  * Returns: The first rgrp in the filesystem
534  */
535 
536 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
537 {
538 	const struct rb_node *n;
539 	struct gfs2_rgrpd *rgd;
540 
541 	spin_lock(&sdp->sd_rindex_spin);
542 	n = rb_first(&sdp->sd_rindex_tree);
543 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
544 	spin_unlock(&sdp->sd_rindex_spin);
545 
546 	return rgd;
547 }
548 
549 /**
550  * gfs2_rgrpd_get_next - get the next RG
551  * @rgd: the resource group descriptor
552  *
553  * Returns: The next rgrp
554  */
555 
556 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
557 {
558 	struct gfs2_sbd *sdp = rgd->rd_sbd;
559 	const struct rb_node *n;
560 
561 	spin_lock(&sdp->sd_rindex_spin);
562 	n = rb_next(&rgd->rd_node);
563 	if (n == NULL)
564 		n = rb_first(&sdp->sd_rindex_tree);
565 
566 	if (unlikely(&rgd->rd_node == n)) {
567 		spin_unlock(&sdp->sd_rindex_spin);
568 		return NULL;
569 	}
570 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
571 	spin_unlock(&sdp->sd_rindex_spin);
572 	return rgd;
573 }
574 
575 void check_and_update_goal(struct gfs2_inode *ip)
576 {
577 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
578 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
579 		ip->i_goal = ip->i_no_addr;
580 }
581 
582 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
583 {
584 	int x;
585 
586 	for (x = 0; x < rgd->rd_length; x++) {
587 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
588 		kfree(bi->bi_clone);
589 		bi->bi_clone = NULL;
590 	}
591 }
592 
593 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
594 		    const char *fs_id_buf)
595 {
596 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
597 
598 	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
599 		       (unsigned long long)ip->i_no_addr,
600 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
601 		       rs->rs_rbm.offset, rs->rs_free);
602 }
603 
604 /**
605  * __rs_deltree - remove a multi-block reservation from the rgd tree
606  * @rs: The reservation to remove
607  *
608  */
609 static void __rs_deltree(struct gfs2_blkreserv *rs)
610 {
611 	struct gfs2_rgrpd *rgd;
612 
613 	if (!gfs2_rs_active(rs))
614 		return;
615 
616 	rgd = rs->rs_rbm.rgd;
617 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
618 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
619 	RB_CLEAR_NODE(&rs->rs_node);
620 
621 	if (rs->rs_free) {
622 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
623 				 rs->rs_free - 1;
624 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
625 		struct gfs2_bitmap *start, *last;
626 
627 		/* return reserved blocks to the rgrp */
628 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
629 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
630 		/* The rgrp extent failure point is likely not to increase;
631 		   it will only do so if the freed blocks are somehow
632 		   contiguous with a span of free blocks that follows. Still,
633 		   it will force the number to be recalculated later. */
634 		rgd->rd_extfail_pt += rs->rs_free;
635 		rs->rs_free = 0;
636 		if (gfs2_rbm_from_block(&last_rbm, last_block))
637 			return;
638 		start = rbm_bi(&rs->rs_rbm);
639 		last = rbm_bi(&last_rbm);
640 		do
641 			clear_bit(GBF_FULL, &start->bi_flags);
642 		while (start++ != last);
643 	}
644 }
645 
646 /**
647  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
648  * @rs: The reservation to remove
649  *
650  */
651 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
652 {
653 	struct gfs2_rgrpd *rgd;
654 
655 	rgd = rs->rs_rbm.rgd;
656 	if (rgd) {
657 		spin_lock(&rgd->rd_rsspin);
658 		__rs_deltree(rs);
659 		BUG_ON(rs->rs_free);
660 		spin_unlock(&rgd->rd_rsspin);
661 	}
662 }
663 
664 /**
665  * gfs2_rs_delete - delete a multi-block reservation
666  * @ip: The inode for this reservation
667  * @wcount: The inode's write count, or NULL
668  *
669  */
670 void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
671 {
672 	down_write(&ip->i_rw_mutex);
673 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
674 		gfs2_rs_deltree(&ip->i_res);
675 	up_write(&ip->i_rw_mutex);
676 }
677 
678 /**
679  * return_all_reservations - return all reserved blocks back to the rgrp.
680  * @rgd: the rgrp that needs its space back
681  *
682  * We previously reserved a bunch of blocks for allocation. Now we need to
683  * give them back. This leaves the reservation structures intact, but removes
684  * all of their corresponding "no-fly zones".
685  */
686 static void return_all_reservations(struct gfs2_rgrpd *rgd)
687 {
688 	struct rb_node *n;
689 	struct gfs2_blkreserv *rs;
690 
691 	spin_lock(&rgd->rd_rsspin);
692 	while ((n = rb_first(&rgd->rd_rstree))) {
693 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
694 		__rs_deltree(rs);
695 	}
696 	spin_unlock(&rgd->rd_rsspin);
697 }
698 
699 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
700 {
701 	struct rb_node *n;
702 	struct gfs2_rgrpd *rgd;
703 	struct gfs2_glock *gl;
704 
705 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
706 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
707 		gl = rgd->rd_gl;
708 
709 		rb_erase(n, &sdp->sd_rindex_tree);
710 
711 		if (gl) {
712 			if (gl->gl_state != LM_ST_UNLOCKED) {
713 				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
714 				flush_delayed_work(&gl->gl_work);
715 			}
716 			gfs2_rgrp_brelse(rgd);
717 			glock_clear_object(gl, rgd);
718 			gfs2_glock_put(gl);
719 		}
720 
721 		gfs2_free_clones(rgd);
722 		kfree(rgd->rd_bits);
723 		rgd->rd_bits = NULL;
724 		return_all_reservations(rgd);
725 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
726 	}
727 }
728 
729 /**
730  * compute_bitstructs - Compute the bitmap sizes
731  * @rgd: The resource group descriptor
732  *
733  * Calculates bitmap descriptors, one for each block that contains bitmap data
734  *
735  * Returns: errno
736  */
737 
738 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
739 {
740 	struct gfs2_sbd *sdp = rgd->rd_sbd;
741 	struct gfs2_bitmap *bi;
742 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
743 	u32 bytes_left, bytes;
744 	int x;
745 
746 	if (!length)
747 		return -EINVAL;
748 
749 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
750 	if (!rgd->rd_bits)
751 		return -ENOMEM;
752 
753 	bytes_left = rgd->rd_bitbytes;
754 
755 	for (x = 0; x < length; x++) {
756 		bi = rgd->rd_bits + x;
757 
758 		bi->bi_flags = 0;
759 		/* small rgrp; bitmap stored completely in header block */
760 		if (length == 1) {
761 			bytes = bytes_left;
762 			bi->bi_offset = sizeof(struct gfs2_rgrp);
763 			bi->bi_start = 0;
764 			bi->bi_bytes = bytes;
765 			bi->bi_blocks = bytes * GFS2_NBBY;
766 		/* header block */
767 		} else if (x == 0) {
768 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
769 			bi->bi_offset = sizeof(struct gfs2_rgrp);
770 			bi->bi_start = 0;
771 			bi->bi_bytes = bytes;
772 			bi->bi_blocks = bytes * GFS2_NBBY;
773 		/* last block */
774 		} else if (x + 1 == length) {
775 			bytes = bytes_left;
776 			bi->bi_offset = sizeof(struct gfs2_meta_header);
777 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
778 			bi->bi_bytes = bytes;
779 			bi->bi_blocks = bytes * GFS2_NBBY;
780 		/* other blocks */
781 		} else {
782 			bytes = sdp->sd_sb.sb_bsize -
783 				sizeof(struct gfs2_meta_header);
784 			bi->bi_offset = sizeof(struct gfs2_meta_header);
785 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
786 			bi->bi_bytes = bytes;
787 			bi->bi_blocks = bytes * GFS2_NBBY;
788 		}
789 
790 		bytes_left -= bytes;
791 	}
792 
793 	if (bytes_left) {
794 		gfs2_consist_rgrpd(rgd);
795 		return -EIO;
796 	}
797 	bi = rgd->rd_bits + (length - 1);
798 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
799 		gfs2_lm(sdp,
800 			"ri_addr = %llu\n"
801 			"ri_length = %u\n"
802 			"ri_data0 = %llu\n"
803 			"ri_data = %u\n"
804 			"ri_bitbytes = %u\n"
805 			"start=%u len=%u offset=%u\n",
806 			(unsigned long long)rgd->rd_addr,
807 			rgd->rd_length,
808 			(unsigned long long)rgd->rd_data0,
809 			rgd->rd_data,
810 			rgd->rd_bitbytes,
811 			bi->bi_start, bi->bi_bytes, bi->bi_offset);
812 		gfs2_consist_rgrpd(rgd);
813 		return -EIO;
814 	}
815 
816 	return 0;
817 }
818 
819 /**
820  * gfs2_ri_total - Total up the file system space, according to the rindex.
821  * @sdp: the filesystem
822  *
823  */
824 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
825 {
826 	u64 total_data = 0;
827 	struct inode *inode = sdp->sd_rindex;
828 	struct gfs2_inode *ip = GFS2_I(inode);
829 	char buf[sizeof(struct gfs2_rindex)];
830 	int error, rgrps;
831 
832 	for (rgrps = 0;; rgrps++) {
833 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
834 
835 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
836 			break;
837 		error = gfs2_internal_read(ip, buf, &pos,
838 					   sizeof(struct gfs2_rindex));
839 		if (error != sizeof(struct gfs2_rindex))
840 			break;
841 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
842 	}
843 	return total_data;
844 }
845 
846 static int rgd_insert(struct gfs2_rgrpd *rgd)
847 {
848 	struct gfs2_sbd *sdp = rgd->rd_sbd;
849 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
850 
851 	/* Figure out where to put new node */
852 	while (*newn) {
853 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
854 						  rd_node);
855 
856 		parent = *newn;
857 		if (rgd->rd_addr < cur->rd_addr)
858 			newn = &((*newn)->rb_left);
859 		else if (rgd->rd_addr > cur->rd_addr)
860 			newn = &((*newn)->rb_right);
861 		else
862 			return -EEXIST;
863 	}
864 
865 	rb_link_node(&rgd->rd_node, parent, newn);
866 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
867 	sdp->sd_rgrps++;
868 	return 0;
869 }
870 
871 /**
872  * read_rindex_entry - Pull in a new resource index entry from the disk
873  * @ip: Pointer to the rindex inode
874  *
875  * Returns: 0 on success, > 0 on EOF, error code otherwise
876  */
877 
878 static int read_rindex_entry(struct gfs2_inode *ip)
879 {
880 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
881 	const unsigned bsize = sdp->sd_sb.sb_bsize;
882 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
883 	struct gfs2_rindex buf;
884 	int error;
885 	struct gfs2_rgrpd *rgd;
886 
887 	if (pos >= i_size_read(&ip->i_inode))
888 		return 1;
889 
890 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
891 				   sizeof(struct gfs2_rindex));
892 
893 	if (error != sizeof(struct gfs2_rindex))
894 		return (error == 0) ? 1 : error;
895 
896 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
897 	error = -ENOMEM;
898 	if (!rgd)
899 		return error;
900 
901 	rgd->rd_sbd = sdp;
902 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
903 	rgd->rd_length = be32_to_cpu(buf.ri_length);
904 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
905 	rgd->rd_data = be32_to_cpu(buf.ri_data);
906 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
907 	spin_lock_init(&rgd->rd_rsspin);
908 
909 	error = compute_bitstructs(rgd);
910 	if (error)
911 		goto fail;
912 
913 	error = gfs2_glock_get(sdp, rgd->rd_addr,
914 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
915 	if (error)
916 		goto fail;
917 
918 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
919 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
920 	if (rgd->rd_data > sdp->sd_max_rg_data)
921 		sdp->sd_max_rg_data = rgd->rd_data;
922 	spin_lock(&sdp->sd_rindex_spin);
923 	error = rgd_insert(rgd);
924 	spin_unlock(&sdp->sd_rindex_spin);
925 	if (!error) {
926 		glock_set_object(rgd->rd_gl, rgd);
927 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
928 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
929 						    rgd->rd_length) * bsize) - 1;
930 		return 0;
931 	}
932 
933 	error = 0; /* someone else read in the rgrp; free it and ignore it */
934 	gfs2_glock_put(rgd->rd_gl);
935 
936 fail:
937 	kfree(rgd->rd_bits);
938 	rgd->rd_bits = NULL;
939 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
940 	return error;
941 }
942 
943 /**
944  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
945  * @sdp: the GFS2 superblock
946  *
947  * The purpose of this function is to select a subset of the resource groups
948  * and mark them as PREFERRED. We do it in such a way that each node prefers
949  * to use a unique set of rgrps to minimize glock contention.
950  */
951 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
952 {
953 	struct gfs2_rgrpd *rgd, *first;
954 	int i;
955 
956 	/* Skip an initial number of rgrps, based on this node's journal ID.
957 	   That should start each node out on its own set. */
958 	rgd = gfs2_rgrpd_get_first(sdp);
959 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
960 		rgd = gfs2_rgrpd_get_next(rgd);
961 	first = rgd;
962 
963 	do {
964 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
965 		for (i = 0; i < sdp->sd_journals; i++) {
966 			rgd = gfs2_rgrpd_get_next(rgd);
967 			if (!rgd || rgd == first)
968 				break;
969 		}
970 	} while (rgd && rgd != first);
971 }
972 
973 /**
974  * gfs2_ri_update - Pull in a new resource index from the disk
975  * @ip: pointer to the rindex inode
976  *
977  * Returns: 0 on successful update, error code otherwise
978  */
979 
980 static int gfs2_ri_update(struct gfs2_inode *ip)
981 {
982 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
983 	int error;
984 
985 	do {
986 		error = read_rindex_entry(ip);
987 	} while (error == 0);
988 
989 	if (error < 0)
990 		return error;
991 
992 	set_rgrp_preferences(sdp);
993 
994 	sdp->sd_rindex_uptodate = 1;
995 	return 0;
996 }
997 
998 /**
999  * gfs2_rindex_update - Update the rindex if required
1000  * @sdp: The GFS2 superblock
1001  *
1002  * We grab a lock on the rindex inode to make sure that it doesn't
1003  * change whilst we are performing an operation. We keep this lock
1004  * for quite long periods of time compared to other locks. This
1005  * doesn't matter, since it is shared and it is very, very rarely
1006  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1007  *
1008  * This makes sure that we're using the latest copy of the resource index
1009  * special file, which might have been updated if someone expanded the
1010  * filesystem (via gfs2_grow utility), which adds new resource groups.
1011  *
1012  * Returns: 0 on success, error code otherwise
1013  */
1014 
1015 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1016 {
1017 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1018 	struct gfs2_glock *gl = ip->i_gl;
1019 	struct gfs2_holder ri_gh;
1020 	int error = 0;
1021 	int unlock_required = 0;
1022 
1023 	/* Read new copy from disk if we don't have the latest */
1024 	if (!sdp->sd_rindex_uptodate) {
1025 		if (!gfs2_glock_is_locked_by_me(gl)) {
1026 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1027 			if (error)
1028 				return error;
1029 			unlock_required = 1;
1030 		}
1031 		if (!sdp->sd_rindex_uptodate)
1032 			error = gfs2_ri_update(ip);
1033 		if (unlock_required)
1034 			gfs2_glock_dq_uninit(&ri_gh);
1035 	}
1036 
1037 	return error;
1038 }
1039 
1040 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1041 {
1042 	const struct gfs2_rgrp *str = buf;
1043 	u32 rg_flags;
1044 
1045 	rg_flags = be32_to_cpu(str->rg_flags);
1046 	rg_flags &= ~GFS2_RDF_MASK;
1047 	rgd->rd_flags &= GFS2_RDF_MASK;
1048 	rgd->rd_flags |= rg_flags;
1049 	rgd->rd_free = be32_to_cpu(str->rg_free);
1050 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1051 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1052 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1053 }
1054 
1055 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1056 {
1057 	const struct gfs2_rgrp *str = buf;
1058 
1059 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1060 	rgl->rl_flags = str->rg_flags;
1061 	rgl->rl_free = str->rg_free;
1062 	rgl->rl_dinodes = str->rg_dinodes;
1063 	rgl->rl_igeneration = str->rg_igeneration;
1064 	rgl->__pad = 0UL;
1065 }
1066 
1067 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1068 {
1069 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1070 	struct gfs2_rgrp *str = buf;
1071 	u32 crc;
1072 
1073 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1074 	str->rg_free = cpu_to_be32(rgd->rd_free);
1075 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1076 	if (next == NULL)
1077 		str->rg_skip = 0;
1078 	else if (next->rd_addr > rgd->rd_addr)
1079 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1080 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1081 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1082 	str->rg_data = cpu_to_be32(rgd->rd_data);
1083 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1084 	str->rg_crc = 0;
1085 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1086 	str->rg_crc = cpu_to_be32(crc);
1087 
1088 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1089 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1090 }
1091 
1092 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1093 {
1094 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1095 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1096 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1097 	int valid = 1;
1098 
1099 	if (rgl->rl_flags != str->rg_flags) {
1100 		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1101 			(unsigned long long)rgd->rd_addr,
1102 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1103 		valid = 0;
1104 	}
1105 	if (rgl->rl_free != str->rg_free) {
1106 		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1107 			(unsigned long long)rgd->rd_addr,
1108 			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1109 		valid = 0;
1110 	}
1111 	if (rgl->rl_dinodes != str->rg_dinodes) {
1112 		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1113 			(unsigned long long)rgd->rd_addr,
1114 			be32_to_cpu(rgl->rl_dinodes),
1115 			be32_to_cpu(str->rg_dinodes));
1116 		valid = 0;
1117 	}
1118 	if (rgl->rl_igeneration != str->rg_igeneration) {
1119 		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1120 			(unsigned long long)rgd->rd_addr,
1121 			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1122 			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1123 		valid = 0;
1124 	}
1125 	return valid;
1126 }
1127 
1128 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1129 {
1130 	struct gfs2_bitmap *bi;
1131 	const u32 length = rgd->rd_length;
1132 	const u8 *buffer = NULL;
1133 	u32 i, goal, count = 0;
1134 
1135 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1136 		goal = 0;
1137 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1138 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1139 		while (goal < bi->bi_blocks) {
1140 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1141 					   GFS2_BLKST_UNLINKED);
1142 			if (goal == BFITNOENT)
1143 				break;
1144 			count++;
1145 			goal++;
1146 		}
1147 	}
1148 
1149 	return count;
1150 }
1151 
1153 /**
1154  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1155  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1156  *
1157  * Read in all of a Resource Group's header and bitmap blocks.
1158  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1159  *
1160  * Returns: errno
1161  */
1162 
1163 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1164 {
1165 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1166 	struct gfs2_glock *gl = rgd->rd_gl;
1167 	unsigned int length = rgd->rd_length;
1168 	struct gfs2_bitmap *bi;
1169 	unsigned int x, y;
1170 	int error;
1171 
1172 	if (rgd->rd_bits[0].bi_bh != NULL)
1173 		return 0;
1174 
1175 	for (x = 0; x < length; x++) {
1176 		bi = rgd->rd_bits + x;
1177 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1178 		if (error)
1179 			goto fail;
1180 	}
1181 
1182 	for (y = length; y--;) {
1183 		bi = rgd->rd_bits + y;
1184 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1185 		if (error)
1186 			goto fail;
1187 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1188 					      GFS2_METATYPE_RG)) {
1189 			error = -EIO;
1190 			goto fail;
1191 		}
1192 	}
1193 
1194 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1195 		for (x = 0; x < length; x++)
1196 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1197 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1198 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1199 		rgd->rd_free_clone = rgd->rd_free;
1200 		/* max out the rgrp allocation failure point */
1201 		rgd->rd_extfail_pt = rgd->rd_free;
1202 	}
1203 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1204 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1205 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1206 				     rgd->rd_bits[0].bi_bh->b_data);
1207 	} else if (sdp->sd_args.ar_rgrplvb) {
1209 		if (!gfs2_rgrp_lvb_valid(rgd)) {
1210 			gfs2_consist_rgrpd(rgd);
1211 			error = -EIO;
1212 			goto fail;
1213 		}
1214 		if (rgd->rd_rgl->rl_unlinked == 0)
1215 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1216 	}
1217 	return 0;
1218 
1219 fail:
1220 	while (x--) {
1221 		bi = rgd->rd_bits + x;
1222 		brelse(bi->bi_bh);
1223 		bi->bi_bh = NULL;
1224 		gfs2_assert_warn(sdp, !bi->bi_clone);
1225 	}
1226 
1227 	return error;
1228 }
1229 
1230 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1231 {
1232 	u32 rl_flags;
1233 
1234 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1235 		return 0;
1236 
1237 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1238 		return gfs2_rgrp_bh_get(rgd);
1239 
1240 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1241 	rl_flags &= ~GFS2_RDF_MASK;
1242 	rgd->rd_flags &= GFS2_RDF_MASK;
1243 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1244 	if (rgd->rd_rgl->rl_unlinked == 0)
1245 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1246 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1247 	rgd->rd_free_clone = rgd->rd_free;
1248 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1249 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1250 	return 0;
1251 }
1252 
1253 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1254 {
1255 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1256 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1257 
1258 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1259 		return 0;
1260 	return gfs2_rgrp_bh_get(rgd);
1261 }
1262 
1263 /**
1264  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1265  * @rgd: The resource group
1266  *
1267  */
1268 
1269 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1270 {
1271 	int x, length = rgd->rd_length;
1272 
1273 	for (x = 0; x < length; x++) {
1274 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1275 		if (bi->bi_bh) {
1276 			brelse(bi->bi_bh);
1277 			bi->bi_bh = NULL;
1278 		}
1279 	}
1280 }
1281 
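     /**
      * gfs2_rgrp_send_discards - issue discards for blocks that became free
      * @sdp: the filesystem
      * @offset: filesystem block address to which bitmap positions are
      *          relative (the rgrp's rd_data0)
      * @bh: optional copy of the bitmap to compare against (see the comment
      *      in the loop below); if NULL, every free block is considered
      * @bi: the bitmap to scan
      * @minlen: the minimum extent length (in blocks) worth discarding
      * @ptrimmed: if not NULL, receives the number of blocks discarded
      *
      * Returns: 0 on success, or -EIO if a discard request fails (in which
      *          case discards are also disabled for this filesystem)
      */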
1282 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1283 			     struct buffer_head *bh,
1284 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1285 {
1286 	struct super_block *sb = sdp->sd_vfs;
1287 	u64 blk;
1288 	sector_t start = 0;
1289 	sector_t nr_blks = 0;
1290 	int rv;
1291 	unsigned int x;
1292 	u32 trimmed = 0;
1293 	u8 diff;
1294 
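     	/*
     	 * (x | x >> 1) has the low bit of each 2-bit pair set iff that
     	 * block is in use, so diff (masked to the even bit positions with
     	 * 0x55) flags blocks that are free in @bh but still in use in the
     	 * clone, i.e. newly freed blocks; without @bh it flags every free
     	 * block.
     	 */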
1295 	for (x = 0; x < bi->bi_bytes; x++) {
1296 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1297 		clone += bi->bi_offset;
1298 		clone += x;
1299 		if (bh) {
1300 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1301 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1302 		} else {
1303 			diff = ~(*clone | (*clone >> 1));
1304 		}
1305 		diff &= 0x55;
1306 		if (diff == 0)
1307 			continue;
1308 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1309 		while (diff) {
1310 			if (diff & 1) {
1311 				if (nr_blks == 0)
1312 					goto start_new_extent;
1313 				if ((start + nr_blks) != blk) {
1314 					if (nr_blks >= minlen) {
1315 						rv = sb_issue_discard(sb,
1316 							start, nr_blks,
1317 							GFP_NOFS, 0);
1318 						if (rv)
1319 							goto fail;
1320 						trimmed += nr_blks;
1321 					}
1322 					nr_blks = 0;
1323 start_new_extent:
1324 					start = blk;
1325 				}
1326 				nr_blks++;
1327 			}
1328 			diff >>= 2;
1329 			blk++;
1330 		}
1331 	}
1332 	if (nr_blks >= minlen) {
1333 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1334 		if (rv)
1335 			goto fail;
1336 		trimmed += nr_blks;
1337 	}
1338 	if (ptrimmed)
1339 		*ptrimmed = trimmed;
1340 	return 0;
1341 
1342 fail:
1343 	if (sdp->sd_args.ar_discard)
1344 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1345 	sdp->sd_args.ar_discard = 0;
1346 	return -EIO;
1347 }
1348 
1349 /**
1350  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1351  * @filp: Any file on the filesystem
1352  * @argp: Pointer to the arguments (also used to pass result)
1353  *
1354  * Returns: 0 on success, otherwise error code
1355  */
1356 
1357 int gfs2_fitrim(struct file *filp, void __user *argp)
1358 {
1359 	struct inode *inode = file_inode(filp);
1360 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1361 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1362 	struct buffer_head *bh;
1363 	struct gfs2_rgrpd *rgd;
1364 	struct gfs2_rgrpd *rgd_end;
1365 	struct gfs2_holder gh;
1366 	struct fstrim_range r;
1367 	int ret = 0;
1368 	u64 amt;
1369 	u64 trimmed = 0;
1370 	u64 start, end, minlen;
1371 	unsigned int x;
1372 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1373 
1374 	if (!capable(CAP_SYS_ADMIN))
1375 		return -EPERM;
1376 
1377 	if (!blk_queue_discard(q))
1378 		return -EOPNOTSUPP;
1379 
1380 	if (copy_from_user(&r, argp, sizeof(r)))
1381 		return -EFAULT;
1382 
1383 	ret = gfs2_rindex_update(sdp);
1384 	if (ret)
1385 		return ret;
1386 
1387 	start = r.start >> bs_shift;
1388 	end = start + (r.len >> bs_shift);
1389 	minlen = max_t(u64, r.minlen,
1390 		       q->limits.discard_granularity) >> bs_shift;
1391 
1392 	if (end <= start || minlen > sdp->sd_max_rg_data)
1393 		return -EINVAL;
1394 
1395 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1396 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1397 
1398 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1399 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1400 		return -EINVAL; /* start is beyond the end of the fs */
1401 
1402 	while (1) {
1403 
1404 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1405 		if (ret)
1406 			goto out;
1407 
1408 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1409 			/* Trim each bitmap in the rgrp */
1410 			for (x = 0; x < rgd->rd_length; x++) {
1411 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1412 				ret = gfs2_rgrp_send_discards(sdp,
1413 						rgd->rd_data0, NULL, bi, minlen,
1414 						&amt);
1415 				if (ret) {
1416 					gfs2_glock_dq_uninit(&gh);
1417 					goto out;
1418 				}
1419 				trimmed += amt;
1420 			}
1421 
1422 			/* Mark rgrp as having been trimmed */
1423 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1424 			if (ret == 0) {
1425 				bh = rgd->rd_bits[0].bi_bh;
1426 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1427 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1428 				gfs2_rgrp_out(rgd, bh->b_data);
1429 				gfs2_trans_end(sdp);
1430 			}
1431 		}
1432 		gfs2_glock_dq_uninit(&gh);
1433 
1434 		if (rgd == rgd_end)
1435 			break;
1436 
1437 		rgd = gfs2_rgrpd_get_next(rgd);
1438 	}
1439 
1440 out:
1441 	r.len = trimmed << bs_shift;
1442 	if (copy_to_user(argp, &r, sizeof(r)))
1443 		return -EFAULT;
1444 
1445 	return ret;
1446 }
1447 
1448 /**
1449  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1450  * @ip: the inode structure
1451  *
1452  */
1453 static void rs_insert(struct gfs2_inode *ip)
1454 {
1455 	struct rb_node **newn, *parent = NULL;
1456 	int rc;
1457 	struct gfs2_blkreserv *rs = &ip->i_res;
1458 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1459 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1460 
1461 	BUG_ON(gfs2_rs_active(rs));
1462 
1463 	spin_lock(&rgd->rd_rsspin);
1464 	newn = &rgd->rd_rstree.rb_node;
1465 	while (*newn) {
1466 		struct gfs2_blkreserv *cur =
1467 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1468 
1469 		parent = *newn;
1470 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1471 		if (rc > 0)
1472 			newn = &((*newn)->rb_right);
1473 		else if (rc < 0)
1474 			newn = &((*newn)->rb_left);
1475 		else {
1476 			spin_unlock(&rgd->rd_rsspin);
1477 			WARN_ON(1);
1478 			return;
1479 		}
1480 	}
1481 
1482 	rb_link_node(&rs->rs_node, parent, newn);
1483 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1484 
1485 	/* Do our rgrp accounting for the reservation */
1486 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1487 	spin_unlock(&rgd->rd_rsspin);
1488 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1489 }
1490 
1491 /**
1492  * rgd_free - return the number of free blocks we can allocate.
1493  * @rgd: the resource group
      * @rs: our own block reservation, whose blocks are not counted as
      *      reserved (we can allocate from them ourselves)
1494  *
1495  * This function returns the number of free blocks for an rgrp.
1496  * That's the clone-free blocks (blocks that are free, not including those
1497  * still being used for unlinked files that haven't been deleted).
1498  *
1499  * It also subtracts any blocks reserved by someone else, but not the
1500  * free blocks that are still part of our own reservation, because we
1501  * can (and will) allocate those ourselves.
1502  */
1503 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1504 {
1505 	u32 tot_reserved, tot_free;
1506 
1507 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1508 		return 0;
1509 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1510 
1511 	if (rgd->rd_free_clone < tot_reserved)
1512 		tot_reserved = 0;
1513 
1514 	tot_free = rgd->rd_free_clone - tot_reserved;
1515 
1516 	return tot_free;
1517 }
1518 
1519 /**
1520  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1521  * @rgd: the resource group descriptor
1522  * @ip: pointer to the inode for which we're reserving blocks
1523  * @ap: the allocation parameters
1524  *
1525  */
1526 
1527 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1528 			   const struct gfs2_alloc_parms *ap)
1529 {
1530 	struct gfs2_rbm rbm = { .rgd = rgd, };
1531 	u64 goal;
1532 	struct gfs2_blkreserv *rs = &ip->i_res;
1533 	u32 extlen;
1534 	u32 free_blocks = rgd_free(rgd, rs);
1535 	int ret;
1536 	struct inode *inode = &ip->i_inode;
1537 
1538 	if (S_ISDIR(inode->i_mode))
1539 		extlen = 1;
1540 	else {
1541 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1542 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1543 	}
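     	/* Give up if the rgrp is over-reserved or lacks room for the
     	 * reservation we want */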
1544 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1545 		return;
1546 
1547 	/* Find bitmap block that contains bits for goal block */
1548 	if (rgrp_contains_block(rgd, ip->i_goal))
1549 		goal = ip->i_goal;
1550 	else
1551 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1552 
1553 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1554 		return;
1555 
1556 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1557 	if (ret == 0) {
1558 		rs->rs_rbm = rbm;
1559 		rs->rs_free = extlen;
1560 		rs_insert(ip);
1561 	} else {
1562 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1563 			rgd->rd_last_alloc = 0;
1564 	}
1565 }
1566 
1567 /**
1568  * gfs2_next_unreserved_block - Return next block that is not reserved
1569  * @rgd: The resource group
1570  * @block: The starting block
1571  * @length: The required length
1572  * @ip: Ignore any reservations for this inode
1573  *
1574  * If the block does not appear in any reservation, then return the
1575  * block number unchanged. If it does appear in the reservation, then
1576  * keep looking through the tree of reservations in order to find the
1577  * first block number which is not reserved.
1578  */
1579 
1580 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1581 				      u32 length,
1582 				      const struct gfs2_inode *ip)
1583 {
1584 	struct gfs2_blkreserv *rs;
1585 	struct rb_node *n;
1586 	int rc;
1587 
1588 	spin_lock(&rgd->rd_rsspin);
1589 	n = rgd->rd_rstree.rb_node;
1590 	while (n) {
1591 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1592 		rc = rs_cmp(block, length, rs);
1593 		if (rc < 0)
1594 			n = n->rb_left;
1595 		else if (rc > 0)
1596 			n = n->rb_right;
1597 		else
1598 			break;
1599 	}
1600 
1601 	if (n) {
1602 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1603 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1604 			n = n->rb_right;
1605 			if (n == NULL)
1606 				break;
1607 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1608 		}
1609 	}
1610 
1611 	spin_unlock(&rgd->rd_rsspin);
1612 	return block;
1613 }
1614 
1615 /**
1616  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1617  * @rbm: The current position in the resource group
1618  * @ip: The inode for which we are searching for blocks
1619  * @minext: The minimum extent length
1620  * @maxext: A pointer to the maximum extent structure
1621  *
1622  * This checks the current position in the rgrp to see whether there is
1623  * a reservation covering this block. If not then this function is a
1624  * no-op. If there is, then the position is moved to the end of the
1625  * contiguous reservation(s) so that we are pointing at the first
1626  * non-reserved block.
1627  *
1628  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1629  */
1630 
1631 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1632 					     const struct gfs2_inode *ip,
1633 					     u32 minext,
1634 					     struct gfs2_extent *maxext)
1635 {
1636 	u64 block = gfs2_rbm_to_block(rbm);
1637 	u32 extlen = 1;
1638 	u64 nblock;
1639 	int ret;
1640 
1641 	/*
1642 	 * If we have a minimum extent length, then skip over any extent
1643 	 * which is less than the min extent length in size.
1644 	 */
1645 	if (minext) {
1646 		extlen = gfs2_free_extlen(rbm, minext);
1647 		if (extlen <= maxext->len)
1648 			goto fail;
1649 	}
1650 
1651 	/*
1652 	 * Check the extent which has been found against the reservations
1653 	 * and skip if parts of it are already reserved
1654 	 */
1655 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1656 	if (nblock == block) {
1657 		if (!minext || extlen >= minext)
1658 			return 0;
1659 
1660 		if (extlen > maxext->len) {
1661 			maxext->len = extlen;
1662 			maxext->rbm = *rbm;
1663 		}
1664 fail:
1665 		nblock = block + extlen;
1666 	}
1667 	ret = gfs2_rbm_from_block(rbm, nblock);
1668 	if (ret < 0)
1669 		return ret;
1670 	return 1;
1671 }
1672 
1673 /**
1674  * gfs2_rbm_find - Look for blocks of a particular state
1675  * @rbm: Value/result starting position and final position
1676  * @state: The state which we want to find
1677  * @minext: Pointer to the requested extent length (NULL for a single block)
1678  *          This is updated to be the actual reservation size.
1679  * @ip: If set, check for reservations
1680  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1681  *          around until we've reached the starting point.
1682  *
1683  * Side effects:
1684  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1685  *   has no free blocks in it.
1686  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1687  *   has come up short on a free block search.
1688  *
1689  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1690  */
1691 
1692 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1693 			 const struct gfs2_inode *ip, bool nowrap)
1694 {
1695 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1696 	struct buffer_head *bh;
1697 	int last_bii;
1698 	u32 offset;
1699 	u8 *buffer;
1700 	bool wrapped = false;
1701 	int ret;
1702 	struct gfs2_bitmap *bi;
1703 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1704 
1705 	/*
1706 	 * Determine the last bitmap to search.  If we're not starting at the
1707 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1708 	 * the entire resource group.
1709 	 */
1710 	last_bii = rbm->bii - (rbm->offset == 0);
1711 
1712 	while (1) {
1713 		bi = rbm_bi(rbm);
1714 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1715 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1716 		    (state == GFS2_BLKST_FREE))
1717 			goto next_bitmap;
1718 
1719 		bh = bi->bi_bh;
1720 		buffer = bh->b_data + bi->bi_offset;
1721 		WARN_ON(!buffer_uptodate(bh));
1722 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1723 			buffer = bi->bi_clone + bi->bi_offset;
1724 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1725 		if (offset == BFITNOENT) {
1726 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1727 				set_bit(GBF_FULL, &bi->bi_flags);
1728 			goto next_bitmap;
1729 		}
1730 		rbm->offset = offset;
1731 		if (ip == NULL)
1732 			return 0;
1733 
1734 		ret = gfs2_reservation_check_and_update(rbm, ip,
1735 							minext ? *minext : 0,
1736 							&maxext);
1737 		if (ret == 0)
1738 			return 0;
1739 		if (ret > 0)
1740 			goto next_iter;
1741 		if (ret == -E2BIG) {
1742 			rbm->bii = 0;
1743 			rbm->offset = 0;
1744 			goto res_covered_end_of_rgrp;
1745 		}
1746 		return ret;
1747 
1748 next_bitmap:	/* Find next bitmap in the rgrp */
1749 		rbm->offset = 0;
1750 		rbm->bii++;
1751 		if (rbm->bii == rbm->rgd->rd_length)
1752 			rbm->bii = 0;
1753 res_covered_end_of_rgrp:
1754 		if (rbm->bii == 0) {
1755 			if (wrapped)
1756 				break;
1757 			wrapped = true;
1758 			if (nowrap)
1759 				break;
1760 		}
1761 next_iter:
1762 		/* Have we scanned the entire resource group? */
1763 		if (wrapped && rbm->bii > last_bii)
1764 			break;
1765 	}
1766 
1767 	if (minext == NULL || state != GFS2_BLKST_FREE)
1768 		return -ENOSPC;
1769 
1770 	/* If the extent was too small, and it's smaller than the smallest
1771 	   to have failed before, remember for future reference that it's
1772 	   useless to search this rgrp again for this amount or more. */
1773 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1774 	    *minext < rbm->rgd->rd_extfail_pt)
1775 		rbm->rgd->rd_extfail_pt = *minext;
1776 
1777 	/* If the maximum extent we found is big enough to fulfill the
1778 	   minimum requirements, use it anyway. */
1779 	if (maxext.len) {
1780 		*rbm = maxext.rbm;
1781 		*minext = maxext.len;
1782 		return 0;
1783 	}
1784 
1785 	return -ENOSPC;
1786 }
1787 
1788 /**
1789  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1790  * @rgd: The rgrp
1791  * @last_unlinked: block address of the last dinode we unlinked
1792  * @skip: block address we should explicitly not unlink
1793  *
1794  * Any qualifying inodes found are handed to the delete workqueue; the
1795  * function itself returns nothing.
1796  */
1797 
1798 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1799 {
1800 	u64 block;
1801 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1802 	struct gfs2_glock *gl;
1803 	struct gfs2_inode *ip;
1804 	int error;
1805 	int found = 0;
1806 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1807 
1808 	while (1) {
1809 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1810 				      true);
1811 		if (error == -ENOSPC)
1812 			break;
1813 		if (WARN_ON_ONCE(error))
1814 			break;
1815 
1816 		block = gfs2_rbm_to_block(&rbm);
1817 		if (gfs2_rbm_from_block(&rbm, block + 1))
1818 			break;
1819 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1820 			continue;
1821 		if (block == skip)
1822 			continue;
1823 		*last_unlinked = block;
1824 
1825 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1826 		if (error)
1827 			continue;
1828 
1829 		/* If the inode is already in cache, we can ignore it here
1830 		 * because the existing inode disposal code will deal with
1831 		 * it when all refs have gone away. Accessing gl_object like
1832 		 * this is not safe in general. Here it is ok because we do
1833 		 * not dereference the pointer, and we only need an approx
1834 		 * answer to whether it is NULL or not.
1835 		 */
1836 		ip = gl->gl_object;
1837 
1838 		if (ip || !gfs2_queue_delete_work(gl, 0))
1839 			gfs2_glock_put(gl);
1840 		else
1841 			found++;
1842 
1843 		/* Limit reclaim to sensible number of tasks */
1844 		if (found > NR_CPUS)
1845 			return;
1846 	}
1847 
1848 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1849 	return;
1850 }
1851 
1852 /**
1853  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1854  * @rgd: The rgrp in question
1855  * @loops: An indication of how picky we can be (0=very, 1=less so)
1856  *
1857  * This function uses the recently added glock statistics in order to
1858  * figure out whether a particular resource group is suffering from
1859  * contention from multiple nodes. This is done purely on the basis
1860  * of timings, since this is the only data we have to work with and
1861  * our aim here is to reject a resource group which is highly contended
1862  * but (importantly) not to do this too often, in order to ensure that
1863  * we do not end up introducing fragmentation by changing resource
1864  * groups when not actually required.
1865  *
1866  * The calculation is fairly simple: we want to know whether the SRTTB
1867  * (i.e. smoothed round trip time for blocking operations) to acquire
1868  * the lock for this rgrp's glock is significantly greater than the
1869  * time taken for resource groups on average. We introduce a margin in
1870  * the form of the variable @var which is computed as the sum of the two
1871  * respective variances, and multiplied by a factor depending on @loops
1872  * and whether we have a lot of data to base the decision on. This is
1873  * then tested against the square difference of the means in order to
1874  * decide whether the result is statistically significant or not.
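 *
 * For illustration only, with made-up numbers in the statistics'
 * internal units: if the average SRTTB across rgrps is 100 and this
 * rgrp's SRTTB is 250, then srttb_diff = -150 and sqr_diff = 22500;
 * with a summed variance of 2000, @var becomes 4000 (or 8000/16000
 * with the extra factors), so sqr_diff > var and the rgrp is deemed
 * congested.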
1875  *
1876  * Returns: A boolean verdict on the congestion status
1877  */
1878 
1879 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1880 {
1881 	const struct gfs2_glock *gl = rgd->rd_gl;
1882 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1883 	struct gfs2_lkstats *st;
1884 	u64 r_dcount, l_dcount;
1885 	u64 l_srttb, a_srttb = 0;
1886 	s64 srttb_diff;
1887 	u64 sqr_diff;
1888 	u64 var;
1889 	int cpu, nonzero = 0;
1890 
1891 	preempt_disable();
1892 	for_each_present_cpu(cpu) {
1893 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1894 		if (st->stats[GFS2_LKS_SRTTB]) {
1895 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1896 			nonzero++;
1897 		}
1898 	}
1899 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1900 	if (nonzero)
1901 		do_div(a_srttb, nonzero);
1902 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1903 	var = st->stats[GFS2_LKS_SRTTVARB] +
1904 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1905 	preempt_enable();
1906 
1907 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1908 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1909 
1910 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1911 		return false;
1912 
1913 	srttb_diff = a_srttb - l_srttb;
1914 	sqr_diff = srttb_diff * srttb_diff;
1915 
1916 	var *= 2;
1917 	if (l_dcount < 8 || r_dcount < 8)
1918 		var *= 2;
1919 	if (loops == 1)
1920 		var *= 2;
1921 
1922 	return ((srttb_diff < 0) && (sqr_diff > var));
1923 }
1924 
1925 /**
1926  * gfs2_rgrp_used_recently - check when the rgrp glock was last used
1927  * @rs: The block reservation with the rgrp to test
1928  * @msecs: The time limit in milliseconds
1929  *
1930  * Returns: True if the rgrp glock has *not* been used within the time limit
1931  */
1932 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1933 				    u64 msecs)
1934 {
1935 	u64 tdiff;
1936 
1937 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1938 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1939 
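	/* @msecs is converted to nanoseconds: e.g. msecs == 1000 compares
	 * the glock's idle time against 10^9 ns, i.e. one second. */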
1940 	return tdiff > (msecs * 1000 * 1000);
1941 }
1942 
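/* Pick a random number of rgrps to skip, Orlov-style, so that new
 * directories are spread across the filesystem; see the GFS2_AF_ORLOV
 * handling in gfs2_inplace_reserve() below. */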
1943 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1944 {
1945 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1946 	u32 skip;
1947 
1948 	get_random_bytes(&skip, sizeof(skip));
1949 	return skip % sdp->sd_rgrps;
1950 }
1951 
1952 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1953 {
1954 	struct gfs2_rgrpd *rgd = *pos;
1955 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1956 
1957 	rgd = gfs2_rgrpd_get_next(rgd);
1958 	if (rgd == NULL)
1959 		rgd = gfs2_rgrpd_get_first(sdp);
1960 	*pos = rgd;
1961 	if (rgd != begin) /* If we didn't wrap */
1962 		return true;
1963 	return false;
1964 }
1965 
1966 /**
1967  * fast_to_acquire - determine if a resource group will be fast to acquire
1968  * @rgd: The rgrp in question
 *
1969  * If this is one of our preferred rgrps, it should be quicker to acquire,
1970  * because we tried to set ourselves up as dlm lock master.
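 *
 * An rgrp whose glock is already held locally in a usable state, with
 * no demote pending, should likewise be cheap to lock again.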
1971  */
1972 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1973 {
1974 	struct gfs2_glock *gl = rgd->rd_gl;
1975 
1976 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1977 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1978 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1979 		return 1;
1980 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1981 		return 1;
1982 	return 0;
1983 }
1984 
1985 /**
1986  * gfs2_inplace_reserve - Reserve space in the filesystem
1987  * @ip: the inode to reserve space for
1988  * @ap: the allocation parameters
1989  *
1990  * We try our best to find an rgrp that has at least ap->target blocks
1991  * available. After a couple of passes (loops == 2), the prospects of finding
1992  * such an rgrp diminish. At this stage, we return the first rgrp that has
1993  * at least ap->min_target blocks available. Either way, we set ap->allowed to
1994  * the number of blocks available in the chosen rgrp.
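 *
 * Roughly, per the loop below: pass 0 only considers rgrps that look
 * fast to acquire and requires a multi-block reservation; passes 0 and
 * 1 skip rgrps that appear congested; before pass 2 the log is flushed
 * to release space, and ap->min_target becomes acceptable.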
1995  *
1996  * Returns: 0 on success,
1997  *          -ENOSPC if a suitable rgrp can't be found
1998  *          errno otherwise
1999  */
2000 
2001 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2002 {
2003 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2004 	struct gfs2_rgrpd *begin = NULL;
2005 	struct gfs2_blkreserv *rs = &ip->i_res;
2006 	int error = 0, rg_locked, flags = 0;
2007 	u64 last_unlinked = NO_BLOCK;
2008 	int loops = 0;
2009 	u32 free_blocks, skip = 0;
2010 
2011 	if (sdp->sd_args.ar_rgrplvb)
2012 		flags |= GL_SKIP;
2013 	if (gfs2_assert_warn(sdp, ap->target))
2014 		return -EINVAL;
2015 	if (gfs2_rs_active(rs)) {
2016 		begin = rs->rs_rbm.rgd;
2017 	} else if (rs->rs_rbm.rgd &&
2018 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2019 		begin = rs->rs_rbm.rgd;
2020 	} else {
2021 		check_and_update_goal(ip);
2022 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2023 	}
2024 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2025 		skip = gfs2_orlov_skip(ip);
2026 	if (rs->rs_rbm.rgd == NULL)
2027 		return -EBADSLT;
2028 
2029 	while (loops < 3) {
2030 		rg_locked = 1;
2031 
2032 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2033 			rg_locked = 0;
2034 			if (skip && skip--)
2035 				goto next_rgrp;
2036 			if (!gfs2_rs_active(rs)) {
2037 				if (loops == 0 &&
2038 				    !fast_to_acquire(rs->rs_rbm.rgd))
2039 					goto next_rgrp;
2040 				if ((loops < 2) &&
2041 				    gfs2_rgrp_used_recently(rs, 1000) &&
2042 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2043 					goto next_rgrp;
2044 			}
2045 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2046 						   LM_ST_EXCLUSIVE, flags,
2047 						   &ip->i_rgd_gh);
2048 			if (unlikely(error))
2049 				return error;
2050 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2051 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2052 				goto skip_rgrp;
2053 			if (sdp->sd_args.ar_rgrplvb) {
2054 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2055 				if (unlikely(error)) {
2056 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2057 					return error;
2058 				}
2059 			}
2060 		}
2061 
2062 		/* Skip unusable resource groups */
2063 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2064 						 GFS2_RDF_ERROR)) ||
2065 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2066 			goto skip_rgrp;
2067 
2068 		if (sdp->sd_args.ar_rgrplvb)
2069 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2070 
2071 		/* Get a reservation if we don't already have one */
2072 		if (!gfs2_rs_active(rs))
2073 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2074 
2075 		/* Skip rgrps when we can't get a reservation on first pass */
2076 		if (!gfs2_rs_active(rs) && (loops < 1))
2077 			goto check_rgrp;
2078 
2079 		/* If rgrp has enough free space, use it */
2080 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2081 		if (free_blocks >= ap->target ||
2082 		    (loops == 2 && ap->min_target &&
2083 		     free_blocks >= ap->min_target)) {
2084 			ap->allowed = free_blocks;
2085 			return 0;
2086 		}
2087 check_rgrp:
2088 		/* Check for unlinked inodes which can be reclaimed */
2089 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2090 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2091 					ip->i_no_addr);
2092 skip_rgrp:
2093 		/* Drop reservation, if we couldn't use reserved rgrp */
2094 		if (gfs2_rs_active(rs))
2095 			gfs2_rs_deltree(rs);
2096 
2097 		/* Unlock rgrp if required */
2098 		if (!rg_locked)
2099 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2100 next_rgrp:
2101 		/* Find the next rgrp, and continue looking */
2102 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2103 			continue;
2104 		if (skip)
2105 			continue;
2106 
2107 		/* If we've scanned all the rgrps, but found no free blocks
2108 		 * then this checks for some less likely conditions before
2109 		 * trying again.
2110 		 */
2111 		loops++;
2112 		/* Check that fs hasn't grown if writing to rindex */
2113 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2114 			error = gfs2_ri_update(ip);
2115 			if (error)
2116 				return error;
2117 		}
2118 		/* Flushing the log may release space */
2119 		if (loops == 2)
2120 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2121 				       GFS2_LFC_INPLACE_RESERVE);
2122 	}
2123 
2124 	return -ENOSPC;
2125 }
2126 
2127 /**
2128  * gfs2_inplace_release - release an inplace reservation
2129  * @ip: the inode the reservation was taken out on
2130  *
2131  * Release a reservation made by gfs2_inplace_reserve().
2132  */
2133 
2134 void gfs2_inplace_release(struct gfs2_inode *ip)
2135 {
2136 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2137 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2138 }
2139 
2140 /**
2141  * gfs2_alloc_extent - allocate an extent from a given bitmap
2142  * @rbm: the resource group information
2143  * @dinode: TRUE if the first block we allocate is for a dinode
2144  * @n: The extent length (value/result)
2145  *
2146  * Add the bitmap buffer to the transaction.
2147  * Set the found bits to GFS2_BLKST_DINODE or GFS2_BLKST_USED to
 * change the blocks' allocation state.
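 *
 * On entry, *n is the requested extent length; on return it holds the
 * number of blocks actually allocated (at least one). For example, a
 * request for four blocks yields two if only the block at @rbm and its
 * immediate successor are free.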
2148  */
2149 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2150 			     unsigned int *n)
2151 {
2152 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2153 	const unsigned int elen = *n;
2154 	u64 block;
2155 	int ret;
2156 
2157 	*n = 1;
2158 	block = gfs2_rbm_to_block(rbm);
2159 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2160 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2161 	block++;
2162 	while (*n < elen) {
2163 		ret = gfs2_rbm_from_block(&pos, block);
2164 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2165 			break;
2166 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2167 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2168 		(*n)++;
2169 		block++;
2170 	}
2171 }
2172 
2173 /**
2174  * rgblk_free - Change alloc state of given block(s)
2175  * @sdp: the filesystem
2176  * @rgd: the resource group the blocks are in
2177  * @bstart: the start of a run of blocks to free
2178  * @blen: the length of the block run (all must lie within ONE RG!)
2179  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2180  */
2181 
2182 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2183 		       u64 bstart, u32 blen, unsigned char new_state)
2184 {
2185 	struct gfs2_rbm rbm;
2186 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2187 
2188 	rbm.rgd = rgd;
2189 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2190 		return;
2191 	while (blen--) {
2192 		bi = rbm_bi(&rbm);
2193 		if (bi != bi_prev) {
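			/* On the first change to each bitmap buffer, save a
			 * copy of its contents (bi_clone); free-block
			 * searches use the clone where present, so that
			 * just-freed blocks are not reallocated before the
			 * free has been committed to the journal. */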
2194 			if (!bi->bi_clone) {
2195 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2196 						      GFP_NOFS | __GFP_NOFAIL);
2197 				memcpy(bi->bi_clone + bi->bi_offset,
2198 				       bi->bi_bh->b_data + bi->bi_offset,
2199 				       bi->bi_bytes);
2200 			}
2201 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2202 			bi_prev = bi;
2203 		}
2204 		gfs2_setbit(&rbm, false, new_state);
2205 		gfs2_rbm_incr(&rbm);
2206 	}
2207 }
2208 
2209 /**
2210  * gfs2_rgrp_dump - print out an rgrp
2211  * @seq: The iterator
2212  * @gl: The glock in question
2213  * @fs_id_buf: pointer to file system id (if requested)
2214  *
2215  */
2216 
2217 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
2218 		    const char *fs_id_buf)
2219 {
2220 	struct gfs2_rgrpd *rgd = gl->gl_object;
2221 	struct gfs2_blkreserv *trs;
2222 	const struct rb_node *n;
2223 
2224 	if (rgd == NULL)
2225 		return;
2226 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2227 		       fs_id_buf,
2228 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2229 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2230 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2231 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2232 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2233 
2234 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2235 			       be32_to_cpu(rgl->rl_flags),
2236 			       be32_to_cpu(rgl->rl_free),
2237 			       be32_to_cpu(rgl->rl_dinodes));
2238 	}
2239 	spin_lock(&rgd->rd_rsspin);
2240 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2241 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2242 		dump_rs(seq, trs, fs_id_buf);
2243 	}
2244 	spin_unlock(&rgd->rd_rsspin);
2245 }
2246 
2247 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2248 {
2249 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2250 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2251 
2252 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2253 		(unsigned long long)rgd->rd_addr);
2254 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2255 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2256 	gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
2257 	rgd->rd_flags |= GFS2_RDF_ERROR;
2258 }
2259 
2260 /**
2261  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2262  * @ip: The inode we have just allocated blocks for
2263  * @rbm: The start of the allocated blocks
2264  * @len: The extent length
2265  *
2266  * Adjusts a reservation after an allocation has taken place. If the
2267  * reservation does not match the allocation, or if it is now empty
2268  * then it is removed.
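 *
 * For example, a 32-block reservation whose start coincides with an
 * 8-block allocation is advanced past the allocated extent and left
 * holding 24 blocks; an allocation elsewhere, or one that empties the
 * reservation, removes it instead.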
2269  */
2270 
2271 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2272 				    const struct gfs2_rbm *rbm, unsigned len)
2273 {
2274 	struct gfs2_blkreserv *rs = &ip->i_res;
2275 	struct gfs2_rgrpd *rgd = rbm->rgd;
2276 	unsigned rlen;
2277 	u64 block;
2278 	int ret;
2279 
2280 	spin_lock(&rgd->rd_rsspin);
2281 	if (gfs2_rs_active(rs)) {
2282 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2283 			block = gfs2_rbm_to_block(rbm);
2284 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2285 			rlen = min(rs->rs_free, len);
2286 			rs->rs_free -= rlen;
2287 			rgd->rd_reserved -= rlen;
2288 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2289 			if (rs->rs_free && !ret)
2290 				goto out;
2291 			/* We used up our block reservation, so we should
2292 			   reserve more blocks next time. */
2293 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2294 		}
2295 		__rs_deltree(rs);
2296 	}
2297 out:
2298 	spin_unlock(&rgd->rd_rsspin);
2299 }
2300 
2301 /**
2302  * gfs2_set_alloc_start - Set starting point for block allocation
2303  * @rbm: The rbm which will be set to the required location
2304  * @ip: The gfs2 inode
2305  * @dinode: Flag to say if allocation includes a new inode
2306  *
2307  * This sets the starting point from the reservation if one is active
2308  * otherwise it falls back to guessing a start point based on the
2309  * inode's goal block or the last allocation point in the rgrp.
2310  */
2311 
2312 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2313 				 const struct gfs2_inode *ip, bool dinode)
2314 {
2315 	u64 goal;
2316 
2317 	if (gfs2_rs_active(&ip->i_res)) {
2318 		*rbm = ip->i_res.rs_rbm;
2319 		return;
2320 	}
2321 
2322 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2323 		goal = ip->i_goal;
2324 	else
2325 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2326 
2327 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2328 		rbm->bii = 0;
2329 		rbm->offset = 0;
2330 	}
2331 }
2332 
2333 /**
2334  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2335  * @ip: the inode to allocate the block for
2336  * @bn: Used to return the starting block number
2337  * @nblocks: requested number of blocks/extent length (value/result)
2338  * @dinode: 1 if we're allocating a dinode block, else 0
2339  * @generation: the generation number of the inode
2340  *
2341  * Returns: 0 or error
2342  */
2343 
2344 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2345 		      bool dinode, u64 *generation)
2346 {
2347 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2348 	struct buffer_head *dibh;
2349 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2350 	unsigned int ndata;
2351 	u64 block; /* block, within the file system scope */
2352 	int error;
2353 
2354 	gfs2_set_alloc_start(&rbm, ip, dinode);
2355 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2356 
2357 	if (error == -ENOSPC) {
2358 		gfs2_set_alloc_start(&rbm, ip, dinode);
2359 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2360 	}
2361 
2362 	/* Since all blocks are reserved in advance, this shouldn't happen */
2363 	if (error) {
2364 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2365 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2366 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2367 			rbm.rgd->rd_extfail_pt);
2368 		goto rgrp_error;
2369 	}
2370 
2371 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2372 	block = gfs2_rbm_to_block(&rbm);
2373 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2374 	if (gfs2_rs_active(&ip->i_res))
2375 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2376 	ndata = *nblocks;
2377 	if (dinode)
2378 		ndata--;
2379 
2380 	if (!dinode) {
2381 		ip->i_goal = block + ndata - 1;
2382 		error = gfs2_meta_inode_buffer(ip, &dibh);
2383 		if (error == 0) {
2384 			struct gfs2_dinode *di =
2385 				(struct gfs2_dinode *)dibh->b_data;
2386 			gfs2_trans_add_meta(ip->i_gl, dibh);
2387 			di->di_goal_meta = di->di_goal_data =
2388 				cpu_to_be64(ip->i_goal);
2389 			brelse(dibh);
2390 		}
2391 	}
2392 	if (rbm.rgd->rd_free < *nblocks) {
2393 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2394 		goto rgrp_error;
2395 	}
2396 
2397 	rbm.rgd->rd_free -= *nblocks;
2398 	if (dinode) {
2399 		rbm.rgd->rd_dinodes++;
2400 		*generation = rbm.rgd->rd_igeneration++;
2401 		if (*generation == 0)
2402 			*generation = rbm.rgd->rd_igeneration++;
2403 	}
2404 
2405 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2406 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2407 
2408 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2409 	if (dinode)
2410 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2411 
2412 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2413 
2414 	rbm.rgd->rd_free_clone -= *nblocks;
2415 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2416 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2417 	*bn = block;
2418 	return 0;
2419 
2420 rgrp_error:
2421 	gfs2_rgrp_error(rbm.rgd);
2422 	return -EIO;
2423 }
2424 
2425 /**
2426  * __gfs2_free_blocks - free a contiguous run of block(s)
2427  * @ip: the inode these blocks are being freed from
2428  * @rgd: the resource group the blocks are in
2429  * @bstart: first block of a run of contiguous blocks
2430  * @blen: the length of the block run
2431  * @meta: 1 if the blocks represent metadata
2432  *
2433  */
2434 
2435 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2436 			u64 bstart, u32 blen, int meta)
2437 {
2438 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2439 
2440 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2441 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2442 	rgd->rd_free += blen;
2443 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2444 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2445 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2446 
2447 	/* Directories keep their data in the metadata address space */
2448 	if (meta || ip->i_depth)
2449 		gfs2_meta_wipe(ip, bstart, blen);
2450 }
2451 
2452 /**
2453  * gfs2_free_meta - free a contiguous run of metadata block(s)
2454  * @ip: the inode these blocks are being freed from
2455  * @rgd: the resource group the blocks are in
2456  * @bstart: first block of a run of contiguous blocks
2457  * @blen: the length of the block run
2458  *
2459  */
2460 
2461 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2462 		    u64 bstart, u32 blen)
2463 {
2464 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2465 
2466 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2467 	gfs2_statfs_change(sdp, 0, +blen, 0);
2468 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2469 }
2470 
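/**
 * gfs2_unlink_di - Mark a dinode block as unlinked
 * @inode: The inode whose dinode block should be marked
 *
 * Sets the block's state to GFS2_BLKST_UNLINKED so that the inode can
 * later be reclaimed (see try_rgrp_unlink()) once all references to it
 * have gone away.
 */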
2471 void gfs2_unlink_di(struct inode *inode)
2472 {
2473 	struct gfs2_inode *ip = GFS2_I(inode);
2474 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2475 	struct gfs2_rgrpd *rgd;
2476 	u64 blkno = ip->i_no_addr;
2477 
2478 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2479 	if (!rgd)
2480 		return;
2481 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2482 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2483 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2484 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2485 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2486 }
2487 
2488 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2489 {
2490 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2491 
2492 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2493 	if (!rgd->rd_dinodes)
2494 		gfs2_consist_rgrpd(rgd);
2495 	rgd->rd_dinodes--;
2496 	rgd->rd_free++;
2497 
2498 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2499 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2500 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2501 
2502 	gfs2_statfs_change(sdp, 0, +1, -1);
2503 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2504 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2505 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2506 }
2507 
2508 /**
2509  * gfs2_check_blk_type - Check the type of a block
2510  * @sdp: The superblock
2511  * @no_addr: The block number to check
2512  * @type: The block type we are looking for
2513  *
2514  * Returns: 0 if the block type matches the expected type
2515  *          -ESTALE if it doesn't match
2516  *          or -ve errno if something went wrong while checking
2517  */
2518 
2519 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2520 {
2521 	struct gfs2_rgrpd *rgd;
2522 	struct gfs2_holder rgd_gh;
2523 	struct gfs2_rbm rbm;
2524 	int error = -EINVAL;
2525 
2526 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2527 	if (!rgd)
2528 		goto fail;
2529 
2530 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2531 	if (error)
2532 		goto fail;
2533 
2534 	rbm.rgd = rgd;
2535 	error = gfs2_rbm_from_block(&rbm, no_addr);
2536 	if (WARN_ON_ONCE(error))
2537 		goto fail;
2538 
2539 	if (gfs2_testbit(&rbm, false) != type)
2540 		error = -ESTALE;
2541 
2542 	gfs2_glock_dq_uninit(&rgd_gh);
2543 fail:
2544 	return error;
2545 }
2546 
2547 /**
2548  * gfs2_rlist_add - add a RG to a list of RGs
2549  * @ip: the inode
2550  * @rlist: the list of resource groups
2551  * @block: the block
2552  *
2553  * Figure out what RG a block belongs to and add that RG to the list
2554  *
2555  * FIXME: Don't use NOFAIL
2556  *
2557  */
2558 
2559 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2560 		    u64 block)
2561 {
2562 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2563 	struct gfs2_rgrpd *rgd;
2564 	struct gfs2_rgrpd **tmp;
2565 	unsigned int new_space;
2566 	unsigned int x;
2567 
2568 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2569 		return;
2570 
2571 	/*
2572 	 * The resource group last accessed is kept in the last position.
2573 	 */
2574 
2575 	if (rlist->rl_rgrps) {
2576 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2577 		if (rgrp_contains_block(rgd, block))
2578 			return;
2579 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2580 	} else {
2581 		rgd = ip->i_res.rs_rbm.rgd;
2582 		if (!rgd || !rgrp_contains_block(rgd, block))
2583 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2584 	}
2585 
2586 	if (!rgd) {
2587 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2588 		       (unsigned long long)block);
2589 		return;
2590 	}
2591 
2592 	for (x = 0; x < rlist->rl_rgrps; x++) {
2593 		if (rlist->rl_rgd[x] == rgd) {
2594 			swap(rlist->rl_rgd[x],
2595 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2596 			return;
2597 		}
2598 	}
2599 
2600 	if (rlist->rl_rgrps == rlist->rl_space) {
2601 		new_space = rlist->rl_space + 10;
2602 
2603 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2604 			      GFP_NOFS | __GFP_NOFAIL);
2605 
2606 		if (rlist->rl_rgd) {
2607 			memcpy(tmp, rlist->rl_rgd,
2608 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2609 			kfree(rlist->rl_rgd);
2610 		}
2611 
2612 		rlist->rl_space = new_space;
2613 		rlist->rl_rgd = tmp;
2614 	}
2615 
2616 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2617 }
2618 
2619 /**
2620  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2621  *      and initialize an array of glock holders for them
2622  * @rlist: the list of resource groups
2623  *
2624  * FIXME: Don't use NOFAIL
2625  *
2626  */
2627 
2628 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2629 {
2630 	unsigned int x;
2631 
2632 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2633 				      sizeof(struct gfs2_holder),
2634 				      GFP_NOFS | __GFP_NOFAIL);
2635 	for (x = 0; x < rlist->rl_rgrps; x++)
2636 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2637 				LM_ST_EXCLUSIVE, 0,
2638 				&rlist->rl_ghs[x]);
2639 }
2640 
2641 /**
2642  * gfs2_rlist_free - free a resource group list
2643  * @rlist: the list of resource groups
2644  *
2645  */
2646 
2647 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2648 {
2649 	unsigned int x;
2650 
2651 	kfree(rlist->rl_rgd);
2652 
2653 	if (rlist->rl_ghs) {
2654 		for (x = 0; x < rlist->rl_rgrps; x++)
2655 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2656 		kfree(rlist->rl_ghs);
2657 		rlist->rl_ghs = NULL;
2658 	}
2659 }
2660 
2661