xref: /linux/fs/gfs2/rgrp.c (revision 7f356166aebb0d956d367dfe55e19d7783277d09)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 /*
40  * These routines are used by the resource group routines (rgrp.c)
41  * to keep track of block allocation.  Each block is represented by two
42  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
43  *
44  * 0 = Free
45  * 1 = Used (not metadata)
46  * 2 = Unlinked (still in use) inode
47  * 3 = Used (metadata)
48  */
49 
50 struct gfs2_extent {
51 	struct gfs2_rbm rbm;
52 	u32 len;
53 };
54 
55 static const char valid_change[16] = {
56 	        /* current */
57 	/* n */ 0, 1, 1, 1,
58 	/* e */ 1, 0, 0, 0,
59 	/* w */ 0, 0, 0, 1,
60 	        1, 0, 0, 0
61 };
62 
63 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
64 			 const struct gfs2_inode *ip, bool nowrap);
65 
66 
67 /**
68  * gfs2_setbit - Set a bit in the bitmaps
69  * @rbm: The position of the bit to set
70  * @do_clone: Also set the clone bitmap, if it exists
71  * @new_state: the new state of the block
72  *
73  */
74 
75 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
76 			       unsigned char new_state)
77 {
78 	unsigned char *byte1, *byte2, *end, cur_state;
79 	struct gfs2_bitmap *bi = rbm_bi(rbm);
80 	unsigned int buflen = bi->bi_bytes;
81 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
82 
83 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
84 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
85 
86 	BUG_ON(byte1 >= end);
87 
88 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
89 
90 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
91 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
92 
93 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
94 			rbm->offset, cur_state, new_state);
95 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
96 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
97 			(unsigned long long)bi->bi_bh->b_blocknr);
98 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
99 			bi->bi_offset, bi->bi_bytes,
100 			(unsigned long long)gfs2_rbm_to_block(rbm));
101 		dump_stack();
102 		gfs2_consist_rgrpd(rbm->rgd);
103 		return;
104 	}
105 	*byte1 ^= (cur_state ^ new_state) << bit;
106 
107 	if (do_clone && bi->bi_clone) {
108 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
109 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
110 		*byte2 ^= (cur_state ^ new_state) << bit;
111 	}
112 }
113 
114 /**
115  * gfs2_testbit - test a bit in the bitmaps
116  * @rbm: The bit to test
117  * @use_clone: If true, test the clone bitmap, not the official bitmap.
118  *
119  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
120  * not the "real" bitmaps, to avoid allocating recently freed blocks.
121  *
122  * Returns: The two bit block state of the requested bit
123  */
124 
125 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
126 {
127 	struct gfs2_bitmap *bi = rbm_bi(rbm);
128 	const u8 *buffer;
129 	const u8 *byte;
130 	unsigned int bit;
131 
132 	if (use_clone && bi->bi_clone)
133 		buffer = bi->bi_clone;
134 	else
135 		buffer = bi->bi_bh->b_data;
136 	buffer += bi->bi_offset;
137 	byte = buffer + (rbm->offset / GFS2_NBBY);
138 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
139 
140 	return (*byte >> bit) & GFS2_BIT_MASK;
141 }
142 
143 /**
144  * gfs2_bit_search
145  * @ptr: Pointer to bitmap data
146  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
147  * @state: The state we are searching for
148  *
149  * We xor the bitmap data with a patter which is the bitwise opposite
150  * of what we are looking for, this gives rise to a pattern of ones
151  * wherever there is a match. Since we have two bits per entry, we
152  * take this pattern, shift it down by one place and then and it with
153  * the original. All the even bit positions (0,2,4, etc) then represent
154  * successful matches, so we mask with 0x55555..... to remove the unwanted
155  * odd bit positions.
156  *
157  * This allows searching of a whole u64 at once (32 blocks) with a
158  * single test (on 64 bit arches).
159  */
160 
161 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
162 {
163 	u64 tmp;
164 	static const u64 search[] = {
165 		[0] = 0xffffffffffffffffULL,
166 		[1] = 0xaaaaaaaaaaaaaaaaULL,
167 		[2] = 0x5555555555555555ULL,
168 		[3] = 0x0000000000000000ULL,
169 	};
170 	tmp = le64_to_cpu(*ptr) ^ search[state];
171 	tmp &= (tmp >> 1);
172 	tmp &= mask;
173 	return tmp;
174 }
175 
176 /**
177  * rs_cmp - multi-block reservation range compare
178  * @blk: absolute file system block number of the new reservation
179  * @len: number of blocks in the new reservation
180  * @rs: existing reservation to compare against
181  *
182  * returns: 1 if the block range is beyond the reach of the reservation
183  *         -1 if the block range is before the start of the reservation
184  *          0 if the block range overlaps with the reservation
185  */
186 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
187 {
188 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
189 
190 	if (blk >= startblk + rs->rs_free)
191 		return 1;
192 	if (blk + len - 1 < startblk)
193 		return -1;
194 	return 0;
195 }
196 
197 /**
198  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
199  *       a block in a given allocation state.
200  * @buf: the buffer that holds the bitmaps
201  * @len: the length (in bytes) of the buffer
202  * @goal: start search at this block's bit-pair (within @buffer)
203  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
204  *
205  * Scope of @goal and returned block number is only within this bitmap buffer,
206  * not entire rgrp or filesystem.  @buffer will be offset from the actual
207  * beginning of a bitmap block buffer, skipping any header structures, but
208  * headers are always a multiple of 64 bits long so that the buffer is
209  * always aligned to a 64 bit boundary.
210  *
211  * The size of the buffer is in bytes, but is it assumed that it is
212  * always ok to read a complete multiple of 64 bits at the end
213  * of the block in case the end is no aligned to a natural boundary.
214  *
215  * Return: the block number (bitmap buffer scope) that was found
216  */
217 
218 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
219 		       u32 goal, u8 state)
220 {
221 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
222 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
223 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
224 	u64 tmp;
225 	u64 mask = 0x5555555555555555ULL;
226 	u32 bit;
227 
228 	/* Mask off bits we don't care about at the start of the search */
229 	mask <<= spoint;
230 	tmp = gfs2_bit_search(ptr, mask, state);
231 	ptr++;
232 	while(tmp == 0 && ptr < end) {
233 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
234 		ptr++;
235 	}
236 	/* Mask off any bits which are more than len bytes from the start */
237 	if (ptr == end && (len & (sizeof(u64) - 1)))
238 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
239 	/* Didn't find anything, so return */
240 	if (tmp == 0)
241 		return BFITNOENT;
242 	ptr--;
243 	bit = __ffs64(tmp);
244 	bit /= 2;	/* two bits per entry in the bitmap */
245 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
246 }
247 
248 /**
249  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
250  * @rbm: The rbm with rgd already set correctly
251  * @block: The block number (filesystem relative)
252  *
253  * This sets the bi and offset members of an rbm based on a
254  * resource group and a filesystem relative block number. The
255  * resource group must be set in the rbm on entry, the bi and
256  * offset members will be set by this function.
257  *
258  * Returns: 0 on success, or an error code
259  */
260 
261 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
262 {
263 	if (!rgrp_contains_block(rbm->rgd, block))
264 		return -E2BIG;
265 	rbm->bii = 0;
266 	rbm->offset = block - rbm->rgd->rd_data0;
267 	/* Check if the block is within the first block */
268 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
269 		return 0;
270 
271 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
272 	rbm->offset += (sizeof(struct gfs2_rgrp) -
273 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
274 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
275 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
276 	return 0;
277 }
278 
279 /**
280  * gfs2_rbm_incr - increment an rbm structure
281  * @rbm: The rbm with rgd already set correctly
282  *
283  * This function takes an existing rbm structure and increments it to the next
284  * viable block offset.
285  *
286  * Returns: If incrementing the offset would cause the rbm to go past the
287  *          end of the rgrp, true is returned, otherwise false.
288  *
289  */
290 
291 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
292 {
293 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
294 		rbm->offset++;
295 		return false;
296 	}
297 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
298 		return true;
299 
300 	rbm->offset = 0;
301 	rbm->bii++;
302 	return false;
303 }
304 
305 /**
306  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
307  * @rbm: Position to search (value/result)
308  * @n_unaligned: Number of unaligned blocks to check
309  * @len: Decremented for each block found (terminate on zero)
310  *
311  * Returns: true if a non-free block is encountered
312  */
313 
314 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
315 {
316 	u32 n;
317 	u8 res;
318 
319 	for (n = 0; n < n_unaligned; n++) {
320 		res = gfs2_testbit(rbm, true);
321 		if (res != GFS2_BLKST_FREE)
322 			return true;
323 		(*len)--;
324 		if (*len == 0)
325 			return true;
326 		if (gfs2_rbm_incr(rbm))
327 			return true;
328 	}
329 
330 	return false;
331 }
332 
333 /**
334  * gfs2_free_extlen - Return extent length of free blocks
335  * @rrbm: Starting position
336  * @len: Max length to check
337  *
338  * Starting at the block specified by the rbm, see how many free blocks
339  * there are, not reading more than len blocks ahead. This can be done
340  * using memchr_inv when the blocks are byte aligned, but has to be done
341  * on a block by block basis in case of unaligned blocks. Also this
342  * function can cope with bitmap boundaries (although it must stop on
343  * a resource group boundary)
344  *
345  * Returns: Number of free blocks in the extent
346  */
347 
348 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
349 {
350 	struct gfs2_rbm rbm = *rrbm;
351 	u32 n_unaligned = rbm.offset & 3;
352 	u32 size = len;
353 	u32 bytes;
354 	u32 chunk_size;
355 	u8 *ptr, *start, *end;
356 	u64 block;
357 	struct gfs2_bitmap *bi;
358 
359 	if (n_unaligned &&
360 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
361 		goto out;
362 
363 	n_unaligned = len & 3;
364 	/* Start is now byte aligned */
365 	while (len > 3) {
366 		bi = rbm_bi(&rbm);
367 		start = bi->bi_bh->b_data;
368 		if (bi->bi_clone)
369 			start = bi->bi_clone;
370 		start += bi->bi_offset;
371 		end = start + bi->bi_bytes;
372 		BUG_ON(rbm.offset & 3);
373 		start += (rbm.offset / GFS2_NBBY);
374 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
375 		ptr = memchr_inv(start, 0, bytes);
376 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
377 		chunk_size *= GFS2_NBBY;
378 		BUG_ON(len < chunk_size);
379 		len -= chunk_size;
380 		block = gfs2_rbm_to_block(&rbm);
381 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
382 			n_unaligned = 0;
383 			break;
384 		}
385 		if (ptr) {
386 			n_unaligned = 3;
387 			break;
388 		}
389 		n_unaligned = len & 3;
390 	}
391 
392 	/* Deal with any bits left over at the end */
393 	if (n_unaligned)
394 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
395 out:
396 	return size - len;
397 }
398 
399 /**
400  * gfs2_bitcount - count the number of bits in a certain state
401  * @rgd: the resource group descriptor
402  * @buffer: the buffer that holds the bitmaps
403  * @buflen: the length (in bytes) of the buffer
404  * @state: the state of the block we're looking for
405  *
406  * Returns: The number of bits
407  */
408 
409 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
410 			 unsigned int buflen, u8 state)
411 {
412 	const u8 *byte = buffer;
413 	const u8 *end = buffer + buflen;
414 	const u8 state1 = state << 2;
415 	const u8 state2 = state << 4;
416 	const u8 state3 = state << 6;
417 	u32 count = 0;
418 
419 	for (; byte < end; byte++) {
420 		if (((*byte) & 0x03) == state)
421 			count++;
422 		if (((*byte) & 0x0C) == state1)
423 			count++;
424 		if (((*byte) & 0x30) == state2)
425 			count++;
426 		if (((*byte) & 0xC0) == state3)
427 			count++;
428 	}
429 
430 	return count;
431 }
432 
433 /**
434  * gfs2_rgrp_verify - Verify that a resource group is consistent
435  * @rgd: the rgrp
436  *
437  */
438 
439 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
440 {
441 	struct gfs2_sbd *sdp = rgd->rd_sbd;
442 	struct gfs2_bitmap *bi = NULL;
443 	u32 length = rgd->rd_length;
444 	u32 count[4], tmp;
445 	int buf, x;
446 
447 	memset(count, 0, 4 * sizeof(u32));
448 
449 	/* Count # blocks in each of 4 possible allocation states */
450 	for (buf = 0; buf < length; buf++) {
451 		bi = rgd->rd_bits + buf;
452 		for (x = 0; x < 4; x++)
453 			count[x] += gfs2_bitcount(rgd,
454 						  bi->bi_bh->b_data +
455 						  bi->bi_offset,
456 						  bi->bi_bytes, x);
457 	}
458 
459 	if (count[0] != rgd->rd_free) {
460 		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
461 			count[0], rgd->rd_free);
462 		gfs2_consist_rgrpd(rgd);
463 		return;
464 	}
465 
466 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
467 	if (count[1] != tmp) {
468 		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
469 			count[1], tmp);
470 		gfs2_consist_rgrpd(rgd);
471 		return;
472 	}
473 
474 	if (count[2] + count[3] != rgd->rd_dinodes) {
475 		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
476 			count[2] + count[3], rgd->rd_dinodes);
477 		gfs2_consist_rgrpd(rgd);
478 		return;
479 	}
480 }
481 
482 /**
483  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
484  * @sdp: The GFS2 superblock
485  * @blk: The data block number
486  * @exact: True if this needs to be an exact match
487  *
488  * The @exact argument should be set to true by most callers. The exception
489  * is when we need to match blocks which are not represented by the rgrp
490  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
491  * there for alignment purposes. Another way of looking at it is that @exact
492  * matches only valid data/metadata blocks, but with @exact false, it will
493  * match any block within the extent of the rgrp.
494  *
495  * Returns: The resource group, or NULL if not found
496  */
497 
498 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
499 {
500 	struct rb_node *n, *next;
501 	struct gfs2_rgrpd *cur;
502 
503 	spin_lock(&sdp->sd_rindex_spin);
504 	n = sdp->sd_rindex_tree.rb_node;
505 	while (n) {
506 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
507 		next = NULL;
508 		if (blk < cur->rd_addr)
509 			next = n->rb_left;
510 		else if (blk >= cur->rd_data0 + cur->rd_data)
511 			next = n->rb_right;
512 		if (next == NULL) {
513 			spin_unlock(&sdp->sd_rindex_spin);
514 			if (exact) {
515 				if (blk < cur->rd_addr)
516 					return NULL;
517 				if (blk >= cur->rd_data0 + cur->rd_data)
518 					return NULL;
519 			}
520 			return cur;
521 		}
522 		n = next;
523 	}
524 	spin_unlock(&sdp->sd_rindex_spin);
525 
526 	return NULL;
527 }
528 
529 /**
530  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
531  * @sdp: The GFS2 superblock
532  *
533  * Returns: The first rgrp in the filesystem
534  */
535 
536 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
537 {
538 	const struct rb_node *n;
539 	struct gfs2_rgrpd *rgd;
540 
541 	spin_lock(&sdp->sd_rindex_spin);
542 	n = rb_first(&sdp->sd_rindex_tree);
543 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
544 	spin_unlock(&sdp->sd_rindex_spin);
545 
546 	return rgd;
547 }
548 
549 /**
550  * gfs2_rgrpd_get_next - get the next RG
551  * @rgd: the resource group descriptor
552  *
553  * Returns: The next rgrp
554  */
555 
556 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
557 {
558 	struct gfs2_sbd *sdp = rgd->rd_sbd;
559 	const struct rb_node *n;
560 
561 	spin_lock(&sdp->sd_rindex_spin);
562 	n = rb_next(&rgd->rd_node);
563 	if (n == NULL)
564 		n = rb_first(&sdp->sd_rindex_tree);
565 
566 	if (unlikely(&rgd->rd_node == n)) {
567 		spin_unlock(&sdp->sd_rindex_spin);
568 		return NULL;
569 	}
570 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
571 	spin_unlock(&sdp->sd_rindex_spin);
572 	return rgd;
573 }
574 
575 void check_and_update_goal(struct gfs2_inode *ip)
576 {
577 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
578 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
579 		ip->i_goal = ip->i_no_addr;
580 }
581 
582 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
583 {
584 	int x;
585 
586 	for (x = 0; x < rgd->rd_length; x++) {
587 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
588 		kfree(bi->bi_clone);
589 		bi->bi_clone = NULL;
590 	}
591 }
592 
593 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
594 		    const char *fs_id_buf)
595 {
596 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
597 
598 	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
599 		       (unsigned long long)ip->i_no_addr,
600 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
601 		       rs->rs_rbm.offset, rs->rs_free);
602 }
603 
604 /**
605  * __rs_deltree - remove a multi-block reservation from the rgd tree
606  * @rs: The reservation to remove
607  *
608  */
609 static void __rs_deltree(struct gfs2_blkreserv *rs)
610 {
611 	struct gfs2_rgrpd *rgd;
612 
613 	if (!gfs2_rs_active(rs))
614 		return;
615 
616 	rgd = rs->rs_rbm.rgd;
617 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
618 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
619 	RB_CLEAR_NODE(&rs->rs_node);
620 
621 	if (rs->rs_free) {
622 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
623 				 rs->rs_free - 1;
624 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
625 		struct gfs2_bitmap *start, *last;
626 
627 		/* return reserved blocks to the rgrp */
628 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
629 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
630 		/* The rgrp extent failure point is likely not to increase;
631 		   it will only do so if the freed blocks are somehow
632 		   contiguous with a span of free blocks that follows. Still,
633 		   it will force the number to be recalculated later. */
634 		rgd->rd_extfail_pt += rs->rs_free;
635 		rs->rs_free = 0;
636 		if (gfs2_rbm_from_block(&last_rbm, last_block))
637 			return;
638 		start = rbm_bi(&rs->rs_rbm);
639 		last = rbm_bi(&last_rbm);
640 		do
641 			clear_bit(GBF_FULL, &start->bi_flags);
642 		while (start++ != last);
643 	}
644 }
645 
646 /**
647  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
648  * @rs: The reservation to remove
649  *
650  */
651 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
652 {
653 	struct gfs2_rgrpd *rgd;
654 
655 	rgd = rs->rs_rbm.rgd;
656 	if (rgd) {
657 		spin_lock(&rgd->rd_rsspin);
658 		__rs_deltree(rs);
659 		BUG_ON(rs->rs_free);
660 		spin_unlock(&rgd->rd_rsspin);
661 	}
662 }
663 
664 /**
665  * gfs2_rs_delete - delete a multi-block reservation
666  * @ip: The inode for this reservation
667  * @wcount: The inode's write count, or NULL
668  *
669  */
670 void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
671 {
672 	down_write(&ip->i_rw_mutex);
673 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
674 		gfs2_rs_deltree(&ip->i_res);
675 	up_write(&ip->i_rw_mutex);
676 }
677 
678 /**
679  * return_all_reservations - return all reserved blocks back to the rgrp.
680  * @rgd: the rgrp that needs its space back
681  *
682  * We previously reserved a bunch of blocks for allocation. Now we need to
683  * give them back. This leave the reservation structures in tact, but removes
684  * all of their corresponding "no-fly zones".
685  */
686 static void return_all_reservations(struct gfs2_rgrpd *rgd)
687 {
688 	struct rb_node *n;
689 	struct gfs2_blkreserv *rs;
690 
691 	spin_lock(&rgd->rd_rsspin);
692 	while ((n = rb_first(&rgd->rd_rstree))) {
693 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
694 		__rs_deltree(rs);
695 	}
696 	spin_unlock(&rgd->rd_rsspin);
697 }
698 
699 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
700 {
701 	struct rb_node *n;
702 	struct gfs2_rgrpd *rgd;
703 	struct gfs2_glock *gl;
704 
705 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
706 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
707 		gl = rgd->rd_gl;
708 
709 		rb_erase(n, &sdp->sd_rindex_tree);
710 
711 		if (gl) {
712 			if (gl->gl_state != LM_ST_UNLOCKED) {
713 				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
714 				flush_delayed_work(&gl->gl_work);
715 			}
716 			gfs2_rgrp_brelse(rgd);
717 			glock_clear_object(gl, rgd);
718 			gfs2_glock_put(gl);
719 		}
720 
721 		gfs2_free_clones(rgd);
722 		return_all_reservations(rgd);
723 		kfree(rgd->rd_bits);
724 		rgd->rd_bits = NULL;
725 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
726 	}
727 }
728 
729 /**
730  * gfs2_compute_bitstructs - Compute the bitmap sizes
731  * @rgd: The resource group descriptor
732  *
733  * Calculates bitmap descriptors, one for each block that contains bitmap data
734  *
735  * Returns: errno
736  */
737 
738 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
739 {
740 	struct gfs2_sbd *sdp = rgd->rd_sbd;
741 	struct gfs2_bitmap *bi;
742 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
743 	u32 bytes_left, bytes;
744 	int x;
745 
746 	if (!length)
747 		return -EINVAL;
748 
749 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
750 	if (!rgd->rd_bits)
751 		return -ENOMEM;
752 
753 	bytes_left = rgd->rd_bitbytes;
754 
755 	for (x = 0; x < length; x++) {
756 		bi = rgd->rd_bits + x;
757 
758 		bi->bi_flags = 0;
759 		/* small rgrp; bitmap stored completely in header block */
760 		if (length == 1) {
761 			bytes = bytes_left;
762 			bi->bi_offset = sizeof(struct gfs2_rgrp);
763 			bi->bi_start = 0;
764 			bi->bi_bytes = bytes;
765 			bi->bi_blocks = bytes * GFS2_NBBY;
766 		/* header block */
767 		} else if (x == 0) {
768 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
769 			bi->bi_offset = sizeof(struct gfs2_rgrp);
770 			bi->bi_start = 0;
771 			bi->bi_bytes = bytes;
772 			bi->bi_blocks = bytes * GFS2_NBBY;
773 		/* last block */
774 		} else if (x + 1 == length) {
775 			bytes = bytes_left;
776 			bi->bi_offset = sizeof(struct gfs2_meta_header);
777 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
778 			bi->bi_bytes = bytes;
779 			bi->bi_blocks = bytes * GFS2_NBBY;
780 		/* other blocks */
781 		} else {
782 			bytes = sdp->sd_sb.sb_bsize -
783 				sizeof(struct gfs2_meta_header);
784 			bi->bi_offset = sizeof(struct gfs2_meta_header);
785 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
786 			bi->bi_bytes = bytes;
787 			bi->bi_blocks = bytes * GFS2_NBBY;
788 		}
789 
790 		bytes_left -= bytes;
791 	}
792 
793 	if (bytes_left) {
794 		gfs2_consist_rgrpd(rgd);
795 		return -EIO;
796 	}
797 	bi = rgd->rd_bits + (length - 1);
798 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
799 		gfs2_lm(sdp,
800 			"ri_addr = %llu\n"
801 			"ri_length = %u\n"
802 			"ri_data0 = %llu\n"
803 			"ri_data = %u\n"
804 			"ri_bitbytes = %u\n"
805 			"start=%u len=%u offset=%u\n",
806 			(unsigned long long)rgd->rd_addr,
807 			rgd->rd_length,
808 			(unsigned long long)rgd->rd_data0,
809 			rgd->rd_data,
810 			rgd->rd_bitbytes,
811 			bi->bi_start, bi->bi_bytes, bi->bi_offset);
812 		gfs2_consist_rgrpd(rgd);
813 		return -EIO;
814 	}
815 
816 	return 0;
817 }
818 
819 /**
820  * gfs2_ri_total - Total up the file system space, according to the rindex.
821  * @sdp: the filesystem
822  *
823  */
824 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
825 {
826 	u64 total_data = 0;
827 	struct inode *inode = sdp->sd_rindex;
828 	struct gfs2_inode *ip = GFS2_I(inode);
829 	char buf[sizeof(struct gfs2_rindex)];
830 	int error, rgrps;
831 
832 	for (rgrps = 0;; rgrps++) {
833 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
834 
835 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
836 			break;
837 		error = gfs2_internal_read(ip, buf, &pos,
838 					   sizeof(struct gfs2_rindex));
839 		if (error != sizeof(struct gfs2_rindex))
840 			break;
841 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
842 	}
843 	return total_data;
844 }
845 
846 static int rgd_insert(struct gfs2_rgrpd *rgd)
847 {
848 	struct gfs2_sbd *sdp = rgd->rd_sbd;
849 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
850 
851 	/* Figure out where to put new node */
852 	while (*newn) {
853 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
854 						  rd_node);
855 
856 		parent = *newn;
857 		if (rgd->rd_addr < cur->rd_addr)
858 			newn = &((*newn)->rb_left);
859 		else if (rgd->rd_addr > cur->rd_addr)
860 			newn = &((*newn)->rb_right);
861 		else
862 			return -EEXIST;
863 	}
864 
865 	rb_link_node(&rgd->rd_node, parent, newn);
866 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
867 	sdp->sd_rgrps++;
868 	return 0;
869 }
870 
871 /**
872  * read_rindex_entry - Pull in a new resource index entry from the disk
873  * @ip: Pointer to the rindex inode
874  *
875  * Returns: 0 on success, > 0 on EOF, error code otherwise
876  */
877 
878 static int read_rindex_entry(struct gfs2_inode *ip)
879 {
880 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
881 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
882 	struct gfs2_rindex buf;
883 	int error;
884 	struct gfs2_rgrpd *rgd;
885 
886 	if (pos >= i_size_read(&ip->i_inode))
887 		return 1;
888 
889 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
890 				   sizeof(struct gfs2_rindex));
891 
892 	if (error != sizeof(struct gfs2_rindex))
893 		return (error == 0) ? 1 : error;
894 
895 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
896 	error = -ENOMEM;
897 	if (!rgd)
898 		return error;
899 
900 	rgd->rd_sbd = sdp;
901 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
902 	rgd->rd_length = be32_to_cpu(buf.ri_length);
903 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
904 	rgd->rd_data = be32_to_cpu(buf.ri_data);
905 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
906 	spin_lock_init(&rgd->rd_rsspin);
907 
908 	error = compute_bitstructs(rgd);
909 	if (error)
910 		goto fail;
911 
912 	error = gfs2_glock_get(sdp, rgd->rd_addr,
913 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
914 	if (error)
915 		goto fail;
916 
917 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
918 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
919 	if (rgd->rd_data > sdp->sd_max_rg_data)
920 		sdp->sd_max_rg_data = rgd->rd_data;
921 	spin_lock(&sdp->sd_rindex_spin);
922 	error = rgd_insert(rgd);
923 	spin_unlock(&sdp->sd_rindex_spin);
924 	if (!error) {
925 		glock_set_object(rgd->rd_gl, rgd);
926 		return 0;
927 	}
928 
929 	error = 0; /* someone else read in the rgrp; free it and ignore it */
930 	gfs2_glock_put(rgd->rd_gl);
931 
932 fail:
933 	kfree(rgd->rd_bits);
934 	rgd->rd_bits = NULL;
935 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
936 	return error;
937 }
938 
939 /**
940  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
941  * @sdp: the GFS2 superblock
942  *
943  * The purpose of this function is to select a subset of the resource groups
944  * and mark them as PREFERRED. We do it in such a way that each node prefers
945  * to use a unique set of rgrps to minimize glock contention.
946  */
947 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
948 {
949 	struct gfs2_rgrpd *rgd, *first;
950 	int i;
951 
952 	/* Skip an initial number of rgrps, based on this node's journal ID.
953 	   That should start each node out on its own set. */
954 	rgd = gfs2_rgrpd_get_first(sdp);
955 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
956 		rgd = gfs2_rgrpd_get_next(rgd);
957 	first = rgd;
958 
959 	do {
960 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
961 		for (i = 0; i < sdp->sd_journals; i++) {
962 			rgd = gfs2_rgrpd_get_next(rgd);
963 			if (!rgd || rgd == first)
964 				break;
965 		}
966 	} while (rgd && rgd != first);
967 }
968 
969 /**
970  * gfs2_ri_update - Pull in a new resource index from the disk
971  * @ip: pointer to the rindex inode
972  *
973  * Returns: 0 on successful update, error code otherwise
974  */
975 
976 static int gfs2_ri_update(struct gfs2_inode *ip)
977 {
978 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
979 	int error;
980 
981 	do {
982 		error = read_rindex_entry(ip);
983 	} while (error == 0);
984 
985 	if (error < 0)
986 		return error;
987 
988 	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
989 		fs_err(sdp, "no resource groups found in the file system.\n");
990 		return -ENOENT;
991 	}
992 	set_rgrp_preferences(sdp);
993 
994 	sdp->sd_rindex_uptodate = 1;
995 	return 0;
996 }
997 
998 /**
999  * gfs2_rindex_update - Update the rindex if required
1000  * @sdp: The GFS2 superblock
1001  *
1002  * We grab a lock on the rindex inode to make sure that it doesn't
1003  * change whilst we are performing an operation. We keep this lock
1004  * for quite long periods of time compared to other locks. This
1005  * doesn't matter, since it is shared and it is very, very rarely
1006  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1007  *
1008  * This makes sure that we're using the latest copy of the resource index
1009  * special file, which might have been updated if someone expanded the
1010  * filesystem (via gfs2_grow utility), which adds new resource groups.
1011  *
1012  * Returns: 0 on succeess, error code otherwise
1013  */
1014 
1015 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1016 {
1017 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1018 	struct gfs2_glock *gl = ip->i_gl;
1019 	struct gfs2_holder ri_gh;
1020 	int error = 0;
1021 	int unlock_required = 0;
1022 
1023 	/* Read new copy from disk if we don't have the latest */
1024 	if (!sdp->sd_rindex_uptodate) {
1025 		if (!gfs2_glock_is_locked_by_me(gl)) {
1026 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1027 			if (error)
1028 				return error;
1029 			unlock_required = 1;
1030 		}
1031 		if (!sdp->sd_rindex_uptodate)
1032 			error = gfs2_ri_update(ip);
1033 		if (unlock_required)
1034 			gfs2_glock_dq_uninit(&ri_gh);
1035 	}
1036 
1037 	return error;
1038 }
1039 
1040 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1041 {
1042 	const struct gfs2_rgrp *str = buf;
1043 	u32 rg_flags;
1044 
1045 	rg_flags = be32_to_cpu(str->rg_flags);
1046 	rg_flags &= ~GFS2_RDF_MASK;
1047 	rgd->rd_flags &= GFS2_RDF_MASK;
1048 	rgd->rd_flags |= rg_flags;
1049 	rgd->rd_free = be32_to_cpu(str->rg_free);
1050 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1051 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1052 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1053 }
1054 
1055 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1056 {
1057 	const struct gfs2_rgrp *str = buf;
1058 
1059 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1060 	rgl->rl_flags = str->rg_flags;
1061 	rgl->rl_free = str->rg_free;
1062 	rgl->rl_dinodes = str->rg_dinodes;
1063 	rgl->rl_igeneration = str->rg_igeneration;
1064 	rgl->__pad = 0UL;
1065 }
1066 
1067 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1068 {
1069 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1070 	struct gfs2_rgrp *str = buf;
1071 	u32 crc;
1072 
1073 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1074 	str->rg_free = cpu_to_be32(rgd->rd_free);
1075 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1076 	if (next == NULL)
1077 		str->rg_skip = 0;
1078 	else if (next->rd_addr > rgd->rd_addr)
1079 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1080 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1081 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1082 	str->rg_data = cpu_to_be32(rgd->rd_data);
1083 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1084 	str->rg_crc = 0;
1085 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1086 	str->rg_crc = cpu_to_be32(crc);
1087 
1088 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1089 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1090 }
1091 
1092 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1093 {
1094 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1095 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1096 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1097 	int valid = 1;
1098 
1099 	if (rgl->rl_flags != str->rg_flags) {
1100 		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1101 			(unsigned long long)rgd->rd_addr,
1102 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1103 		valid = 0;
1104 	}
1105 	if (rgl->rl_free != str->rg_free) {
1106 		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1107 			(unsigned long long)rgd->rd_addr,
1108 			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1109 		valid = 0;
1110 	}
1111 	if (rgl->rl_dinodes != str->rg_dinodes) {
1112 		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1113 			(unsigned long long)rgd->rd_addr,
1114 			be32_to_cpu(rgl->rl_dinodes),
1115 			be32_to_cpu(str->rg_dinodes));
1116 		valid = 0;
1117 	}
1118 	if (rgl->rl_igeneration != str->rg_igeneration) {
1119 		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1120 			(unsigned long long)rgd->rd_addr,
1121 			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1122 			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1123 		valid = 0;
1124 	}
1125 	return valid;
1126 }
1127 
1128 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1129 {
1130 	struct gfs2_bitmap *bi;
1131 	const u32 length = rgd->rd_length;
1132 	const u8 *buffer = NULL;
1133 	u32 i, goal, count = 0;
1134 
1135 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1136 		goal = 0;
1137 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1138 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1139 		while (goal < bi->bi_blocks) {
1140 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1141 					   GFS2_BLKST_UNLINKED);
1142 			if (goal == BFITNOENT)
1143 				break;
1144 			count++;
1145 			goal++;
1146 		}
1147 	}
1148 
1149 	return count;
1150 }
1151 
1152 
1153 /**
1154  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1155  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1156  *
1157  * Read in all of a Resource Group's header and bitmap blocks.
1158  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1159  *
1160  * Returns: errno
1161  */
1162 
1163 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1164 {
1165 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1166 	struct gfs2_glock *gl = rgd->rd_gl;
1167 	unsigned int length = rgd->rd_length;
1168 	struct gfs2_bitmap *bi;
1169 	unsigned int x, y;
1170 	int error;
1171 
1172 	if (rgd->rd_bits[0].bi_bh != NULL)
1173 		return 0;
1174 
1175 	for (x = 0; x < length; x++) {
1176 		bi = rgd->rd_bits + x;
1177 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1178 		if (error)
1179 			goto fail;
1180 	}
1181 
1182 	for (y = length; y--;) {
1183 		bi = rgd->rd_bits + y;
1184 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1185 		if (error)
1186 			goto fail;
1187 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1188 					      GFS2_METATYPE_RG)) {
1189 			error = -EIO;
1190 			goto fail;
1191 		}
1192 	}
1193 
1194 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1195 		for (x = 0; x < length; x++)
1196 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1197 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1198 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1199 		rgd->rd_free_clone = rgd->rd_free;
1200 		/* max out the rgrp allocation failure point */
1201 		rgd->rd_extfail_pt = rgd->rd_free;
1202 	}
1203 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1204 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1205 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1206 				     rgd->rd_bits[0].bi_bh->b_data);
1207 	}
1208 	else if (sdp->sd_args.ar_rgrplvb) {
1209 		if (!gfs2_rgrp_lvb_valid(rgd)){
1210 			gfs2_consist_rgrpd(rgd);
1211 			error = -EIO;
1212 			goto fail;
1213 		}
1214 		if (rgd->rd_rgl->rl_unlinked == 0)
1215 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1216 	}
1217 	return 0;
1218 
1219 fail:
1220 	while (x--) {
1221 		bi = rgd->rd_bits + x;
1222 		brelse(bi->bi_bh);
1223 		bi->bi_bh = NULL;
1224 		gfs2_assert_warn(sdp, !bi->bi_clone);
1225 	}
1226 
1227 	return error;
1228 }
1229 
1230 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1231 {
1232 	u32 rl_flags;
1233 
1234 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1235 		return 0;
1236 
1237 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1238 		return gfs2_rgrp_bh_get(rgd);
1239 
1240 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1241 	rl_flags &= ~GFS2_RDF_MASK;
1242 	rgd->rd_flags &= GFS2_RDF_MASK;
1243 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1244 	if (rgd->rd_rgl->rl_unlinked == 0)
1245 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1246 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1247 	rgd->rd_free_clone = rgd->rd_free;
1248 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1249 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1250 	return 0;
1251 }
1252 
1253 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1254 {
1255 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1256 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1257 
1258 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1259 		return 0;
1260 	return gfs2_rgrp_bh_get(rgd);
1261 }
1262 
1263 /**
1264  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1265  * @rgd: The resource group
1266  *
1267  */
1268 
1269 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1270 {
1271 	int x, length = rgd->rd_length;
1272 
1273 	for (x = 0; x < length; x++) {
1274 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1275 		if (bi->bi_bh) {
1276 			brelse(bi->bi_bh);
1277 			bi->bi_bh = NULL;
1278 		}
1279 	}
1280 }
1281 
1282 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1283 			     struct buffer_head *bh,
1284 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1285 {
1286 	struct super_block *sb = sdp->sd_vfs;
1287 	u64 blk;
1288 	sector_t start = 0;
1289 	sector_t nr_blks = 0;
1290 	int rv;
1291 	unsigned int x;
1292 	u32 trimmed = 0;
1293 	u8 diff;
1294 
1295 	for (x = 0; x < bi->bi_bytes; x++) {
1296 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1297 		clone += bi->bi_offset;
1298 		clone += x;
1299 		if (bh) {
1300 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1301 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1302 		} else {
1303 			diff = ~(*clone | (*clone >> 1));
1304 		}
1305 		diff &= 0x55;
1306 		if (diff == 0)
1307 			continue;
1308 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1309 		while(diff) {
1310 			if (diff & 1) {
1311 				if (nr_blks == 0)
1312 					goto start_new_extent;
1313 				if ((start + nr_blks) != blk) {
1314 					if (nr_blks >= minlen) {
1315 						rv = sb_issue_discard(sb,
1316 							start, nr_blks,
1317 							GFP_NOFS, 0);
1318 						if (rv)
1319 							goto fail;
1320 						trimmed += nr_blks;
1321 					}
1322 					nr_blks = 0;
1323 start_new_extent:
1324 					start = blk;
1325 				}
1326 				nr_blks++;
1327 			}
1328 			diff >>= 2;
1329 			blk++;
1330 		}
1331 	}
1332 	if (nr_blks >= minlen) {
1333 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1334 		if (rv)
1335 			goto fail;
1336 		trimmed += nr_blks;
1337 	}
1338 	if (ptrimmed)
1339 		*ptrimmed = trimmed;
1340 	return 0;
1341 
1342 fail:
1343 	if (sdp->sd_args.ar_discard)
1344 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1345 	sdp->sd_args.ar_discard = 0;
1346 	return -EIO;
1347 }
1348 
1349 /**
1350  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1351  * @filp: Any file on the filesystem
1352  * @argp: Pointer to the arguments (also used to pass result)
1353  *
1354  * Returns: 0 on success, otherwise error code
1355  */
1356 
1357 int gfs2_fitrim(struct file *filp, void __user *argp)
1358 {
1359 	struct inode *inode = file_inode(filp);
1360 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1361 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1362 	struct buffer_head *bh;
1363 	struct gfs2_rgrpd *rgd;
1364 	struct gfs2_rgrpd *rgd_end;
1365 	struct gfs2_holder gh;
1366 	struct fstrim_range r;
1367 	int ret = 0;
1368 	u64 amt;
1369 	u64 trimmed = 0;
1370 	u64 start, end, minlen;
1371 	unsigned int x;
1372 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1373 
1374 	if (!capable(CAP_SYS_ADMIN))
1375 		return -EPERM;
1376 
1377 	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
1378 		return -EROFS;
1379 
1380 	if (!blk_queue_discard(q))
1381 		return -EOPNOTSUPP;
1382 
1383 	if (copy_from_user(&r, argp, sizeof(r)))
1384 		return -EFAULT;
1385 
1386 	ret = gfs2_rindex_update(sdp);
1387 	if (ret)
1388 		return ret;
1389 
1390 	start = r.start >> bs_shift;
1391 	end = start + (r.len >> bs_shift);
1392 	minlen = max_t(u64, r.minlen,
1393 		       q->limits.discard_granularity) >> bs_shift;
1394 
1395 	if (end <= start || minlen > sdp->sd_max_rg_data)
1396 		return -EINVAL;
1397 
1398 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1399 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1400 
1401 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1402 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1403 		return -EINVAL; /* start is beyond the end of the fs */
1404 
1405 	while (1) {
1406 
1407 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1408 		if (ret)
1409 			goto out;
1410 
1411 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1412 			/* Trim each bitmap in the rgrp */
1413 			for (x = 0; x < rgd->rd_length; x++) {
1414 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1415 				ret = gfs2_rgrp_send_discards(sdp,
1416 						rgd->rd_data0, NULL, bi, minlen,
1417 						&amt);
1418 				if (ret) {
1419 					gfs2_glock_dq_uninit(&gh);
1420 					goto out;
1421 				}
1422 				trimmed += amt;
1423 			}
1424 
1425 			/* Mark rgrp as having been trimmed */
1426 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1427 			if (ret == 0) {
1428 				bh = rgd->rd_bits[0].bi_bh;
1429 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1430 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1431 				gfs2_rgrp_out(rgd, bh->b_data);
1432 				gfs2_trans_end(sdp);
1433 			}
1434 		}
1435 		gfs2_glock_dq_uninit(&gh);
1436 
1437 		if (rgd == rgd_end)
1438 			break;
1439 
1440 		rgd = gfs2_rgrpd_get_next(rgd);
1441 	}
1442 
1443 out:
1444 	r.len = trimmed << bs_shift;
1445 	if (copy_to_user(argp, &r, sizeof(r)))
1446 		return -EFAULT;
1447 
1448 	return ret;
1449 }
1450 
1451 /**
1452  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1453  * @ip: the inode structure
1454  *
1455  */
1456 static void rs_insert(struct gfs2_inode *ip)
1457 {
1458 	struct rb_node **newn, *parent = NULL;
1459 	int rc;
1460 	struct gfs2_blkreserv *rs = &ip->i_res;
1461 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1462 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1463 
1464 	BUG_ON(gfs2_rs_active(rs));
1465 
1466 	spin_lock(&rgd->rd_rsspin);
1467 	newn = &rgd->rd_rstree.rb_node;
1468 	while (*newn) {
1469 		struct gfs2_blkreserv *cur =
1470 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1471 
1472 		parent = *newn;
1473 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1474 		if (rc > 0)
1475 			newn = &((*newn)->rb_right);
1476 		else if (rc < 0)
1477 			newn = &((*newn)->rb_left);
1478 		else {
1479 			spin_unlock(&rgd->rd_rsspin);
1480 			WARN_ON(1);
1481 			return;
1482 		}
1483 	}
1484 
1485 	rb_link_node(&rs->rs_node, parent, newn);
1486 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1487 
1488 	/* Do our rgrp accounting for the reservation */
1489 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1490 	spin_unlock(&rgd->rd_rsspin);
1491 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1492 }
1493 
1494 /**
1495  * rgd_free - return the number of free blocks we can allocate.
1496  * @rgd: the resource group
1497  *
1498  * This function returns the number of free blocks for an rgrp.
1499  * That's the clone-free blocks (blocks that are free, not including those
1500  * still being used for unlinked files that haven't been deleted.)
1501  *
1502  * It also subtracts any blocks reserved by someone else, but does not
1503  * include free blocks that are still part of our current reservation,
1504  * because obviously we can (and will) allocate them.
1505  */
1506 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1507 {
1508 	u32 tot_reserved, tot_free;
1509 
1510 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1511 		return 0;
1512 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1513 
1514 	if (rgd->rd_free_clone < tot_reserved)
1515 		tot_reserved = 0;
1516 
1517 	tot_free = rgd->rd_free_clone - tot_reserved;
1518 
1519 	return tot_free;
1520 }
1521 
1522 /**
1523  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1524  * @rgd: the resource group descriptor
1525  * @ip: pointer to the inode for which we're reserving blocks
1526  * @ap: the allocation parameters
1527  *
1528  */
1529 
1530 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1531 			   const struct gfs2_alloc_parms *ap)
1532 {
1533 	struct gfs2_rbm rbm = { .rgd = rgd, };
1534 	u64 goal;
1535 	struct gfs2_blkreserv *rs = &ip->i_res;
1536 	u32 extlen;
1537 	u32 free_blocks = rgd_free(rgd, rs);
1538 	int ret;
1539 	struct inode *inode = &ip->i_inode;
1540 
1541 	if (S_ISDIR(inode->i_mode))
1542 		extlen = 1;
1543 	else {
1544 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1545 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1546 	}
1547 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1548 		return;
1549 
1550 	/* Find bitmap block that contains bits for goal block */
1551 	if (rgrp_contains_block(rgd, ip->i_goal))
1552 		goal = ip->i_goal;
1553 	else
1554 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1555 
1556 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1557 		return;
1558 
1559 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1560 	if (ret == 0) {
1561 		rs->rs_rbm = rbm;
1562 		rs->rs_free = extlen;
1563 		rs_insert(ip);
1564 	} else {
1565 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1566 			rgd->rd_last_alloc = 0;
1567 	}
1568 }
1569 
1570 /**
1571  * gfs2_next_unreserved_block - Return next block that is not reserved
1572  * @rgd: The resource group
1573  * @block: The starting block
1574  * @length: The required length
1575  * @ip: Ignore any reservations for this inode
1576  *
1577  * If the block does not appear in any reservation, then return the
1578  * block number unchanged. If it does appear in the reservation, then
1579  * keep looking through the tree of reservations in order to find the
1580  * first block number which is not reserved.
1581  */
1582 
1583 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1584 				      u32 length,
1585 				      const struct gfs2_inode *ip)
1586 {
1587 	struct gfs2_blkreserv *rs;
1588 	struct rb_node *n;
1589 	int rc;
1590 
1591 	spin_lock(&rgd->rd_rsspin);
1592 	n = rgd->rd_rstree.rb_node;
1593 	while (n) {
1594 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1595 		rc = rs_cmp(block, length, rs);
1596 		if (rc < 0)
1597 			n = n->rb_left;
1598 		else if (rc > 0)
1599 			n = n->rb_right;
1600 		else
1601 			break;
1602 	}
1603 
1604 	if (n) {
1605 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1606 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1607 			n = n->rb_right;
1608 			if (n == NULL)
1609 				break;
1610 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1611 		}
1612 	}
1613 
1614 	spin_unlock(&rgd->rd_rsspin);
1615 	return block;
1616 }
1617 
1618 /**
1619  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1620  * @rbm: The current position in the resource group
1621  * @ip: The inode for which we are searching for blocks
1622  * @minext: The minimum extent length
1623  * @maxext: A pointer to the maximum extent structure
1624  *
1625  * This checks the current position in the rgrp to see whether there is
1626  * a reservation covering this block. If not then this function is a
1627  * no-op. If there is, then the position is moved to the end of the
1628  * contiguous reservation(s) so that we are pointing at the first
1629  * non-reserved block.
1630  *
1631  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1632  */
1633 
1634 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1635 					     const struct gfs2_inode *ip,
1636 					     u32 minext,
1637 					     struct gfs2_extent *maxext)
1638 {
1639 	u64 block = gfs2_rbm_to_block(rbm);
1640 	u32 extlen = 1;
1641 	u64 nblock;
1642 	int ret;
1643 
1644 	/*
1645 	 * If we have a minimum extent length, then skip over any extent
1646 	 * which is less than the min extent length in size.
1647 	 */
1648 	if (minext) {
1649 		extlen = gfs2_free_extlen(rbm, minext);
1650 		if (extlen <= maxext->len)
1651 			goto fail;
1652 	}
1653 
1654 	/*
1655 	 * Check the extent which has been found against the reservations
1656 	 * and skip if parts of it are already reserved
1657 	 */
1658 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1659 	if (nblock == block) {
1660 		if (!minext || extlen >= minext)
1661 			return 0;
1662 
1663 		if (extlen > maxext->len) {
1664 			maxext->len = extlen;
1665 			maxext->rbm = *rbm;
1666 		}
1667 fail:
1668 		nblock = block + extlen;
1669 	}
1670 	ret = gfs2_rbm_from_block(rbm, nblock);
1671 	if (ret < 0)
1672 		return ret;
1673 	return 1;
1674 }
1675 
1676 /**
1677  * gfs2_rbm_find - Look for blocks of a particular state
1678  * @rbm: Value/result starting position and final position
1679  * @state: The state which we want to find
1680  * @minext: Pointer to the requested extent length (NULL for a single block)
1681  *          This is updated to be the actual reservation size.
1682  * @ip: If set, check for reservations
1683  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1684  *          around until we've reached the starting point.
1685  *
1686  * Side effects:
1687  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1688  *   has no free blocks in it.
1689  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1690  *   has come up short on a free block search.
1691  *
1692  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1693  */
1694 
1695 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1696 			 const struct gfs2_inode *ip, bool nowrap)
1697 {
1698 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1699 	struct buffer_head *bh;
1700 	int last_bii;
1701 	u32 offset;
1702 	u8 *buffer;
1703 	bool wrapped = false;
1704 	int ret;
1705 	struct gfs2_bitmap *bi;
1706 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1707 
1708 	/*
1709 	 * Determine the last bitmap to search.  If we're not starting at the
1710 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1711 	 * the entire resource group.
1712 	 */
1713 	last_bii = rbm->bii - (rbm->offset == 0);
1714 
1715 	while(1) {
1716 		bi = rbm_bi(rbm);
1717 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1718 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1719 		    (state == GFS2_BLKST_FREE))
1720 			goto next_bitmap;
1721 
1722 		bh = bi->bi_bh;
1723 		buffer = bh->b_data + bi->bi_offset;
1724 		WARN_ON(!buffer_uptodate(bh));
1725 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1726 			buffer = bi->bi_clone + bi->bi_offset;
1727 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1728 		if (offset == BFITNOENT) {
1729 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1730 				set_bit(GBF_FULL, &bi->bi_flags);
1731 			goto next_bitmap;
1732 		}
1733 		rbm->offset = offset;
1734 		if (ip == NULL)
1735 			return 0;
1736 
1737 		ret = gfs2_reservation_check_and_update(rbm, ip,
1738 							minext ? *minext : 0,
1739 							&maxext);
1740 		if (ret == 0)
1741 			return 0;
1742 		if (ret > 0)
1743 			goto next_iter;
1744 		if (ret == -E2BIG) {
1745 			rbm->bii = 0;
1746 			rbm->offset = 0;
1747 			goto res_covered_end_of_rgrp;
1748 		}
1749 		return ret;
1750 
1751 next_bitmap:	/* Find next bitmap in the rgrp */
1752 		rbm->offset = 0;
1753 		rbm->bii++;
1754 		if (rbm->bii == rbm->rgd->rd_length)
1755 			rbm->bii = 0;
1756 res_covered_end_of_rgrp:
1757 		if (rbm->bii == 0) {
1758 			if (wrapped)
1759 				break;
1760 			wrapped = true;
1761 			if (nowrap)
1762 				break;
1763 		}
1764 next_iter:
1765 		/* Have we scanned the entire resource group? */
1766 		if (wrapped && rbm->bii > last_bii)
1767 			break;
1768 	}
1769 
1770 	if (minext == NULL || state != GFS2_BLKST_FREE)
1771 		return -ENOSPC;
1772 
1773 	/* If the extent was too small, and it's smaller than the smallest
1774 	   to have failed before, remember for future reference that it's
1775 	   useless to search this rgrp again for this amount or more. */
1776 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1777 	    *minext < rbm->rgd->rd_extfail_pt)
1778 		rbm->rgd->rd_extfail_pt = *minext;
1779 
1780 	/* If the maximum extent we found is big enough to fulfill the
1781 	   minimum requirements, use it anyway. */
1782 	if (maxext.len) {
1783 		*rbm = maxext.rbm;
1784 		*minext = maxext.len;
1785 		return 0;
1786 	}
1787 
1788 	return -ENOSPC;
1789 }
1790 
1791 /**
1792  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1793  * @rgd: The rgrp
1794  * @last_unlinked: block address of the last dinode we unlinked
1795  * @skip: block address we should explicitly not unlink
1796  *
1797  * Returns: 0 if no error
1798  *          The inode, if one has been found, in inode.
1799  */
1800 
1801 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1802 {
1803 	u64 block;
1804 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1805 	struct gfs2_glock *gl;
1806 	struct gfs2_inode *ip;
1807 	int error;
1808 	int found = 0;
1809 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1810 
1811 	while (1) {
1812 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1813 				      true);
1814 		if (error == -ENOSPC)
1815 			break;
1816 		if (WARN_ON_ONCE(error))
1817 			break;
1818 
1819 		block = gfs2_rbm_to_block(&rbm);
1820 		if (gfs2_rbm_from_block(&rbm, block + 1))
1821 			break;
1822 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1823 			continue;
1824 		if (block == skip)
1825 			continue;
1826 		*last_unlinked = block;
1827 
1828 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1829 		if (error)
1830 			continue;
1831 
1832 		/* If the inode is already in cache, we can ignore it here
1833 		 * because the existing inode disposal code will deal with
1834 		 * it when all refs have gone away. Accessing gl_object like
1835 		 * this is not safe in general. Here it is ok because we do
1836 		 * not dereference the pointer, and we only need an approx
1837 		 * answer to whether it is NULL or not.
1838 		 */
1839 		ip = gl->gl_object;
1840 
1841 		if (ip || !gfs2_queue_delete_work(gl, 0))
1842 			gfs2_glock_put(gl);
1843 		else
1844 			found++;
1845 
1846 		/* Limit reclaim to sensible number of tasks */
1847 		if (found > NR_CPUS)
1848 			return;
1849 	}
1850 
1851 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1852 	return;
1853 }
1854 
1855 /**
1856  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1857  * @rgd: The rgrp in question
1858  * @loops: An indication of how picky we can be (0=very, 1=less so)
1859  *
1860  * This function uses the recently added glock statistics in order to
1861  * figure out whether a parciular resource group is suffering from
1862  * contention from multiple nodes. This is done purely on the basis
1863  * of timings, since this is the only data we have to work with and
1864  * our aim here is to reject a resource group which is highly contended
1865  * but (very important) not to do this too often in order to ensure that
1866  * we do not land up introducing fragmentation by changing resource
1867  * groups when not actually required.
1868  *
1869  * The calculation is fairly simple, we want to know whether the SRTTB
1870  * (i.e. smoothed round trip time for blocking operations) to acquire
1871  * the lock for this rgrp's glock is significantly greater than the
1872  * time taken for resource groups on average. We introduce a margin in
1873  * the form of the variable @var which is computed as the sum of the two
1874  * respective variences, and multiplied by a factor depending on @loops
1875  * and whether we have a lot of data to base the decision on. This is
1876  * then tested against the square difference of the means in order to
1877  * decide whether the result is statistically significant or not.
1878  *
1879  * Returns: A boolean verdict on the congestion status
1880  */
1881 
1882 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1883 {
1884 	const struct gfs2_glock *gl = rgd->rd_gl;
1885 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1886 	struct gfs2_lkstats *st;
1887 	u64 r_dcount, l_dcount;
1888 	u64 l_srttb, a_srttb = 0;
1889 	s64 srttb_diff;
1890 	u64 sqr_diff;
1891 	u64 var;
1892 	int cpu, nonzero = 0;
1893 
1894 	preempt_disable();
1895 	for_each_present_cpu(cpu) {
1896 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1897 		if (st->stats[GFS2_LKS_SRTTB]) {
1898 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1899 			nonzero++;
1900 		}
1901 	}
1902 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1903 	if (nonzero)
1904 		do_div(a_srttb, nonzero);
1905 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1906 	var = st->stats[GFS2_LKS_SRTTVARB] +
1907 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1908 	preempt_enable();
1909 
1910 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1911 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1912 
1913 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1914 		return false;
1915 
1916 	srttb_diff = a_srttb - l_srttb;
1917 	sqr_diff = srttb_diff * srttb_diff;
1918 
1919 	var *= 2;
1920 	if (l_dcount < 8 || r_dcount < 8)
1921 		var *= 2;
1922 	if (loops == 1)
1923 		var *= 2;
1924 
1925 	return ((srttb_diff < 0) && (sqr_diff > var));
1926 }
1927 
1928 /**
1929  * gfs2_rgrp_used_recently
1930  * @rs: The block reservation with the rgrp to test
1931  * @msecs: The time limit in milliseconds
1932  *
1933  * Returns: True if the rgrp glock has been used within the time limit
1934  */
1935 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1936 				    u64 msecs)
1937 {
1938 	u64 tdiff;
1939 
1940 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1941                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1942 
1943 	return tdiff > (msecs * 1000 * 1000);
1944 }
1945 
1946 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1947 {
1948 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1949 	u32 skip;
1950 
1951 	get_random_bytes(&skip, sizeof(skip));
1952 	return skip % sdp->sd_rgrps;
1953 }
1954 
1955 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1956 {
1957 	struct gfs2_rgrpd *rgd = *pos;
1958 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1959 
1960 	rgd = gfs2_rgrpd_get_next(rgd);
1961 	if (rgd == NULL)
1962 		rgd = gfs2_rgrpd_get_first(sdp);
1963 	*pos = rgd;
1964 	if (rgd != begin) /* If we didn't wrap */
1965 		return true;
1966 	return false;
1967 }
1968 
1969 /**
1970  * fast_to_acquire - determine if a resource group will be fast to acquire
1971  *
1972  * If this is one of our preferred rgrps, it should be quicker to acquire,
1973  * because we tried to set ourselves up as dlm lock master.
1974  */
1975 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1976 {
1977 	struct gfs2_glock *gl = rgd->rd_gl;
1978 
1979 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1980 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1981 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1982 		return 1;
1983 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1984 		return 1;
1985 	return 0;
1986 }
1987 
1988 /**
1989  * gfs2_inplace_reserve - Reserve space in the filesystem
1990  * @ip: the inode to reserve space for
1991  * @ap: the allocation parameters
1992  *
1993  * We try our best to find an rgrp that has at least ap->target blocks
1994  * available. After a couple of passes (loops == 2), the prospects of finding
1995  * such an rgrp diminish. At this stage, we return the first rgrp that has
1996  * at least ap->min_target blocks available. Either way, we set ap->allowed to
1997  * the number of blocks available in the chosen rgrp.
1998  *
1999  * Returns: 0 on success,
2000  *          -ENOMEM if a suitable rgrp can't be found
2001  *          errno otherwise
2002  */
2003 
2004 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2005 {
2006 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2007 	struct gfs2_rgrpd *begin = NULL;
2008 	struct gfs2_blkreserv *rs = &ip->i_res;
2009 	int error = 0, rg_locked, flags = 0;
2010 	u64 last_unlinked = NO_BLOCK;
2011 	int loops = 0;
2012 	u32 free_blocks, skip = 0;
2013 
2014 	if (sdp->sd_args.ar_rgrplvb)
2015 		flags |= GL_SKIP;
2016 	if (gfs2_assert_warn(sdp, ap->target))
2017 		return -EINVAL;
2018 	if (gfs2_rs_active(rs)) {
2019 		begin = rs->rs_rbm.rgd;
2020 	} else if (rs->rs_rbm.rgd &&
2021 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2022 		begin = rs->rs_rbm.rgd;
2023 	} else {
2024 		check_and_update_goal(ip);
2025 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2026 	}
2027 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2028 		skip = gfs2_orlov_skip(ip);
2029 	if (rs->rs_rbm.rgd == NULL)
2030 		return -EBADSLT;
2031 
2032 	while (loops < 3) {
2033 		rg_locked = 1;
2034 
2035 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2036 			rg_locked = 0;
2037 			if (skip && skip--)
2038 				goto next_rgrp;
2039 			if (!gfs2_rs_active(rs)) {
2040 				if (loops == 0 &&
2041 				    !fast_to_acquire(rs->rs_rbm.rgd))
2042 					goto next_rgrp;
2043 				if ((loops < 2) &&
2044 				    gfs2_rgrp_used_recently(rs, 1000) &&
2045 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2046 					goto next_rgrp;
2047 			}
2048 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2049 						   LM_ST_EXCLUSIVE, flags,
2050 						   &ip->i_rgd_gh);
2051 			if (unlikely(error))
2052 				return error;
2053 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2054 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2055 				goto skip_rgrp;
2056 			if (sdp->sd_args.ar_rgrplvb) {
2057 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2058 				if (unlikely(error)) {
2059 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2060 					return error;
2061 				}
2062 			}
2063 		}
2064 
2065 		/* Skip unusable resource groups */
2066 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2067 						 GFS2_RDF_ERROR)) ||
2068 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2069 			goto skip_rgrp;
2070 
2071 		if (sdp->sd_args.ar_rgrplvb)
2072 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2073 
2074 		/* Get a reservation if we don't already have one */
2075 		if (!gfs2_rs_active(rs))
2076 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2077 
2078 		/* Skip rgrps when we can't get a reservation on first pass */
2079 		if (!gfs2_rs_active(rs) && (loops < 1))
2080 			goto check_rgrp;
2081 
2082 		/* If rgrp has enough free space, use it */
2083 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2084 		if (free_blocks >= ap->target ||
2085 		    (loops == 2 && ap->min_target &&
2086 		     free_blocks >= ap->min_target)) {
2087 			ap->allowed = free_blocks;
2088 			return 0;
2089 		}
2090 check_rgrp:
2091 		/* Check for unlinked inodes which can be reclaimed */
2092 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2093 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2094 					ip->i_no_addr);
2095 skip_rgrp:
2096 		/* Drop reservation, if we couldn't use reserved rgrp */
2097 		if (gfs2_rs_active(rs))
2098 			gfs2_rs_deltree(rs);
2099 
2100 		/* Unlock rgrp if required */
2101 		if (!rg_locked)
2102 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2103 next_rgrp:
2104 		/* Find the next rgrp, and continue looking */
2105 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2106 			continue;
2107 		if (skip)
2108 			continue;
2109 
2110 		/* If we've scanned all the rgrps, but found no free blocks
2111 		 * then this checks for some less likely conditions before
2112 		 * trying again.
2113 		 */
2114 		loops++;
2115 		/* Check that fs hasn't grown if writing to rindex */
2116 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2117 			error = gfs2_ri_update(ip);
2118 			if (error)
2119 				return error;
2120 		}
2121 		/* Flushing the log may release space */
2122 		if (loops == 2)
2123 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2124 				       GFS2_LFC_INPLACE_RESERVE);
2125 	}
2126 
2127 	return -ENOSPC;
2128 }
2129 
2130 /**
2131  * gfs2_inplace_release - release an inplace reservation
2132  * @ip: the inode the reservation was taken out on
2133  *
2134  * Release a reservation made by gfs2_inplace_reserve().
2135  */
2136 
2137 void gfs2_inplace_release(struct gfs2_inode *ip)
2138 {
2139 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2140 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2141 }
2142 
2143 /**
2144  * gfs2_alloc_extent - allocate an extent from a given bitmap
2145  * @rbm: the resource group information
2146  * @dinode: TRUE if the first block we allocate is for a dinode
2147  * @n: The extent length (value/result)
2148  *
2149  * Add the bitmap buffer to the transaction.
2150  * Set the found bits to @new_state to change block's allocation state.
2151  */
2152 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2153 			     unsigned int *n)
2154 {
2155 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2156 	const unsigned int elen = *n;
2157 	u64 block;
2158 	int ret;
2159 
2160 	*n = 1;
2161 	block = gfs2_rbm_to_block(rbm);
2162 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2163 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2164 	block++;
2165 	while (*n < elen) {
2166 		ret = gfs2_rbm_from_block(&pos, block);
2167 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2168 			break;
2169 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2170 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2171 		(*n)++;
2172 		block++;
2173 	}
2174 }
2175 
2176 /**
2177  * rgblk_free - Change alloc state of given block(s)
2178  * @sdp: the filesystem
2179  * @rgd: the resource group the blocks are in
2180  * @bstart: the start of a run of blocks to free
2181  * @blen: the length of the block run (all must lie within ONE RG!)
2182  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2183  */
2184 
2185 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2186 		       u64 bstart, u32 blen, unsigned char new_state)
2187 {
2188 	struct gfs2_rbm rbm;
2189 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2190 
2191 	rbm.rgd = rgd;
2192 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2193 		return;
2194 	while (blen--) {
2195 		bi = rbm_bi(&rbm);
2196 		if (bi != bi_prev) {
2197 			if (!bi->bi_clone) {
2198 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2199 						      GFP_NOFS | __GFP_NOFAIL);
2200 				memcpy(bi->bi_clone + bi->bi_offset,
2201 				       bi->bi_bh->b_data + bi->bi_offset,
2202 				       bi->bi_bytes);
2203 			}
2204 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2205 			bi_prev = bi;
2206 		}
2207 		gfs2_setbit(&rbm, false, new_state);
2208 		gfs2_rbm_incr(&rbm);
2209 	}
2210 }
2211 
2212 /**
2213  * gfs2_rgrp_dump - print out an rgrp
2214  * @seq: The iterator
2215  * @rgd: The rgrp in question
2216  * @fs_id_buf: pointer to file system id (if requested)
2217  *
2218  */
2219 
2220 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2221 		    const char *fs_id_buf)
2222 {
2223 	struct gfs2_blkreserv *trs;
2224 	const struct rb_node *n;
2225 
2226 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2227 		       fs_id_buf,
2228 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2229 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2230 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2231 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2232 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2233 
2234 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2235 			       be32_to_cpu(rgl->rl_flags),
2236 			       be32_to_cpu(rgl->rl_free),
2237 			       be32_to_cpu(rgl->rl_dinodes));
2238 	}
2239 	spin_lock(&rgd->rd_rsspin);
2240 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2241 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2242 		dump_rs(seq, trs, fs_id_buf);
2243 	}
2244 	spin_unlock(&rgd->rd_rsspin);
2245 }
2246 
2247 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2248 {
2249 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2250 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2251 
2252 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2253 		(unsigned long long)rgd->rd_addr);
2254 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2255 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2256 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2257 	rgd->rd_flags |= GFS2_RDF_ERROR;
2258 }
2259 
2260 /**
2261  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2262  * @ip: The inode we have just allocated blocks for
2263  * @rbm: The start of the allocated blocks
2264  * @len: The extent length
2265  *
2266  * Adjusts a reservation after an allocation has taken place. If the
2267  * reservation does not match the allocation, or if it is now empty
2268  * then it is removed.
2269  */
2270 
2271 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2272 				    const struct gfs2_rbm *rbm, unsigned len)
2273 {
2274 	struct gfs2_blkreserv *rs = &ip->i_res;
2275 	struct gfs2_rgrpd *rgd = rbm->rgd;
2276 	unsigned rlen;
2277 	u64 block;
2278 	int ret;
2279 
2280 	spin_lock(&rgd->rd_rsspin);
2281 	if (gfs2_rs_active(rs)) {
2282 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2283 			block = gfs2_rbm_to_block(rbm);
2284 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2285 			rlen = min(rs->rs_free, len);
2286 			rs->rs_free -= rlen;
2287 			rgd->rd_reserved -= rlen;
2288 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2289 			if (rs->rs_free && !ret)
2290 				goto out;
2291 			/* We used up our block reservation, so we should
2292 			   reserve more blocks next time. */
2293 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2294 		}
2295 		__rs_deltree(rs);
2296 	}
2297 out:
2298 	spin_unlock(&rgd->rd_rsspin);
2299 }
2300 
2301 /**
2302  * gfs2_set_alloc_start - Set starting point for block allocation
2303  * @rbm: The rbm which will be set to the required location
2304  * @ip: The gfs2 inode
2305  * @dinode: Flag to say if allocation includes a new inode
2306  *
2307  * This sets the starting point from the reservation if one is active
2308  * otherwise it falls back to guessing a start point based on the
2309  * inode's goal block or the last allocation point in the rgrp.
2310  */
2311 
2312 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2313 				 const struct gfs2_inode *ip, bool dinode)
2314 {
2315 	u64 goal;
2316 
2317 	if (gfs2_rs_active(&ip->i_res)) {
2318 		*rbm = ip->i_res.rs_rbm;
2319 		return;
2320 	}
2321 
2322 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2323 		goal = ip->i_goal;
2324 	else
2325 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2326 
2327 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2328 		rbm->bii = 0;
2329 		rbm->offset = 0;
2330 	}
2331 }
2332 
2333 /**
2334  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2335  * @ip: the inode to allocate the block for
2336  * @bn: Used to return the starting block number
2337  * @nblocks: requested number of blocks/extent length (value/result)
2338  * @dinode: 1 if we're allocating a dinode block, else 0
2339  * @generation: the generation number of the inode
2340  *
2341  * Returns: 0 or error
2342  */
2343 
2344 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2345 		      bool dinode, u64 *generation)
2346 {
2347 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2348 	struct buffer_head *dibh;
2349 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2350 	unsigned int ndata;
2351 	u64 block; /* block, within the file system scope */
2352 	int error;
2353 
2354 	gfs2_set_alloc_start(&rbm, ip, dinode);
2355 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2356 
2357 	if (error == -ENOSPC) {
2358 		gfs2_set_alloc_start(&rbm, ip, dinode);
2359 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2360 	}
2361 
2362 	/* Since all blocks are reserved in advance, this shouldn't happen */
2363 	if (error) {
2364 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2365 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2366 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2367 			rbm.rgd->rd_extfail_pt);
2368 		goto rgrp_error;
2369 	}
2370 
2371 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2372 	block = gfs2_rbm_to_block(&rbm);
2373 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2374 	if (gfs2_rs_active(&ip->i_res))
2375 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2376 	ndata = *nblocks;
2377 	if (dinode)
2378 		ndata--;
2379 
2380 	if (!dinode) {
2381 		ip->i_goal = block + ndata - 1;
2382 		error = gfs2_meta_inode_buffer(ip, &dibh);
2383 		if (error == 0) {
2384 			struct gfs2_dinode *di =
2385 				(struct gfs2_dinode *)dibh->b_data;
2386 			gfs2_trans_add_meta(ip->i_gl, dibh);
2387 			di->di_goal_meta = di->di_goal_data =
2388 				cpu_to_be64(ip->i_goal);
2389 			brelse(dibh);
2390 		}
2391 	}
2392 	if (rbm.rgd->rd_free < *nblocks) {
2393 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2394 		goto rgrp_error;
2395 	}
2396 
2397 	rbm.rgd->rd_free -= *nblocks;
2398 	if (dinode) {
2399 		rbm.rgd->rd_dinodes++;
2400 		*generation = rbm.rgd->rd_igeneration++;
2401 		if (*generation == 0)
2402 			*generation = rbm.rgd->rd_igeneration++;
2403 	}
2404 
2405 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2406 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2407 
2408 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2409 	if (dinode)
2410 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2411 
2412 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2413 
2414 	rbm.rgd->rd_free_clone -= *nblocks;
2415 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2416 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2417 	*bn = block;
2418 	return 0;
2419 
2420 rgrp_error:
2421 	gfs2_rgrp_error(rbm.rgd);
2422 	return -EIO;
2423 }
2424 
2425 /**
2426  * __gfs2_free_blocks - free a contiguous run of block(s)
2427  * @ip: the inode these blocks are being freed from
2428  * @rgd: the resource group the blocks are in
2429  * @bstart: first block of a run of contiguous blocks
2430  * @blen: the length of the block run
2431  * @meta: 1 if the blocks represent metadata
2432  *
2433  */
2434 
2435 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2436 			u64 bstart, u32 blen, int meta)
2437 {
2438 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2439 
2440 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2441 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2442 	rgd->rd_free += blen;
2443 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2444 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2445 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2446 
2447 	/* Directories keep their data in the metadata address space */
2448 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2449 		gfs2_journal_wipe(ip, bstart, blen);
2450 }
2451 
2452 /**
2453  * gfs2_free_meta - free a contiguous run of data block(s)
2454  * @ip: the inode these blocks are being freed from
2455  * @rgd: the resource group the blocks are in
2456  * @bstart: first block of a run of contiguous blocks
2457  * @blen: the length of the block run
2458  *
2459  */
2460 
2461 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2462 		    u64 bstart, u32 blen)
2463 {
2464 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2465 
2466 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2467 	gfs2_statfs_change(sdp, 0, +blen, 0);
2468 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2469 }
2470 
2471 void gfs2_unlink_di(struct inode *inode)
2472 {
2473 	struct gfs2_inode *ip = GFS2_I(inode);
2474 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2475 	struct gfs2_rgrpd *rgd;
2476 	u64 blkno = ip->i_no_addr;
2477 
2478 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2479 	if (!rgd)
2480 		return;
2481 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2482 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2483 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2484 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2485 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2486 }
2487 
2488 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2489 {
2490 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2491 
2492 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2493 	if (!rgd->rd_dinodes)
2494 		gfs2_consist_rgrpd(rgd);
2495 	rgd->rd_dinodes--;
2496 	rgd->rd_free++;
2497 
2498 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2499 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2500 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2501 
2502 	gfs2_statfs_change(sdp, 0, +1, -1);
2503 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2504 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2505 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2506 }
2507 
2508 /**
2509  * gfs2_check_blk_type - Check the type of a block
2510  * @sdp: The superblock
2511  * @no_addr: The block number to check
2512  * @type: The block type we are looking for
2513  *
2514  * Returns: 0 if the block type matches the expected type
2515  *          -ESTALE if it doesn't match
2516  *          or -ve errno if something went wrong while checking
2517  */
2518 
2519 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2520 {
2521 	struct gfs2_rgrpd *rgd;
2522 	struct gfs2_holder rgd_gh;
2523 	struct gfs2_rbm rbm;
2524 	int error = -EINVAL;
2525 
2526 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2527 	if (!rgd)
2528 		goto fail;
2529 
2530 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2531 	if (error)
2532 		goto fail;
2533 
2534 	rbm.rgd = rgd;
2535 	error = gfs2_rbm_from_block(&rbm, no_addr);
2536 	if (!WARN_ON_ONCE(error)) {
2537 		if (gfs2_testbit(&rbm, false) != type)
2538 			error = -ESTALE;
2539 	}
2540 
2541 	gfs2_glock_dq_uninit(&rgd_gh);
2542 
2543 fail:
2544 	return error;
2545 }
2546 
2547 /**
2548  * gfs2_rlist_add - add a RG to a list of RGs
2549  * @ip: the inode
2550  * @rlist: the list of resource groups
2551  * @block: the block
2552  *
2553  * Figure out what RG a block belongs to and add that RG to the list
2554  *
2555  * FIXME: Don't use NOFAIL
2556  *
2557  */
2558 
2559 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2560 		    u64 block)
2561 {
2562 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2563 	struct gfs2_rgrpd *rgd;
2564 	struct gfs2_rgrpd **tmp;
2565 	unsigned int new_space;
2566 	unsigned int x;
2567 
2568 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2569 		return;
2570 
2571 	/*
2572 	 * The resource group last accessed is kept in the last position.
2573 	 */
2574 
2575 	if (rlist->rl_rgrps) {
2576 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2577 		if (rgrp_contains_block(rgd, block))
2578 			return;
2579 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2580 	} else {
2581 		rgd = ip->i_res.rs_rbm.rgd;
2582 		if (!rgd || !rgrp_contains_block(rgd, block))
2583 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2584 	}
2585 
2586 	if (!rgd) {
2587 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2588 		       (unsigned long long)block);
2589 		return;
2590 	}
2591 
2592 	for (x = 0; x < rlist->rl_rgrps; x++) {
2593 		if (rlist->rl_rgd[x] == rgd) {
2594 			swap(rlist->rl_rgd[x],
2595 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2596 			return;
2597 		}
2598 	}
2599 
2600 	if (rlist->rl_rgrps == rlist->rl_space) {
2601 		new_space = rlist->rl_space + 10;
2602 
2603 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2604 			      GFP_NOFS | __GFP_NOFAIL);
2605 
2606 		if (rlist->rl_rgd) {
2607 			memcpy(tmp, rlist->rl_rgd,
2608 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2609 			kfree(rlist->rl_rgd);
2610 		}
2611 
2612 		rlist->rl_space = new_space;
2613 		rlist->rl_rgd = tmp;
2614 	}
2615 
2616 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2617 }
2618 
2619 /**
2620  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2621  *      and initialize an array of glock holders for them
2622  * @rlist: the list of resource groups
2623  *
2624  * FIXME: Don't use NOFAIL
2625  *
2626  */
2627 
2628 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2629 {
2630 	unsigned int x;
2631 
2632 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2633 				      sizeof(struct gfs2_holder),
2634 				      GFP_NOFS | __GFP_NOFAIL);
2635 	for (x = 0; x < rlist->rl_rgrps; x++)
2636 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2637 				LM_ST_EXCLUSIVE, 0,
2638 				&rlist->rl_ghs[x]);
2639 }
2640 
2641 /**
2642  * gfs2_rlist_free - free a resource group list
2643  * @rlist: the list of resource groups
2644  *
2645  */
2646 
2647 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2648 {
2649 	unsigned int x;
2650 
2651 	kfree(rlist->rl_rgd);
2652 
2653 	if (rlist->rl_ghs) {
2654 		for (x = 0; x < rlist->rl_rgrps; x++)
2655 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2656 		kfree(rlist->rl_ghs);
2657 		rlist->rl_ghs = NULL;
2658 	}
2659 }
2660 
2661