xref: /linux/fs/gfs2/rgrp.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 
12 #include <linux/slab.h>
13 #include <linux/spinlock.h>
14 #include <linux/completion.h>
15 #include <linux/buffer_head.h>
16 #include <linux/fs.h>
17 #include <linux/gfs2_ondisk.h>
18 #include <linux/prefetch.h>
19 #include <linux/blkdev.h>
20 #include <linux/rbtree.h>
21 #include <linux/random.h>
22 
23 #include "gfs2.h"
24 #include "incore.h"
25 #include "glock.h"
26 #include "glops.h"
27 #include "lops.h"
28 #include "meta_io.h"
29 #include "quota.h"
30 #include "rgrp.h"
31 #include "super.h"
32 #include "trans.h"
33 #include "util.h"
34 #include "log.h"
35 #include "inode.h"
36 #include "trace_gfs2.h"
37 
38 #define BFITNOENT ((u32)~0)
39 #define NO_BLOCK ((u64)~0)
40 
41 #if BITS_PER_LONG == 32
42 #define LBITMASK   (0x55555555UL)
43 #define LBITSKIP55 (0x55555555UL)
44 #define LBITSKIP00 (0x00000000UL)
45 #else
46 #define LBITMASK   (0x5555555555555555UL)
47 #define LBITSKIP55 (0x5555555555555555UL)
48 #define LBITSKIP00 (0x0000000000000000UL)
49 #endif
50 
51 /*
52  * These routines are used by the resource group routines (rgrp.c)
53  * to keep track of block allocation.  Each block is represented by two
54  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
55  *
56  * 0 = Free
57  * 1 = Used (not metadata)
58  * 2 = Unlinked (still in use) inode
59  * 3 = Used (metadata)
60  */
61 
62 struct gfs2_extent {
63 	struct gfs2_rbm rbm;
64 	u32 len;
65 };
66 
67 static const char valid_change[16] = {
68 	        /* current */
69 	/* n */ 0, 1, 1, 1,
70 	/* e */ 1, 0, 0, 0,
71 	/* w */ 0, 0, 0, 1,
72 	        1, 0, 0, 0
73 };
74 
75 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
76 			 const struct gfs2_inode *ip, bool nowrap);
77 
78 
79 /**
80  * gfs2_setbit - Set a bit in the bitmaps
81  * @rbm: The position of the bit to set
82  * @do_clone: Also set the clone bitmap, if it exists
83  * @new_state: the new state of the block
84  *
85  */
86 
87 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
88 			       unsigned char new_state)
89 {
90 	unsigned char *byte1, *byte2, *end, cur_state;
91 	struct gfs2_bitmap *bi = rbm_bi(rbm);
92 	unsigned int buflen = bi->bi_len;
93 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
94 
95 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
96 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
97 
98 	BUG_ON(byte1 >= end);
99 
100 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
101 
102 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
103 		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
104 			rbm->offset, cur_state, new_state);
105 		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
106 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
107 		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
108 			bi->bi_offset, bi->bi_len);
109 		dump_stack();
110 		gfs2_consist_rgrpd(rbm->rgd);
111 		return;
112 	}
113 	*byte1 ^= (cur_state ^ new_state) << bit;
114 
115 	if (do_clone && bi->bi_clone) {
116 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
117 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
118 		*byte2 ^= (cur_state ^ new_state) << bit;
119 	}
120 }
121 
122 /**
123  * gfs2_testbit - test a bit in the bitmaps
124  * @rbm: The bit to test
125  *
126  * Returns: The two bit block state of the requested bit
127  */
128 
129 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
130 {
131 	struct gfs2_bitmap *bi = rbm_bi(rbm);
132 	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
133 	const u8 *byte;
134 	unsigned int bit;
135 
136 	byte = buffer + (rbm->offset / GFS2_NBBY);
137 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
138 
139 	return (*byte >> bit) & GFS2_BIT_MASK;
140 }
141 
142 /**
143  * gfs2_bit_search
144  * @ptr: Pointer to bitmap data
145  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
146  * @state: The state we are searching for
147  *
148  * We xor the bitmap data with a patter which is the bitwise opposite
149  * of what we are looking for, this gives rise to a pattern of ones
150  * wherever there is a match. Since we have two bits per entry, we
151  * take this pattern, shift it down by one place and then and it with
152  * the original. All the even bit positions (0,2,4, etc) then represent
153  * successful matches, so we mask with 0x55555..... to remove the unwanted
154  * odd bit positions.
155  *
156  * This allows searching of a whole u64 at once (32 blocks) with a
157  * single test (on 64 bit arches).
158  */
159 
160 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
161 {
162 	u64 tmp;
163 	static const u64 search[] = {
164 		[0] = 0xffffffffffffffffULL,
165 		[1] = 0xaaaaaaaaaaaaaaaaULL,
166 		[2] = 0x5555555555555555ULL,
167 		[3] = 0x0000000000000000ULL,
168 	};
169 	tmp = le64_to_cpu(*ptr) ^ search[state];
170 	tmp &= (tmp >> 1);
171 	tmp &= mask;
172 	return tmp;
173 }
174 
175 /**
176  * rs_cmp - multi-block reservation range compare
177  * @blk: absolute file system block number of the new reservation
178  * @len: number of blocks in the new reservation
179  * @rs: existing reservation to compare against
180  *
181  * returns: 1 if the block range is beyond the reach of the reservation
182  *         -1 if the block range is before the start of the reservation
183  *          0 if the block range overlaps with the reservation
184  */
185 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
186 {
187 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
188 
189 	if (blk >= startblk + rs->rs_free)
190 		return 1;
191 	if (blk + len - 1 < startblk)
192 		return -1;
193 	return 0;
194 }
195 
196 /**
197  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
198  *       a block in a given allocation state.
199  * @buf: the buffer that holds the bitmaps
200  * @len: the length (in bytes) of the buffer
201  * @goal: start search at this block's bit-pair (within @buffer)
202  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
203  *
204  * Scope of @goal and returned block number is only within this bitmap buffer,
205  * not entire rgrp or filesystem.  @buffer will be offset from the actual
206  * beginning of a bitmap block buffer, skipping any header structures, but
207  * headers are always a multiple of 64 bits long so that the buffer is
208  * always aligned to a 64 bit boundary.
209  *
210  * The size of the buffer is in bytes, but is it assumed that it is
211  * always ok to read a complete multiple of 64 bits at the end
212  * of the block in case the end is no aligned to a natural boundary.
213  *
214  * Return: the block number (bitmap buffer scope) that was found
215  */
216 
217 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
218 		       u32 goal, u8 state)
219 {
220 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
221 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
222 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
223 	u64 tmp;
224 	u64 mask = 0x5555555555555555ULL;
225 	u32 bit;
226 
227 	/* Mask off bits we don't care about at the start of the search */
228 	mask <<= spoint;
229 	tmp = gfs2_bit_search(ptr, mask, state);
230 	ptr++;
231 	while(tmp == 0 && ptr < end) {
232 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
233 		ptr++;
234 	}
235 	/* Mask off any bits which are more than len bytes from the start */
236 	if (ptr == end && (len & (sizeof(u64) - 1)))
237 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
238 	/* Didn't find anything, so return */
239 	if (tmp == 0)
240 		return BFITNOENT;
241 	ptr--;
242 	bit = __ffs64(tmp);
243 	bit /= 2;	/* two bits per entry in the bitmap */
244 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
245 }
246 
247 /**
248  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
249  * @rbm: The rbm with rgd already set correctly
250  * @block: The block number (filesystem relative)
251  *
252  * This sets the bi and offset members of an rbm based on a
253  * resource group and a filesystem relative block number. The
254  * resource group must be set in the rbm on entry, the bi and
255  * offset members will be set by this function.
256  *
257  * Returns: 0 on success, or an error code
258  */
259 
260 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
261 {
262 	u64 rblock = block - rbm->rgd->rd_data0;
263 
264 	if (WARN_ON_ONCE(rblock > UINT_MAX))
265 		return -EINVAL;
266 	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
267 		return -E2BIG;
268 
269 	rbm->bii = 0;
270 	rbm->offset = (u32)(rblock);
271 	/* Check if the block is within the first block */
272 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
273 		return 0;
274 
275 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
276 	rbm->offset += (sizeof(struct gfs2_rgrp) -
277 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
278 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
279 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
280 	return 0;
281 }
282 
283 /**
284  * gfs2_rbm_incr - increment an rbm structure
285  * @rbm: The rbm with rgd already set correctly
286  *
287  * This function takes an existing rbm structure and increments it to the next
288  * viable block offset.
289  *
290  * Returns: If incrementing the offset would cause the rbm to go past the
291  *          end of the rgrp, true is returned, otherwise false.
292  *
293  */
294 
295 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
296 {
297 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
298 		rbm->offset++;
299 		return false;
300 	}
301 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
302 		return true;
303 
304 	rbm->offset = 0;
305 	rbm->bii++;
306 	return false;
307 }
308 
309 /**
310  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
311  * @rbm: Position to search (value/result)
312  * @n_unaligned: Number of unaligned blocks to check
313  * @len: Decremented for each block found (terminate on zero)
314  *
315  * Returns: true if a non-free block is encountered
316  */
317 
318 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
319 {
320 	u32 n;
321 	u8 res;
322 
323 	for (n = 0; n < n_unaligned; n++) {
324 		res = gfs2_testbit(rbm);
325 		if (res != GFS2_BLKST_FREE)
326 			return true;
327 		(*len)--;
328 		if (*len == 0)
329 			return true;
330 		if (gfs2_rbm_incr(rbm))
331 			return true;
332 	}
333 
334 	return false;
335 }
336 
337 /**
338  * gfs2_free_extlen - Return extent length of free blocks
339  * @rrbm: Starting position
340  * @len: Max length to check
341  *
342  * Starting at the block specified by the rbm, see how many free blocks
343  * there are, not reading more than len blocks ahead. This can be done
344  * using memchr_inv when the blocks are byte aligned, but has to be done
345  * on a block by block basis in case of unaligned blocks. Also this
346  * function can cope with bitmap boundaries (although it must stop on
347  * a resource group boundary)
348  *
349  * Returns: Number of free blocks in the extent
350  */
351 
352 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
353 {
354 	struct gfs2_rbm rbm = *rrbm;
355 	u32 n_unaligned = rbm.offset & 3;
356 	u32 size = len;
357 	u32 bytes;
358 	u32 chunk_size;
359 	u8 *ptr, *start, *end;
360 	u64 block;
361 	struct gfs2_bitmap *bi;
362 
363 	if (n_unaligned &&
364 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
365 		goto out;
366 
367 	n_unaligned = len & 3;
368 	/* Start is now byte aligned */
369 	while (len > 3) {
370 		bi = rbm_bi(&rbm);
371 		start = bi->bi_bh->b_data;
372 		if (bi->bi_clone)
373 			start = bi->bi_clone;
374 		end = start + bi->bi_bh->b_size;
375 		start += bi->bi_offset;
376 		BUG_ON(rbm.offset & 3);
377 		start += (rbm.offset / GFS2_NBBY);
378 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
379 		ptr = memchr_inv(start, 0, bytes);
380 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
381 		chunk_size *= GFS2_NBBY;
382 		BUG_ON(len < chunk_size);
383 		len -= chunk_size;
384 		block = gfs2_rbm_to_block(&rbm);
385 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
386 			n_unaligned = 0;
387 			break;
388 		}
389 		if (ptr) {
390 			n_unaligned = 3;
391 			break;
392 		}
393 		n_unaligned = len & 3;
394 	}
395 
396 	/* Deal with any bits left over at the end */
397 	if (n_unaligned)
398 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
399 out:
400 	return size - len;
401 }
402 
403 /**
404  * gfs2_bitcount - count the number of bits in a certain state
405  * @rgd: the resource group descriptor
406  * @buffer: the buffer that holds the bitmaps
407  * @buflen: the length (in bytes) of the buffer
408  * @state: the state of the block we're looking for
409  *
410  * Returns: The number of bits
411  */
412 
413 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
414 			 unsigned int buflen, u8 state)
415 {
416 	const u8 *byte = buffer;
417 	const u8 *end = buffer + buflen;
418 	const u8 state1 = state << 2;
419 	const u8 state2 = state << 4;
420 	const u8 state3 = state << 6;
421 	u32 count = 0;
422 
423 	for (; byte < end; byte++) {
424 		if (((*byte) & 0x03) == state)
425 			count++;
426 		if (((*byte) & 0x0C) == state1)
427 			count++;
428 		if (((*byte) & 0x30) == state2)
429 			count++;
430 		if (((*byte) & 0xC0) == state3)
431 			count++;
432 	}
433 
434 	return count;
435 }
436 
437 /**
438  * gfs2_rgrp_verify - Verify that a resource group is consistent
439  * @rgd: the rgrp
440  *
441  */
442 
443 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
444 {
445 	struct gfs2_sbd *sdp = rgd->rd_sbd;
446 	struct gfs2_bitmap *bi = NULL;
447 	u32 length = rgd->rd_length;
448 	u32 count[4], tmp;
449 	int buf, x;
450 
451 	memset(count, 0, 4 * sizeof(u32));
452 
453 	/* Count # blocks in each of 4 possible allocation states */
454 	for (buf = 0; buf < length; buf++) {
455 		bi = rgd->rd_bits + buf;
456 		for (x = 0; x < 4; x++)
457 			count[x] += gfs2_bitcount(rgd,
458 						  bi->bi_bh->b_data +
459 						  bi->bi_offset,
460 						  bi->bi_len, x);
461 	}
462 
463 	if (count[0] != rgd->rd_free) {
464 		if (gfs2_consist_rgrpd(rgd))
465 			fs_err(sdp, "free data mismatch:  %u != %u\n",
466 			       count[0], rgd->rd_free);
467 		return;
468 	}
469 
470 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
471 	if (count[1] != tmp) {
472 		if (gfs2_consist_rgrpd(rgd))
473 			fs_err(sdp, "used data mismatch:  %u != %u\n",
474 			       count[1], tmp);
475 		return;
476 	}
477 
478 	if (count[2] + count[3] != rgd->rd_dinodes) {
479 		if (gfs2_consist_rgrpd(rgd))
480 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
481 			       count[2] + count[3], rgd->rd_dinodes);
482 		return;
483 	}
484 }
485 
486 static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
487 {
488 	u64 first = rgd->rd_data0;
489 	u64 last = first + rgd->rd_data;
490 	return first <= block && block < last;
491 }
492 
493 /**
494  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
495  * @sdp: The GFS2 superblock
496  * @blk: The data block number
497  * @exact: True if this needs to be an exact match
498  *
499  * Returns: The resource group, or NULL if not found
500  */
501 
502 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
503 {
504 	struct rb_node *n, *next;
505 	struct gfs2_rgrpd *cur;
506 
507 	spin_lock(&sdp->sd_rindex_spin);
508 	n = sdp->sd_rindex_tree.rb_node;
509 	while (n) {
510 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
511 		next = NULL;
512 		if (blk < cur->rd_addr)
513 			next = n->rb_left;
514 		else if (blk >= cur->rd_data0 + cur->rd_data)
515 			next = n->rb_right;
516 		if (next == NULL) {
517 			spin_unlock(&sdp->sd_rindex_spin);
518 			if (exact) {
519 				if (blk < cur->rd_addr)
520 					return NULL;
521 				if (blk >= cur->rd_data0 + cur->rd_data)
522 					return NULL;
523 			}
524 			return cur;
525 		}
526 		n = next;
527 	}
528 	spin_unlock(&sdp->sd_rindex_spin);
529 
530 	return NULL;
531 }
532 
533 /**
534  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
535  * @sdp: The GFS2 superblock
536  *
537  * Returns: The first rgrp in the filesystem
538  */
539 
540 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
541 {
542 	const struct rb_node *n;
543 	struct gfs2_rgrpd *rgd;
544 
545 	spin_lock(&sdp->sd_rindex_spin);
546 	n = rb_first(&sdp->sd_rindex_tree);
547 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
548 	spin_unlock(&sdp->sd_rindex_spin);
549 
550 	return rgd;
551 }
552 
553 /**
554  * gfs2_rgrpd_get_next - get the next RG
555  * @rgd: the resource group descriptor
556  *
557  * Returns: The next rgrp
558  */
559 
560 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
561 {
562 	struct gfs2_sbd *sdp = rgd->rd_sbd;
563 	const struct rb_node *n;
564 
565 	spin_lock(&sdp->sd_rindex_spin);
566 	n = rb_next(&rgd->rd_node);
567 	if (n == NULL)
568 		n = rb_first(&sdp->sd_rindex_tree);
569 
570 	if (unlikely(&rgd->rd_node == n)) {
571 		spin_unlock(&sdp->sd_rindex_spin);
572 		return NULL;
573 	}
574 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
575 	spin_unlock(&sdp->sd_rindex_spin);
576 	return rgd;
577 }
578 
579 void check_and_update_goal(struct gfs2_inode *ip)
580 {
581 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
582 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
583 		ip->i_goal = ip->i_no_addr;
584 }
585 
586 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
587 {
588 	int x;
589 
590 	for (x = 0; x < rgd->rd_length; x++) {
591 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
592 		kfree(bi->bi_clone);
593 		bi->bi_clone = NULL;
594 	}
595 }
596 
597 /**
598  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
599  *                 plus a quota allocations data structure, if necessary
600  * @ip: the inode for this reservation
601  */
602 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
603 {
604 	return gfs2_qa_alloc(ip);
605 }
606 
607 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
608 {
609 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
610 		       (unsigned long long)rs->rs_inum,
611 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
612 		       rs->rs_rbm.offset, rs->rs_free);
613 }
614 
615 /**
616  * __rs_deltree - remove a multi-block reservation from the rgd tree
617  * @rs: The reservation to remove
618  *
619  */
620 static void __rs_deltree(struct gfs2_blkreserv *rs)
621 {
622 	struct gfs2_rgrpd *rgd;
623 
624 	if (!gfs2_rs_active(rs))
625 		return;
626 
627 	rgd = rs->rs_rbm.rgd;
628 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
629 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
630 	RB_CLEAR_NODE(&rs->rs_node);
631 
632 	if (rs->rs_free) {
633 		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
634 
635 		/* return reserved blocks to the rgrp */
636 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
638 		/* The rgrp extent failure point is likely not to increase;
639 		   it will only do so if the freed blocks are somehow
640 		   contiguous with a span of free blocks that follows. Still,
641 		   it will force the number to be recalculated later. */
642 		rgd->rd_extfail_pt += rs->rs_free;
643 		rs->rs_free = 0;
644 		clear_bit(GBF_FULL, &bi->bi_flags);
645 	}
646 }
647 
648 /**
649  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
650  * @rs: The reservation to remove
651  *
652  */
653 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
654 {
655 	struct gfs2_rgrpd *rgd;
656 
657 	rgd = rs->rs_rbm.rgd;
658 	if (rgd) {
659 		spin_lock(&rgd->rd_rsspin);
660 		__rs_deltree(rs);
661 		spin_unlock(&rgd->rd_rsspin);
662 	}
663 }
664 
665 /**
666  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
667  * @ip: The inode for this reservation
668  * @wcount: The inode's write count, or NULL
669  *
670  */
671 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
672 {
673 	down_write(&ip->i_rw_mutex);
674 	if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
675 		gfs2_rs_deltree(&ip->i_res);
676 		BUG_ON(ip->i_res.rs_free);
677 	}
678 	up_write(&ip->i_rw_mutex);
679 	gfs2_qa_delete(ip, wcount);
680 }
681 
682 /**
683  * return_all_reservations - return all reserved blocks back to the rgrp.
684  * @rgd: the rgrp that needs its space back
685  *
686  * We previously reserved a bunch of blocks for allocation. Now we need to
687  * give them back. This leave the reservation structures in tact, but removes
688  * all of their corresponding "no-fly zones".
689  */
690 static void return_all_reservations(struct gfs2_rgrpd *rgd)
691 {
692 	struct rb_node *n;
693 	struct gfs2_blkreserv *rs;
694 
695 	spin_lock(&rgd->rd_rsspin);
696 	while ((n = rb_first(&rgd->rd_rstree))) {
697 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
698 		__rs_deltree(rs);
699 	}
700 	spin_unlock(&rgd->rd_rsspin);
701 }
702 
703 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
704 {
705 	struct rb_node *n;
706 	struct gfs2_rgrpd *rgd;
707 	struct gfs2_glock *gl;
708 
709 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
710 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
711 		gl = rgd->rd_gl;
712 
713 		rb_erase(n, &sdp->sd_rindex_tree);
714 
715 		if (gl) {
716 			spin_lock(&gl->gl_lockref.lock);
717 			gl->gl_object = NULL;
718 			spin_unlock(&gl->gl_lockref.lock);
719 			gfs2_glock_add_to_lru(gl);
720 			gfs2_glock_put(gl);
721 		}
722 
723 		gfs2_free_clones(rgd);
724 		kfree(rgd->rd_bits);
725 		return_all_reservations(rgd);
726 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
727 	}
728 }
729 
730 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
731 {
732 	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
733 	pr_info("ri_length = %u\n", rgd->rd_length);
734 	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
735 	pr_info("ri_data = %u\n", rgd->rd_data);
736 	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
737 }
738 
739 /**
740  * gfs2_compute_bitstructs - Compute the bitmap sizes
741  * @rgd: The resource group descriptor
742  *
743  * Calculates bitmap descriptors, one for each block that contains bitmap data
744  *
745  * Returns: errno
746  */
747 
748 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
749 {
750 	struct gfs2_sbd *sdp = rgd->rd_sbd;
751 	struct gfs2_bitmap *bi;
752 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
753 	u32 bytes_left, bytes;
754 	int x;
755 
756 	if (!length)
757 		return -EINVAL;
758 
759 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
760 	if (!rgd->rd_bits)
761 		return -ENOMEM;
762 
763 	bytes_left = rgd->rd_bitbytes;
764 
765 	for (x = 0; x < length; x++) {
766 		bi = rgd->rd_bits + x;
767 
768 		bi->bi_flags = 0;
769 		/* small rgrp; bitmap stored completely in header block */
770 		if (length == 1) {
771 			bytes = bytes_left;
772 			bi->bi_offset = sizeof(struct gfs2_rgrp);
773 			bi->bi_start = 0;
774 			bi->bi_len = bytes;
775 			bi->bi_blocks = bytes * GFS2_NBBY;
776 		/* header block */
777 		} else if (x == 0) {
778 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
779 			bi->bi_offset = sizeof(struct gfs2_rgrp);
780 			bi->bi_start = 0;
781 			bi->bi_len = bytes;
782 			bi->bi_blocks = bytes * GFS2_NBBY;
783 		/* last block */
784 		} else if (x + 1 == length) {
785 			bytes = bytes_left;
786 			bi->bi_offset = sizeof(struct gfs2_meta_header);
787 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
788 			bi->bi_len = bytes;
789 			bi->bi_blocks = bytes * GFS2_NBBY;
790 		/* other blocks */
791 		} else {
792 			bytes = sdp->sd_sb.sb_bsize -
793 				sizeof(struct gfs2_meta_header);
794 			bi->bi_offset = sizeof(struct gfs2_meta_header);
795 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
796 			bi->bi_len = bytes;
797 			bi->bi_blocks = bytes * GFS2_NBBY;
798 		}
799 
800 		bytes_left -= bytes;
801 	}
802 
803 	if (bytes_left) {
804 		gfs2_consist_rgrpd(rgd);
805 		return -EIO;
806 	}
807 	bi = rgd->rd_bits + (length - 1);
808 	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
809 		if (gfs2_consist_rgrpd(rgd)) {
810 			gfs2_rindex_print(rgd);
811 			fs_err(sdp, "start=%u len=%u offset=%u\n",
812 			       bi->bi_start, bi->bi_len, bi->bi_offset);
813 		}
814 		return -EIO;
815 	}
816 
817 	return 0;
818 }
819 
820 /**
821  * gfs2_ri_total - Total up the file system space, according to the rindex.
822  * @sdp: the filesystem
823  *
824  */
825 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
826 {
827 	u64 total_data = 0;
828 	struct inode *inode = sdp->sd_rindex;
829 	struct gfs2_inode *ip = GFS2_I(inode);
830 	char buf[sizeof(struct gfs2_rindex)];
831 	int error, rgrps;
832 
833 	for (rgrps = 0;; rgrps++) {
834 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
835 
836 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
837 			break;
838 		error = gfs2_internal_read(ip, buf, &pos,
839 					   sizeof(struct gfs2_rindex));
840 		if (error != sizeof(struct gfs2_rindex))
841 			break;
842 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
843 	}
844 	return total_data;
845 }
846 
847 static int rgd_insert(struct gfs2_rgrpd *rgd)
848 {
849 	struct gfs2_sbd *sdp = rgd->rd_sbd;
850 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
851 
852 	/* Figure out where to put new node */
853 	while (*newn) {
854 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
855 						  rd_node);
856 
857 		parent = *newn;
858 		if (rgd->rd_addr < cur->rd_addr)
859 			newn = &((*newn)->rb_left);
860 		else if (rgd->rd_addr > cur->rd_addr)
861 			newn = &((*newn)->rb_right);
862 		else
863 			return -EEXIST;
864 	}
865 
866 	rb_link_node(&rgd->rd_node, parent, newn);
867 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
868 	sdp->sd_rgrps++;
869 	return 0;
870 }
871 
872 /**
873  * read_rindex_entry - Pull in a new resource index entry from the disk
874  * @ip: Pointer to the rindex inode
875  *
876  * Returns: 0 on success, > 0 on EOF, error code otherwise
877  */
878 
879 static int read_rindex_entry(struct gfs2_inode *ip)
880 {
881 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
882 	const unsigned bsize = sdp->sd_sb.sb_bsize;
883 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
884 	struct gfs2_rindex buf;
885 	int error;
886 	struct gfs2_rgrpd *rgd;
887 
888 	if (pos >= i_size_read(&ip->i_inode))
889 		return 1;
890 
891 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
892 				   sizeof(struct gfs2_rindex));
893 
894 	if (error != sizeof(struct gfs2_rindex))
895 		return (error == 0) ? 1 : error;
896 
897 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
898 	error = -ENOMEM;
899 	if (!rgd)
900 		return error;
901 
902 	rgd->rd_sbd = sdp;
903 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
904 	rgd->rd_length = be32_to_cpu(buf.ri_length);
905 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
906 	rgd->rd_data = be32_to_cpu(buf.ri_data);
907 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
908 	spin_lock_init(&rgd->rd_rsspin);
909 
910 	error = compute_bitstructs(rgd);
911 	if (error)
912 		goto fail;
913 
914 	error = gfs2_glock_get(sdp, rgd->rd_addr,
915 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
916 	if (error)
917 		goto fail;
918 
919 	rgd->rd_gl->gl_object = rgd;
920 	rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
921 	rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
922 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
923 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
924 	if (rgd->rd_data > sdp->sd_max_rg_data)
925 		sdp->sd_max_rg_data = rgd->rd_data;
926 	spin_lock(&sdp->sd_rindex_spin);
927 	error = rgd_insert(rgd);
928 	spin_unlock(&sdp->sd_rindex_spin);
929 	if (!error)
930 		return 0;
931 
932 	error = 0; /* someone else read in the rgrp; free it and ignore it */
933 	gfs2_glock_put(rgd->rd_gl);
934 
935 fail:
936 	kfree(rgd->rd_bits);
937 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
938 	return error;
939 }
940 
941 /**
942  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
943  * @sdp: the GFS2 superblock
944  *
945  * The purpose of this function is to select a subset of the resource groups
946  * and mark them as PREFERRED. We do it in such a way that each node prefers
947  * to use a unique set of rgrps to minimize glock contention.
948  */
949 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
950 {
951 	struct gfs2_rgrpd *rgd, *first;
952 	int i;
953 
954 	/* Skip an initial number of rgrps, based on this node's journal ID.
955 	   That should start each node out on its own set. */
956 	rgd = gfs2_rgrpd_get_first(sdp);
957 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
958 		rgd = gfs2_rgrpd_get_next(rgd);
959 	first = rgd;
960 
961 	do {
962 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
963 		for (i = 0; i < sdp->sd_journals; i++) {
964 			rgd = gfs2_rgrpd_get_next(rgd);
965 			if (!rgd || rgd == first)
966 				break;
967 		}
968 	} while (rgd && rgd != first);
969 }
970 
971 /**
972  * gfs2_ri_update - Pull in a new resource index from the disk
973  * @ip: pointer to the rindex inode
974  *
975  * Returns: 0 on successful update, error code otherwise
976  */
977 
978 static int gfs2_ri_update(struct gfs2_inode *ip)
979 {
980 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
981 	int error;
982 
983 	do {
984 		error = read_rindex_entry(ip);
985 	} while (error == 0);
986 
987 	if (error < 0)
988 		return error;
989 
990 	set_rgrp_preferences(sdp);
991 
992 	sdp->sd_rindex_uptodate = 1;
993 	return 0;
994 }
995 
996 /**
997  * gfs2_rindex_update - Update the rindex if required
998  * @sdp: The GFS2 superblock
999  *
1000  * We grab a lock on the rindex inode to make sure that it doesn't
1001  * change whilst we are performing an operation. We keep this lock
1002  * for quite long periods of time compared to other locks. This
1003  * doesn't matter, since it is shared and it is very, very rarely
1004  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1005  *
1006  * This makes sure that we're using the latest copy of the resource index
1007  * special file, which might have been updated if someone expanded the
1008  * filesystem (via gfs2_grow utility), which adds new resource groups.
1009  *
1010  * Returns: 0 on succeess, error code otherwise
1011  */
1012 
1013 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1014 {
1015 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1016 	struct gfs2_glock *gl = ip->i_gl;
1017 	struct gfs2_holder ri_gh;
1018 	int error = 0;
1019 	int unlock_required = 0;
1020 
1021 	/* Read new copy from disk if we don't have the latest */
1022 	if (!sdp->sd_rindex_uptodate) {
1023 		if (!gfs2_glock_is_locked_by_me(gl)) {
1024 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1025 			if (error)
1026 				return error;
1027 			unlock_required = 1;
1028 		}
1029 		if (!sdp->sd_rindex_uptodate)
1030 			error = gfs2_ri_update(ip);
1031 		if (unlock_required)
1032 			gfs2_glock_dq_uninit(&ri_gh);
1033 	}
1034 
1035 	return error;
1036 }
1037 
1038 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1039 {
1040 	const struct gfs2_rgrp *str = buf;
1041 	u32 rg_flags;
1042 
1043 	rg_flags = be32_to_cpu(str->rg_flags);
1044 	rg_flags &= ~GFS2_RDF_MASK;
1045 	rgd->rd_flags &= GFS2_RDF_MASK;
1046 	rgd->rd_flags |= rg_flags;
1047 	rgd->rd_free = be32_to_cpu(str->rg_free);
1048 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1049 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1050 }
1051 
1052 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1053 {
1054 	struct gfs2_rgrp *str = buf;
1055 
1056 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1057 	str->rg_free = cpu_to_be32(rgd->rd_free);
1058 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1059 	str->__pad = cpu_to_be32(0);
1060 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1061 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1062 }
1063 
1064 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1065 {
1066 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1067 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1068 
1069 	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
1070 	    rgl->rl_dinodes != str->rg_dinodes ||
1071 	    rgl->rl_igeneration != str->rg_igeneration)
1072 		return 0;
1073 	return 1;
1074 }
1075 
1076 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1077 {
1078 	const struct gfs2_rgrp *str = buf;
1079 
1080 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1081 	rgl->rl_flags = str->rg_flags;
1082 	rgl->rl_free = str->rg_free;
1083 	rgl->rl_dinodes = str->rg_dinodes;
1084 	rgl->rl_igeneration = str->rg_igeneration;
1085 	rgl->__pad = 0UL;
1086 }
1087 
1088 static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change)
1089 {
1090 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1091 	u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change;
1092 	rgl->rl_unlinked = cpu_to_be32(unlinked);
1093 }
1094 
1095 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1096 {
1097 	struct gfs2_bitmap *bi;
1098 	const u32 length = rgd->rd_length;
1099 	const u8 *buffer = NULL;
1100 	u32 i, goal, count = 0;
1101 
1102 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1103 		goal = 0;
1104 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1105 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1106 		while (goal < bi->bi_len * GFS2_NBBY) {
1107 			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
1108 					   GFS2_BLKST_UNLINKED);
1109 			if (goal == BFITNOENT)
1110 				break;
1111 			count++;
1112 			goal++;
1113 		}
1114 	}
1115 
1116 	return count;
1117 }
1118 
1119 
1120 /**
1121  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1122  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1123  *
1124  * Read in all of a Resource Group's header and bitmap blocks.
1125  * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
1126  *
1127  * Returns: errno
1128  */
1129 
1130 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1131 {
1132 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1133 	struct gfs2_glock *gl = rgd->rd_gl;
1134 	unsigned int length = rgd->rd_length;
1135 	struct gfs2_bitmap *bi;
1136 	unsigned int x, y;
1137 	int error;
1138 
1139 	if (rgd->rd_bits[0].bi_bh != NULL)
1140 		return 0;
1141 
1142 	for (x = 0; x < length; x++) {
1143 		bi = rgd->rd_bits + x;
1144 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1145 		if (error)
1146 			goto fail;
1147 	}
1148 
1149 	for (y = length; y--;) {
1150 		bi = rgd->rd_bits + y;
1151 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1152 		if (error)
1153 			goto fail;
1154 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1155 					      GFS2_METATYPE_RG)) {
1156 			error = -EIO;
1157 			goto fail;
1158 		}
1159 	}
1160 
1161 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1162 		for (x = 0; x < length; x++)
1163 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1164 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1165 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1166 		rgd->rd_free_clone = rgd->rd_free;
1167 		/* max out the rgrp allocation failure point */
1168 		rgd->rd_extfail_pt = rgd->rd_free;
1169 	}
1170 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1171 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1172 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1173 				     rgd->rd_bits[0].bi_bh->b_data);
1174 	}
1175 	else if (sdp->sd_args.ar_rgrplvb) {
1176 		if (!gfs2_rgrp_lvb_valid(rgd)){
1177 			gfs2_consist_rgrpd(rgd);
1178 			error = -EIO;
1179 			goto fail;
1180 		}
1181 		if (rgd->rd_rgl->rl_unlinked == 0)
1182 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1183 	}
1184 	return 0;
1185 
1186 fail:
1187 	while (x--) {
1188 		bi = rgd->rd_bits + x;
1189 		brelse(bi->bi_bh);
1190 		bi->bi_bh = NULL;
1191 		gfs2_assert_warn(sdp, !bi->bi_clone);
1192 	}
1193 
1194 	return error;
1195 }
1196 
1197 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1198 {
1199 	u32 rl_flags;
1200 
1201 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1202 		return 0;
1203 
1204 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1205 		return gfs2_rgrp_bh_get(rgd);
1206 
1207 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1208 	rl_flags &= ~GFS2_RDF_MASK;
1209 	rgd->rd_flags &= GFS2_RDF_MASK;
1210 	rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1211 	if (rgd->rd_rgl->rl_unlinked == 0)
1212 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1213 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1214 	rgd->rd_free_clone = rgd->rd_free;
1215 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1216 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1217 	return 0;
1218 }
1219 
1220 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1221 {
1222 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1223 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1224 
1225 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1226 		return 0;
1227 	return gfs2_rgrp_bh_get(rgd);
1228 }
1229 
1230 /**
1231  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1232  * @rgd: The resource group
1233  *
1234  */
1235 
1236 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1237 {
1238 	int x, length = rgd->rd_length;
1239 
1240 	for (x = 0; x < length; x++) {
1241 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1242 		if (bi->bi_bh) {
1243 			brelse(bi->bi_bh);
1244 			bi->bi_bh = NULL;
1245 		}
1246 	}
1247 
1248 }
1249 
1250 /**
1251  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1252  * @gh: The glock holder for the resource group
1253  *
1254  */
1255 
1256 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1257 {
1258 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1259 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1260 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1261 
1262 	if (rgd && demote_requested)
1263 		gfs2_rgrp_brelse(rgd);
1264 }
1265 
1266 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1267 			     struct buffer_head *bh,
1268 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1269 {
1270 	struct super_block *sb = sdp->sd_vfs;
1271 	u64 blk;
1272 	sector_t start = 0;
1273 	sector_t nr_blks = 0;
1274 	int rv;
1275 	unsigned int x;
1276 	u32 trimmed = 0;
1277 	u8 diff;
1278 
1279 	for (x = 0; x < bi->bi_len; x++) {
1280 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1281 		clone += bi->bi_offset;
1282 		clone += x;
1283 		if (bh) {
1284 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1285 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1286 		} else {
1287 			diff = ~(*clone | (*clone >> 1));
1288 		}
1289 		diff &= 0x55;
1290 		if (diff == 0)
1291 			continue;
1292 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1293 		while(diff) {
1294 			if (diff & 1) {
1295 				if (nr_blks == 0)
1296 					goto start_new_extent;
1297 				if ((start + nr_blks) != blk) {
1298 					if (nr_blks >= minlen) {
1299 						rv = sb_issue_discard(sb,
1300 							start, nr_blks,
1301 							GFP_NOFS, 0);
1302 						if (rv)
1303 							goto fail;
1304 						trimmed += nr_blks;
1305 					}
1306 					nr_blks = 0;
1307 start_new_extent:
1308 					start = blk;
1309 				}
1310 				nr_blks++;
1311 			}
1312 			diff >>= 2;
1313 			blk++;
1314 		}
1315 	}
1316 	if (nr_blks >= minlen) {
1317 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1318 		if (rv)
1319 			goto fail;
1320 		trimmed += nr_blks;
1321 	}
1322 	if (ptrimmed)
1323 		*ptrimmed = trimmed;
1324 	return 0;
1325 
1326 fail:
1327 	if (sdp->sd_args.ar_discard)
1328 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
1329 	sdp->sd_args.ar_discard = 0;
1330 	return -EIO;
1331 }
1332 
1333 /**
1334  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1335  * @filp: Any file on the filesystem
1336  * @argp: Pointer to the arguments (also used to pass result)
1337  *
1338  * Returns: 0 on success, otherwise error code
1339  */
1340 
1341 int gfs2_fitrim(struct file *filp, void __user *argp)
1342 {
1343 	struct inode *inode = file_inode(filp);
1344 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1345 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1346 	struct buffer_head *bh;
1347 	struct gfs2_rgrpd *rgd;
1348 	struct gfs2_rgrpd *rgd_end;
1349 	struct gfs2_holder gh;
1350 	struct fstrim_range r;
1351 	int ret = 0;
1352 	u64 amt;
1353 	u64 trimmed = 0;
1354 	u64 start, end, minlen;
1355 	unsigned int x;
1356 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1357 
1358 	if (!capable(CAP_SYS_ADMIN))
1359 		return -EPERM;
1360 
1361 	if (!blk_queue_discard(q))
1362 		return -EOPNOTSUPP;
1363 
1364 	if (copy_from_user(&r, argp, sizeof(r)))
1365 		return -EFAULT;
1366 
1367 	ret = gfs2_rindex_update(sdp);
1368 	if (ret)
1369 		return ret;
1370 
1371 	start = r.start >> bs_shift;
1372 	end = start + (r.len >> bs_shift);
1373 	minlen = max_t(u64, r.minlen,
1374 		       q->limits.discard_granularity) >> bs_shift;
1375 
1376 	if (end <= start || minlen > sdp->sd_max_rg_data)
1377 		return -EINVAL;
1378 
1379 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1380 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1381 
1382 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1383 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1384 		return -EINVAL; /* start is beyond the end of the fs */
1385 
1386 	while (1) {
1387 
1388 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1389 		if (ret)
1390 			goto out;
1391 
1392 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1393 			/* Trim each bitmap in the rgrp */
1394 			for (x = 0; x < rgd->rd_length; x++) {
1395 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1396 				ret = gfs2_rgrp_send_discards(sdp,
1397 						rgd->rd_data0, NULL, bi, minlen,
1398 						&amt);
1399 				if (ret) {
1400 					gfs2_glock_dq_uninit(&gh);
1401 					goto out;
1402 				}
1403 				trimmed += amt;
1404 			}
1405 
1406 			/* Mark rgrp as having been trimmed */
1407 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1408 			if (ret == 0) {
1409 				bh = rgd->rd_bits[0].bi_bh;
1410 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1411 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1412 				gfs2_rgrp_out(rgd, bh->b_data);
1413 				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
1414 				gfs2_trans_end(sdp);
1415 			}
1416 		}
1417 		gfs2_glock_dq_uninit(&gh);
1418 
1419 		if (rgd == rgd_end)
1420 			break;
1421 
1422 		rgd = gfs2_rgrpd_get_next(rgd);
1423 	}
1424 
1425 out:
1426 	r.len = trimmed << bs_shift;
1427 	if (copy_to_user(argp, &r, sizeof(r)))
1428 		return -EFAULT;
1429 
1430 	return ret;
1431 }
1432 
1433 /**
1434  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1435  * @ip: the inode structure
1436  *
1437  */
1438 static void rs_insert(struct gfs2_inode *ip)
1439 {
1440 	struct rb_node **newn, *parent = NULL;
1441 	int rc;
1442 	struct gfs2_blkreserv *rs = &ip->i_res;
1443 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1444 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1445 
1446 	BUG_ON(gfs2_rs_active(rs));
1447 
1448 	spin_lock(&rgd->rd_rsspin);
1449 	newn = &rgd->rd_rstree.rb_node;
1450 	while (*newn) {
1451 		struct gfs2_blkreserv *cur =
1452 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1453 
1454 		parent = *newn;
1455 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1456 		if (rc > 0)
1457 			newn = &((*newn)->rb_right);
1458 		else if (rc < 0)
1459 			newn = &((*newn)->rb_left);
1460 		else {
1461 			spin_unlock(&rgd->rd_rsspin);
1462 			WARN_ON(1);
1463 			return;
1464 		}
1465 	}
1466 
1467 	rb_link_node(&rs->rs_node, parent, newn);
1468 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1469 
1470 	/* Do our rgrp accounting for the reservation */
1471 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1472 	spin_unlock(&rgd->rd_rsspin);
1473 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1474 }
1475 
1476 /**
1477  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1478  * @rgd: the resource group descriptor
1479  * @ip: pointer to the inode for which we're reserving blocks
1480  * @ap: the allocation parameters
1481  *
1482  */
1483 
1484 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1485 			   const struct gfs2_alloc_parms *ap)
1486 {
1487 	struct gfs2_rbm rbm = { .rgd = rgd, };
1488 	u64 goal;
1489 	struct gfs2_blkreserv *rs = &ip->i_res;
1490 	u32 extlen;
1491 	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
1492 	int ret;
1493 	struct inode *inode = &ip->i_inode;
1494 
1495 	if (S_ISDIR(inode->i_mode))
1496 		extlen = 1;
1497 	else {
1498 		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
1499 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1500 	}
1501 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1502 		return;
1503 
1504 	/* Find bitmap block that contains bits for goal block */
1505 	if (rgrp_contains_block(rgd, ip->i_goal))
1506 		goal = ip->i_goal;
1507 	else
1508 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1509 
1510 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1511 		return;
1512 
1513 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1514 	if (ret == 0) {
1515 		rs->rs_rbm = rbm;
1516 		rs->rs_free = extlen;
1517 		rs->rs_inum = ip->i_no_addr;
1518 		rs_insert(ip);
1519 	} else {
1520 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1521 			rgd->rd_last_alloc = 0;
1522 	}
1523 }
1524 
1525 /**
1526  * gfs2_next_unreserved_block - Return next block that is not reserved
1527  * @rgd: The resource group
1528  * @block: The starting block
1529  * @length: The required length
1530  * @ip: Ignore any reservations for this inode
1531  *
1532  * If the block does not appear in any reservation, then return the
1533  * block number unchanged. If it does appear in the reservation, then
1534  * keep looking through the tree of reservations in order to find the
1535  * first block number which is not reserved.
1536  */
1537 
1538 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1539 				      u32 length,
1540 				      const struct gfs2_inode *ip)
1541 {
1542 	struct gfs2_blkreserv *rs;
1543 	struct rb_node *n;
1544 	int rc;
1545 
1546 	spin_lock(&rgd->rd_rsspin);
1547 	n = rgd->rd_rstree.rb_node;
1548 	while (n) {
1549 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1550 		rc = rs_cmp(block, length, rs);
1551 		if (rc < 0)
1552 			n = n->rb_left;
1553 		else if (rc > 0)
1554 			n = n->rb_right;
1555 		else
1556 			break;
1557 	}
1558 
1559 	if (n) {
1560 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1561 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1562 			n = n->rb_right;
1563 			if (n == NULL)
1564 				break;
1565 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1566 		}
1567 	}
1568 
1569 	spin_unlock(&rgd->rd_rsspin);
1570 	return block;
1571 }
1572 
1573 /**
1574  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1575  * @rbm: The current position in the resource group
1576  * @ip: The inode for which we are searching for blocks
1577  * @minext: The minimum extent length
1578  * @maxext: A pointer to the maximum extent structure
1579  *
1580  * This checks the current position in the rgrp to see whether there is
1581  * a reservation covering this block. If not then this function is a
1582  * no-op. If there is, then the position is moved to the end of the
1583  * contiguous reservation(s) so that we are pointing at the first
1584  * non-reserved block.
1585  *
1586  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1587  */
1588 
1589 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1590 					     const struct gfs2_inode *ip,
1591 					     u32 minext,
1592 					     struct gfs2_extent *maxext)
1593 {
1594 	u64 block = gfs2_rbm_to_block(rbm);
1595 	u32 extlen = 1;
1596 	u64 nblock;
1597 	int ret;
1598 
1599 	/*
1600 	 * If we have a minimum extent length, then skip over any extent
1601 	 * which is less than the min extent length in size.
1602 	 */
1603 	if (minext) {
1604 		extlen = gfs2_free_extlen(rbm, minext);
1605 		if (extlen <= maxext->len)
1606 			goto fail;
1607 	}
1608 
1609 	/*
1610 	 * Check the extent which has been found against the reservations
1611 	 * and skip if parts of it are already reserved
1612 	 */
1613 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1614 	if (nblock == block) {
1615 		if (!minext || extlen >= minext)
1616 			return 0;
1617 
1618 		if (extlen > maxext->len) {
1619 			maxext->len = extlen;
1620 			maxext->rbm = *rbm;
1621 		}
1622 fail:
1623 		nblock = block + extlen;
1624 	}
1625 	ret = gfs2_rbm_from_block(rbm, nblock);
1626 	if (ret < 0)
1627 		return ret;
1628 	return 1;
1629 }
1630 
1631 /**
1632  * gfs2_rbm_find - Look for blocks of a particular state
1633  * @rbm: Value/result starting position and final position
1634  * @state: The state which we want to find
1635  * @minext: Pointer to the requested extent length (NULL for a single block)
1636  *          This is updated to be the actual reservation size.
1637  * @ip: If set, check for reservations
1638  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1639  *          around until we've reached the starting point.
1640  *
1641  * Side effects:
1642  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1643  *   has no free blocks in it.
1644  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1645  *   has come up short on a free block search.
1646  *
1647  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1648  */
1649 
1650 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1651 			 const struct gfs2_inode *ip, bool nowrap)
1652 {
1653 	struct buffer_head *bh;
1654 	int initial_bii;
1655 	u32 initial_offset;
1656 	int first_bii = rbm->bii;
1657 	u32 first_offset = rbm->offset;
1658 	u32 offset;
1659 	u8 *buffer;
1660 	int n = 0;
1661 	int iters = rbm->rgd->rd_length;
1662 	int ret;
1663 	struct gfs2_bitmap *bi;
1664 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1665 
1666 	/* If we are not starting at the beginning of a bitmap, then we
1667 	 * need to add one to the bitmap count to ensure that we search
1668 	 * the starting bitmap twice.
1669 	 */
1670 	if (rbm->offset != 0)
1671 		iters++;
1672 
1673 	while(1) {
1674 		bi = rbm_bi(rbm);
1675 		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1676 		    (state == GFS2_BLKST_FREE))
1677 			goto next_bitmap;
1678 
1679 		bh = bi->bi_bh;
1680 		buffer = bh->b_data + bi->bi_offset;
1681 		WARN_ON(!buffer_uptodate(bh));
1682 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1683 			buffer = bi->bi_clone + bi->bi_offset;
1684 		initial_offset = rbm->offset;
1685 		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
1686 		if (offset == BFITNOENT)
1687 			goto bitmap_full;
1688 		rbm->offset = offset;
1689 		if (ip == NULL)
1690 			return 0;
1691 
1692 		initial_bii = rbm->bii;
1693 		ret = gfs2_reservation_check_and_update(rbm, ip,
1694 							minext ? *minext : 0,
1695 							&maxext);
1696 		if (ret == 0)
1697 			return 0;
1698 		if (ret > 0) {
1699 			n += (rbm->bii - initial_bii);
1700 			goto next_iter;
1701 		}
1702 		if (ret == -E2BIG) {
1703 			rbm->bii = 0;
1704 			rbm->offset = 0;
1705 			n += (rbm->bii - initial_bii);
1706 			goto res_covered_end_of_rgrp;
1707 		}
1708 		return ret;
1709 
1710 bitmap_full:	/* Mark bitmap as full and fall through */
1711 		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1712 			set_bit(GBF_FULL, &bi->bi_flags);
1713 
1714 next_bitmap:	/* Find next bitmap in the rgrp */
1715 		rbm->offset = 0;
1716 		rbm->bii++;
1717 		if (rbm->bii == rbm->rgd->rd_length)
1718 			rbm->bii = 0;
1719 res_covered_end_of_rgrp:
1720 		if ((rbm->bii == 0) && nowrap)
1721 			break;
1722 		n++;
1723 next_iter:
1724 		if (n >= iters)
1725 			break;
1726 	}
1727 
1728 	if (minext == NULL || state != GFS2_BLKST_FREE)
1729 		return -ENOSPC;
1730 
1731 	/* If the extent was too small, and it's smaller than the smallest
1732 	   to have failed before, remember for future reference that it's
1733 	   useless to search this rgrp again for this amount or more. */
1734 	if ((first_offset == 0) && (first_bii == 0) &&
1735 	    (*minext < rbm->rgd->rd_extfail_pt))
1736 		rbm->rgd->rd_extfail_pt = *minext;
1737 
1738 	/* If the maximum extent we found is big enough to fulfill the
1739 	   minimum requirements, use it anyway. */
1740 	if (maxext.len) {
1741 		*rbm = maxext.rbm;
1742 		*minext = maxext.len;
1743 		return 0;
1744 	}
1745 
1746 	return -ENOSPC;
1747 }
1748 
1749 /**
1750  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1751  * @rgd: The rgrp
1752  * @last_unlinked: block address of the last dinode we unlinked
1753  * @skip: block address we should explicitly not unlink
1754  *
1755  * Returns: 0 if no error
1756  *          The inode, if one has been found, in inode.
1757  */
1758 
1759 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1760 {
1761 	u64 block;
1762 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1763 	struct gfs2_glock *gl;
1764 	struct gfs2_inode *ip;
1765 	int error;
1766 	int found = 0;
1767 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1768 
1769 	while (1) {
1770 		down_write(&sdp->sd_log_flush_lock);
1771 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1772 				      true);
1773 		up_write(&sdp->sd_log_flush_lock);
1774 		if (error == -ENOSPC)
1775 			break;
1776 		if (WARN_ON_ONCE(error))
1777 			break;
1778 
1779 		block = gfs2_rbm_to_block(&rbm);
1780 		if (gfs2_rbm_from_block(&rbm, block + 1))
1781 			break;
1782 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1783 			continue;
1784 		if (block == skip)
1785 			continue;
1786 		*last_unlinked = block;
1787 
1788 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1789 		if (error)
1790 			continue;
1791 
1792 		/* If the inode is already in cache, we can ignore it here
1793 		 * because the existing inode disposal code will deal with
1794 		 * it when all refs have gone away. Accessing gl_object like
1795 		 * this is not safe in general. Here it is ok because we do
1796 		 * not dereference the pointer, and we only need an approx
1797 		 * answer to whether it is NULL or not.
1798 		 */
1799 		ip = gl->gl_object;
1800 
1801 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1802 			gfs2_glock_put(gl);
1803 		else
1804 			found++;
1805 
1806 		/* Limit reclaim to sensible number of tasks */
1807 		if (found > NR_CPUS)
1808 			return;
1809 	}
1810 
1811 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1812 	return;
1813 }
1814 
1815 /**
1816  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1817  * @rgd: The rgrp in question
1818  * @loops: An indication of how picky we can be (0=very, 1=less so)
1819  *
1820  * This function uses the recently added glock statistics in order to
1821  * figure out whether a parciular resource group is suffering from
1822  * contention from multiple nodes. This is done purely on the basis
1823  * of timings, since this is the only data we have to work with and
1824  * our aim here is to reject a resource group which is highly contended
1825  * but (very important) not to do this too often in order to ensure that
1826  * we do not land up introducing fragmentation by changing resource
1827  * groups when not actually required.
1828  *
1829  * The calculation is fairly simple, we want to know whether the SRTTB
1830  * (i.e. smoothed round trip time for blocking operations) to acquire
1831  * the lock for this rgrp's glock is significantly greater than the
1832  * time taken for resource groups on average. We introduce a margin in
1833  * the form of the variable @var which is computed as the sum of the two
1834  * respective variences, and multiplied by a factor depending on @loops
1835  * and whether we have a lot of data to base the decision on. This is
1836  * then tested against the square difference of the means in order to
1837  * decide whether the result is statistically significant or not.
1838  *
1839  * Returns: A boolean verdict on the congestion status
1840  */
1841 
1842 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1843 {
1844 	const struct gfs2_glock *gl = rgd->rd_gl;
1845 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1846 	struct gfs2_lkstats *st;
1847 	u64 r_dcount, l_dcount;
1848 	u64 l_srttb, a_srttb = 0;
1849 	s64 srttb_diff;
1850 	u64 sqr_diff;
1851 	u64 var;
1852 	int cpu, nonzero = 0;
1853 
1854 	preempt_disable();
1855 	for_each_present_cpu(cpu) {
1856 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1857 		if (st->stats[GFS2_LKS_SRTTB]) {
1858 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1859 			nonzero++;
1860 		}
1861 	}
1862 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1863 	if (nonzero)
1864 		do_div(a_srttb, nonzero);
1865 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1866 	var = st->stats[GFS2_LKS_SRTTVARB] +
1867 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1868 	preempt_enable();
1869 
1870 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1871 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1872 
1873 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1874 		return false;
1875 
1876 	srttb_diff = a_srttb - l_srttb;
1877 	sqr_diff = srttb_diff * srttb_diff;
1878 
1879 	var *= 2;
1880 	if (l_dcount < 8 || r_dcount < 8)
1881 		var *= 2;
1882 	if (loops == 1)
1883 		var *= 2;
1884 
1885 	return ((srttb_diff < 0) && (sqr_diff > var));
1886 }
1887 
1888 /**
1889  * gfs2_rgrp_used_recently
1890  * @rs: The block reservation with the rgrp to test
1891  * @msecs: The time limit in milliseconds
1892  *
1893  * Returns: True if the rgrp glock has been used within the time limit
1894  */
1895 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1896 				    u64 msecs)
1897 {
1898 	u64 tdiff;
1899 
1900 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1901                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1902 
1903 	return tdiff > (msecs * 1000 * 1000);
1904 }
1905 
1906 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1907 {
1908 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1909 	u32 skip;
1910 
1911 	get_random_bytes(&skip, sizeof(skip));
1912 	return skip % sdp->sd_rgrps;
1913 }
1914 
1915 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1916 {
1917 	struct gfs2_rgrpd *rgd = *pos;
1918 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1919 
1920 	rgd = gfs2_rgrpd_get_next(rgd);
1921 	if (rgd == NULL)
1922 		rgd = gfs2_rgrpd_get_first(sdp);
1923 	*pos = rgd;
1924 	if (rgd != begin) /* If we didn't wrap */
1925 		return true;
1926 	return false;
1927 }
1928 
1929 /**
1930  * fast_to_acquire - determine if a resource group will be fast to acquire
1931  *
1932  * If this is one of our preferred rgrps, it should be quicker to acquire,
1933  * because we tried to set ourselves up as dlm lock master.
1934  */
1935 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1936 {
1937 	struct gfs2_glock *gl = rgd->rd_gl;
1938 
1939 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1940 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1941 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1942 		return 1;
1943 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1944 		return 1;
1945 	return 0;
1946 }
1947 
1948 /**
1949  * gfs2_inplace_reserve - Reserve space in the filesystem
1950  * @ip: the inode to reserve space for
1951  * @ap: the allocation parameters
1952  *
1953  * We try our best to find an rgrp that has at least ap->target blocks
1954  * available. After a couple of passes (loops == 2), the prospects of finding
1955  * such an rgrp diminish. At this stage, we return the first rgrp that has
1956  * atleast ap->min_target blocks available. Either way, we set ap->allowed to
1957  * the number of blocks available in the chosen rgrp.
1958  *
1959  * Returns: 0 on success,
1960  *          -ENOMEM if a suitable rgrp can't be found
1961  *          errno otherwise
1962  */
1963 
1964 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
1965 {
1966 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1967 	struct gfs2_rgrpd *begin = NULL;
1968 	struct gfs2_blkreserv *rs = &ip->i_res;
1969 	int error = 0, rg_locked, flags = 0;
1970 	u64 last_unlinked = NO_BLOCK;
1971 	int loops = 0;
1972 	u32 skip = 0;
1973 
1974 	if (sdp->sd_args.ar_rgrplvb)
1975 		flags |= GL_SKIP;
1976 	if (gfs2_assert_warn(sdp, ap->target))
1977 		return -EINVAL;
1978 	if (gfs2_rs_active(rs)) {
1979 		begin = rs->rs_rbm.rgd;
1980 	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1981 		rs->rs_rbm.rgd = begin = ip->i_rgd;
1982 	} else {
1983 		check_and_update_goal(ip);
1984 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1985 	}
1986 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
1987 		skip = gfs2_orlov_skip(ip);
1988 	if (rs->rs_rbm.rgd == NULL)
1989 		return -EBADSLT;
1990 
1991 	while (loops < 3) {
1992 		rg_locked = 1;
1993 
1994 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1995 			rg_locked = 0;
1996 			if (skip && skip--)
1997 				goto next_rgrp;
1998 			if (!gfs2_rs_active(rs)) {
1999 				if (loops == 0 &&
2000 				    !fast_to_acquire(rs->rs_rbm.rgd))
2001 					goto next_rgrp;
2002 				if ((loops < 2) &&
2003 				    gfs2_rgrp_used_recently(rs, 1000) &&
2004 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2005 					goto next_rgrp;
2006 			}
2007 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2008 						   LM_ST_EXCLUSIVE, flags,
2009 						   &rs->rs_rgd_gh);
2010 			if (unlikely(error))
2011 				return error;
2012 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2013 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2014 				goto skip_rgrp;
2015 			if (sdp->sd_args.ar_rgrplvb) {
2016 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2017 				if (unlikely(error)) {
2018 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2019 					return error;
2020 				}
2021 			}
2022 		}
2023 
2024 		/* Skip unuseable resource groups */
2025 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2026 						 GFS2_RDF_ERROR)) ||
2027 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2028 			goto skip_rgrp;
2029 
2030 		if (sdp->sd_args.ar_rgrplvb)
2031 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2032 
2033 		/* Get a reservation if we don't already have one */
2034 		if (!gfs2_rs_active(rs))
2035 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2036 
2037 		/* Skip rgrps when we can't get a reservation on first pass */
2038 		if (!gfs2_rs_active(rs) && (loops < 1))
2039 			goto check_rgrp;
2040 
2041 		/* If rgrp has enough free space, use it */
2042 		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
2043 		    (loops == 2 && ap->min_target &&
2044 		     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
2045 			ip->i_rgd = rs->rs_rbm.rgd;
2046 			ap->allowed = ip->i_rgd->rd_free_clone;
2047 			return 0;
2048 		}
2049 check_rgrp:
2050 		/* Check for unlinked inodes which can be reclaimed */
2051 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2052 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2053 					ip->i_no_addr);
2054 skip_rgrp:
2055 		/* Drop reservation, if we couldn't use reserved rgrp */
2056 		if (gfs2_rs_active(rs))
2057 			gfs2_rs_deltree(rs);
2058 
2059 		/* Unlock rgrp if required */
2060 		if (!rg_locked)
2061 			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2062 next_rgrp:
2063 		/* Find the next rgrp, and continue looking */
2064 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2065 			continue;
2066 		if (skip)
2067 			continue;
2068 
2069 		/* If we've scanned all the rgrps, but found no free blocks
2070 		 * then this checks for some less likely conditions before
2071 		 * trying again.
2072 		 */
2073 		loops++;
2074 		/* Check that fs hasn't grown if writing to rindex */
2075 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2076 			error = gfs2_ri_update(ip);
2077 			if (error)
2078 				return error;
2079 		}
2080 		/* Flushing the log may release space */
2081 		if (loops == 2)
2082 			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
2083 	}
2084 
2085 	return -ENOSPC;
2086 }
2087 
2088 /**
2089  * gfs2_inplace_release - release an inplace reservation
2090  * @ip: the inode the reservation was taken out on
2091  *
2092  * Release a reservation made by gfs2_inplace_reserve().
2093  */
2094 
2095 void gfs2_inplace_release(struct gfs2_inode *ip)
2096 {
2097 	struct gfs2_blkreserv *rs = &ip->i_res;
2098 
2099 	if (rs->rs_rgd_gh.gh_gl)
2100 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2101 }
2102 
2103 /**
2104  * gfs2_get_block_type - Check a block in a RG is of given type
2105  * @rgd: the resource group holding the block
2106  * @block: the block number
2107  *
2108  * Returns: The block type (GFS2_BLKST_*)
2109  */
2110 
2111 static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
2112 {
2113 	struct gfs2_rbm rbm = { .rgd = rgd, };
2114 	int ret;
2115 
2116 	ret = gfs2_rbm_from_block(&rbm, block);
2117 	WARN_ON_ONCE(ret != 0);
2118 
2119 	return gfs2_testbit(&rbm);
2120 }
2121 
2122 
2123 /**
2124  * gfs2_alloc_extent - allocate an extent from a given bitmap
2125  * @rbm: the resource group information
2126  * @dinode: TRUE if the first block we allocate is for a dinode
2127  * @n: The extent length (value/result)
2128  *
2129  * Add the bitmap buffer to the transaction.
2130  * Set the found bits to @new_state to change block's allocation state.
2131  */
2132 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2133 			     unsigned int *n)
2134 {
2135 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2136 	const unsigned int elen = *n;
2137 	u64 block;
2138 	int ret;
2139 
2140 	*n = 1;
2141 	block = gfs2_rbm_to_block(rbm);
2142 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2143 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2144 	block++;
2145 	while (*n < elen) {
2146 		ret = gfs2_rbm_from_block(&pos, block);
2147 		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
2148 			break;
2149 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2150 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2151 		(*n)++;
2152 		block++;
2153 	}
2154 }
2155 
2156 /**
2157  * rgblk_free - Change alloc state of given block(s)
2158  * @sdp: the filesystem
2159  * @bstart: the start of a run of blocks to free
2160  * @blen: the length of the block run (all must lie within ONE RG!)
2161  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2162  *
2163  * Returns:  Resource group containing the block(s)
2164  */
2165 
2166 static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2167 				     u32 blen, unsigned char new_state)
2168 {
2169 	struct gfs2_rbm rbm;
2170 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2171 
2172 	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2173 	if (!rbm.rgd) {
2174 		if (gfs2_consist(sdp))
2175 			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
2176 		return NULL;
2177 	}
2178 
2179 	gfs2_rbm_from_block(&rbm, bstart);
2180 	while (blen--) {
2181 		bi = rbm_bi(&rbm);
2182 		if (bi != bi_prev) {
2183 			if (!bi->bi_clone) {
2184 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2185 						      GFP_NOFS | __GFP_NOFAIL);
2186 				memcpy(bi->bi_clone + bi->bi_offset,
2187 				       bi->bi_bh->b_data + bi->bi_offset,
2188 				       bi->bi_len);
2189 			}
2190 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2191 			bi_prev = bi;
2192 		}
2193 		gfs2_setbit(&rbm, false, new_state);
2194 		gfs2_rbm_incr(&rbm);
2195 	}
2196 
2197 	return rbm.rgd;
2198 }
2199 
2200 /**
2201  * gfs2_rgrp_dump - print out an rgrp
2202  * @seq: The iterator
2203  * @gl: The glock in question
2204  *
2205  */
2206 
2207 void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2208 {
2209 	struct gfs2_rgrpd *rgd = gl->gl_object;
2210 	struct gfs2_blkreserv *trs;
2211 	const struct rb_node *n;
2212 
2213 	if (rgd == NULL)
2214 		return;
2215 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2216 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2217 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2218 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2219 	spin_lock(&rgd->rd_rsspin);
2220 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2221 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2222 		dump_rs(seq, trs);
2223 	}
2224 	spin_unlock(&rgd->rd_rsspin);
2225 }
2226 
2227 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2228 {
2229 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2230 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2231 		(unsigned long long)rgd->rd_addr);
2232 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2233 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
2234 	rgd->rd_flags |= GFS2_RDF_ERROR;
2235 }
2236 
2237 /**
2238  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2239  * @ip: The inode we have just allocated blocks for
2240  * @rbm: The start of the allocated blocks
2241  * @len: The extent length
2242  *
2243  * Adjusts a reservation after an allocation has taken place. If the
2244  * reservation does not match the allocation, or if it is now empty
2245  * then it is removed.
2246  */
2247 
2248 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2249 				    const struct gfs2_rbm *rbm, unsigned len)
2250 {
2251 	struct gfs2_blkreserv *rs = &ip->i_res;
2252 	struct gfs2_rgrpd *rgd = rbm->rgd;
2253 	unsigned rlen;
2254 	u64 block;
2255 	int ret;
2256 
2257 	spin_lock(&rgd->rd_rsspin);
2258 	if (gfs2_rs_active(rs)) {
2259 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2260 			block = gfs2_rbm_to_block(rbm);
2261 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2262 			rlen = min(rs->rs_free, len);
2263 			rs->rs_free -= rlen;
2264 			rgd->rd_reserved -= rlen;
2265 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2266 			if (rs->rs_free && !ret)
2267 				goto out;
2268 			/* We used up our block reservation, so we should
2269 			   reserve more blocks next time. */
2270 			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
2271 		}
2272 		__rs_deltree(rs);
2273 	}
2274 out:
2275 	spin_unlock(&rgd->rd_rsspin);
2276 }
2277 
2278 /**
2279  * gfs2_set_alloc_start - Set starting point for block allocation
2280  * @rbm: The rbm which will be set to the required location
2281  * @ip: The gfs2 inode
2282  * @dinode: Flag to say if allocation includes a new inode
2283  *
2284  * This sets the starting point from the reservation if one is active
2285  * otherwise it falls back to guessing a start point based on the
2286  * inode's goal block or the last allocation point in the rgrp.
2287  */
2288 
2289 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2290 				 const struct gfs2_inode *ip, bool dinode)
2291 {
2292 	u64 goal;
2293 
2294 	if (gfs2_rs_active(&ip->i_res)) {
2295 		*rbm = ip->i_res.rs_rbm;
2296 		return;
2297 	}
2298 
2299 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2300 		goal = ip->i_goal;
2301 	else
2302 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2303 
2304 	gfs2_rbm_from_block(rbm, goal);
2305 }
2306 
2307 /**
2308  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2309  * @ip: the inode to allocate the block for
2310  * @bn: Used to return the starting block number
2311  * @nblocks: requested number of blocks/extent length (value/result)
2312  * @dinode: 1 if we're allocating a dinode block, else 0
2313  * @generation: the generation number of the inode
2314  *
2315  * Returns: 0 or error
2316  */
2317 
2318 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2319 		      bool dinode, u64 *generation)
2320 {
2321 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2322 	struct buffer_head *dibh;
2323 	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
2324 	unsigned int ndata;
2325 	u64 block; /* block, within the file system scope */
2326 	int error;
2327 
2328 	gfs2_set_alloc_start(&rbm, ip, dinode);
2329 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2330 
2331 	if (error == -ENOSPC) {
2332 		gfs2_set_alloc_start(&rbm, ip, dinode);
2333 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2334 	}
2335 
2336 	/* Since all blocks are reserved in advance, this shouldn't happen */
2337 	if (error) {
2338 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2339 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2340 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2341 			rbm.rgd->rd_extfail_pt);
2342 		goto rgrp_error;
2343 	}
2344 
2345 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2346 	block = gfs2_rbm_to_block(&rbm);
2347 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2348 	if (gfs2_rs_active(&ip->i_res))
2349 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2350 	ndata = *nblocks;
2351 	if (dinode)
2352 		ndata--;
2353 
2354 	if (!dinode) {
2355 		ip->i_goal = block + ndata - 1;
2356 		error = gfs2_meta_inode_buffer(ip, &dibh);
2357 		if (error == 0) {
2358 			struct gfs2_dinode *di =
2359 				(struct gfs2_dinode *)dibh->b_data;
2360 			gfs2_trans_add_meta(ip->i_gl, dibh);
2361 			di->di_goal_meta = di->di_goal_data =
2362 				cpu_to_be64(ip->i_goal);
2363 			brelse(dibh);
2364 		}
2365 	}
2366 	if (rbm.rgd->rd_free < *nblocks) {
2367 		pr_warn("nblocks=%u\n", *nblocks);
2368 		goto rgrp_error;
2369 	}
2370 
2371 	rbm.rgd->rd_free -= *nblocks;
2372 	if (dinode) {
2373 		rbm.rgd->rd_dinodes++;
2374 		*generation = rbm.rgd->rd_igeneration++;
2375 		if (*generation == 0)
2376 			*generation = rbm.rgd->rd_igeneration++;
2377 	}
2378 
2379 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2380 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2381 	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
2382 
2383 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2384 	if (dinode)
2385 		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2386 
2387 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2388 
2389 	rbm.rgd->rd_free_clone -= *nblocks;
2390 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2391 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2392 	*bn = block;
2393 	return 0;
2394 
2395 rgrp_error:
2396 	gfs2_rgrp_error(rbm.rgd);
2397 	return -EIO;
2398 }
2399 
2400 /**
2401  * __gfs2_free_blocks - free a contiguous run of block(s)
2402  * @ip: the inode these blocks are being freed from
2403  * @bstart: first block of a run of contiguous blocks
2404  * @blen: the length of the block run
2405  * @meta: 1 if the blocks represent metadata
2406  *
2407  */
2408 
2409 void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
2410 {
2411 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2412 	struct gfs2_rgrpd *rgd;
2413 
2414 	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
2415 	if (!rgd)
2416 		return;
2417 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2418 	rgd->rd_free += blen;
2419 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2420 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2421 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2422 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2423 
2424 	/* Directories keep their data in the metadata address space */
2425 	if (meta || ip->i_depth)
2426 		gfs2_meta_wipe(ip, bstart, blen);
2427 }
2428 
2429 /**
2430  * gfs2_free_meta - free a contiguous run of data block(s)
2431  * @ip: the inode these blocks are being freed from
2432  * @bstart: first block of a run of contiguous blocks
2433  * @blen: the length of the block run
2434  *
2435  */
2436 
2437 void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
2438 {
2439 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2440 
2441 	__gfs2_free_blocks(ip, bstart, blen, 1);
2442 	gfs2_statfs_change(sdp, 0, +blen, 0);
2443 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2444 }
2445 
2446 void gfs2_unlink_di(struct inode *inode)
2447 {
2448 	struct gfs2_inode *ip = GFS2_I(inode);
2449 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2450 	struct gfs2_rgrpd *rgd;
2451 	u64 blkno = ip->i_no_addr;
2452 
2453 	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
2454 	if (!rgd)
2455 		return;
2456 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2457 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2458 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2459 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2460 	update_rgrp_lvb_unlinked(rgd, 1);
2461 }
2462 
2463 static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
2464 {
2465 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2466 	struct gfs2_rgrpd *tmp_rgd;
2467 
2468 	tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
2469 	if (!tmp_rgd)
2470 		return;
2471 	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
2472 
2473 	if (!rgd->rd_dinodes)
2474 		gfs2_consist_rgrpd(rgd);
2475 	rgd->rd_dinodes--;
2476 	rgd->rd_free++;
2477 
2478 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2479 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2480 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2481 	update_rgrp_lvb_unlinked(rgd, -1);
2482 
2483 	gfs2_statfs_change(sdp, 0, +1, -1);
2484 }
2485 
2486 
2487 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2488 {
2489 	gfs2_free_uninit_di(rgd, ip->i_no_addr);
2490 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2491 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2492 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2493 }
2494 
2495 /**
2496  * gfs2_check_blk_type - Check the type of a block
2497  * @sdp: The superblock
2498  * @no_addr: The block number to check
2499  * @type: The block type we are looking for
2500  *
2501  * Returns: 0 if the block type matches the expected type
2502  *          -ESTALE if it doesn't match
2503  *          or -ve errno if something went wrong while checking
2504  */
2505 
2506 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2507 {
2508 	struct gfs2_rgrpd *rgd;
2509 	struct gfs2_holder rgd_gh;
2510 	int error = -EINVAL;
2511 
2512 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2513 	if (!rgd)
2514 		goto fail;
2515 
2516 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2517 	if (error)
2518 		goto fail;
2519 
2520 	if (gfs2_get_block_type(rgd, no_addr) != type)
2521 		error = -ESTALE;
2522 
2523 	gfs2_glock_dq_uninit(&rgd_gh);
2524 fail:
2525 	return error;
2526 }
2527 
2528 /**
2529  * gfs2_rlist_add - add a RG to a list of RGs
2530  * @ip: the inode
2531  * @rlist: the list of resource groups
2532  * @block: the block
2533  *
2534  * Figure out what RG a block belongs to and add that RG to the list
2535  *
2536  * FIXME: Don't use NOFAIL
2537  *
2538  */
2539 
2540 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2541 		    u64 block)
2542 {
2543 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2544 	struct gfs2_rgrpd *rgd;
2545 	struct gfs2_rgrpd **tmp;
2546 	unsigned int new_space;
2547 	unsigned int x;
2548 
2549 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2550 		return;
2551 
2552 	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
2553 		rgd = ip->i_rgd;
2554 	else
2555 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2556 	if (!rgd) {
2557 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
2558 		return;
2559 	}
2560 	ip->i_rgd = rgd;
2561 
2562 	for (x = 0; x < rlist->rl_rgrps; x++)
2563 		if (rlist->rl_rgd[x] == rgd)
2564 			return;
2565 
2566 	if (rlist->rl_rgrps == rlist->rl_space) {
2567 		new_space = rlist->rl_space + 10;
2568 
2569 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2570 			      GFP_NOFS | __GFP_NOFAIL);
2571 
2572 		if (rlist->rl_rgd) {
2573 			memcpy(tmp, rlist->rl_rgd,
2574 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2575 			kfree(rlist->rl_rgd);
2576 		}
2577 
2578 		rlist->rl_space = new_space;
2579 		rlist->rl_rgd = tmp;
2580 	}
2581 
2582 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2583 }
2584 
2585 /**
2586  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2587  *      and initialize an array of glock holders for them
2588  * @rlist: the list of resource groups
2589  * @state: the lock state to acquire the RG lock in
2590  *
2591  * FIXME: Don't use NOFAIL
2592  *
2593  */
2594 
2595 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
2596 {
2597 	unsigned int x;
2598 
2599 	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
2600 				GFP_NOFS | __GFP_NOFAIL);
2601 	for (x = 0; x < rlist->rl_rgrps; x++)
2602 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2603 				state, 0,
2604 				&rlist->rl_ghs[x]);
2605 }
2606 
2607 /**
2608  * gfs2_rlist_free - free a resource group list
2609  * @rlist: the list of resource groups
2610  *
2611  */
2612 
2613 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2614 {
2615 	unsigned int x;
2616 
2617 	kfree(rlist->rl_rgd);
2618 
2619 	if (rlist->rl_ghs) {
2620 		for (x = 0; x < rlist->rl_rgrps; x++)
2621 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2622 		kfree(rlist->rl_ghs);
2623 		rlist->rl_ghs = NULL;
2624 	}
2625 }
2626 
2627