xref: /linux/fs/ocfs2/suballoc.c (revision 9d9c1cfec01cdbf24bd9322ed555713a20422115)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * suballoc.c
4  *
5  * metadata alloc and free
6  * Inspired by ext3 block groups.
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  */
10 
11 #include <linux/fs.h>
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <linux/string.h>
15 #include <linux/highmem.h>
16 
17 #include <cluster/masklog.h>
18 
19 #include "ocfs2.h"
20 
21 #include "alloc.h"
22 #include "blockcheck.h"
23 #include "dlmglue.h"
24 #include "inode.h"
25 #include "journal.h"
26 #include "localalloc.h"
27 #include "suballoc.h"
28 #include "super.h"
29 #include "sysfile.h"
30 #include "uptodate.h"
31 #include "ocfs2_trace.h"
32 
33 #include "buffer_head_io.h"
34 
35 #define NOT_ALLOC_NEW_GROUP		0
36 #define ALLOC_NEW_GROUP			0x1
37 #define ALLOC_GROUPS_FROM_GLOBAL	0x2
38 
39 #define OCFS2_MAX_TO_STEAL		1024
40 
/*
 * Result of a single suballocator search/claim.  Filled in by the
 * group search functions and consumed by ocfs2_claim_suballoc_bits()
 * and friends.
 */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};
59 
ocfs2_group_from_res(struct ocfs2_suballoc_result * res)60 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
61 {
62 	if (res->sr_blkno == 0)
63 		return 0;
64 
65 	if (res->sr_bg_blkno)
66 		return res->sr_bg_blkno;
67 
68 	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
69 }
70 
71 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
72 static int ocfs2_block_group_fill(handle_t *handle,
73 				  struct inode *alloc_inode,
74 				  struct buffer_head *bg_bh,
75 				  u64 group_blkno,
76 				  unsigned int group_clusters,
77 				  u16 my_chain,
78 				  struct ocfs2_chain_list *cl);
79 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
80 				   struct inode *alloc_inode,
81 				   struct buffer_head *bh,
82 				   u64 max_block,
83 				   u64 *last_alloc_group,
84 				   int flags);
85 
86 static int ocfs2_cluster_group_search(struct inode *inode,
87 				      struct buffer_head *group_bh,
88 				      u32 bits_wanted, u32 min_bits,
89 				      u64 max_block,
90 				      struct ocfs2_suballoc_result *res);
91 static int ocfs2_block_group_search(struct inode *inode,
92 				    struct buffer_head *group_bh,
93 				    u32 bits_wanted, u32 min_bits,
94 				    u64 max_block,
95 				    struct ocfs2_suballoc_result *res);
96 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
97 				     handle_t *handle,
98 				     u32 bits_wanted,
99 				     u32 min_bits,
100 				     struct ocfs2_suballoc_result *res);
101 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
102 					 int nr);
103 static int ocfs2_relink_block_group(handle_t *handle,
104 				    struct inode *alloc_inode,
105 				    struct buffer_head *fe_bh,
106 				    struct buffer_head *bg_bh,
107 				    struct buffer_head *prev_bg_bh,
108 				    u16 chain);
109 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
110 						     u32 wanted);
111 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
112 						   u64 bg_blkno,
113 						   u16 bg_bit_off);
114 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
115 						u64 data_blkno,
116 						u64 *bg_blkno,
117 						u16 *bg_bit_off);
118 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
119 					     u32 bits_wanted, u64 max_block,
120 					     int flags,
121 					     struct ocfs2_alloc_context **ac);
122 
/*
 * Release everything an allocation context holds: the allocator inode
 * (cluster lock, i_rwsem and reference), the allocator dinode buffer
 * and any cached find_loc state.  The context struct itself is not
 * freed; see ocfs2_free_alloc_context() for that.
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		/*
		 * The cluster lock is only held for non-local
		 * contexts; OCFS2_AC_USE_LOCAL never took it.
		 */
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}
142 
/* Drop all resources held by @ac, then free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}
148 
ocfs2_bits_per_group(struct ocfs2_chain_list * cl)149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153 
/*
 * Group descriptor validation helper.  During resize (resize != 0)
 * the problem is only logged; otherwise the corruption is fatal and
 * this macro *returns from the enclosing function* with the
 * ocfs2_error() result.  Only use it inside the validators below,
 * which return int and have 'sb' and 'resize' in scope.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)
161 
/*
 * Check a group descriptor's self-consistency: signature, recorded
 * block number, fs generation and bit counts.  With resize == 0 any
 * failure returns via do_error() -> ocfs2_error(); with resize != 0
 * failures are only logged and 0 is returned.
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	/* The descriptor must claim to live where we read it from. */
	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	/* Free bits can never exceed the total bit count ... */
	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	/* ... and the total can never exceed what the bitmap can hold. */
	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}
202 
/*
 * Check a group descriptor against its owning chain allocator dinode
 * @di: parent pointer, bit count versus the allocator's group size,
 * and chain index.  Error handling follows do_error()/@resize
 * semantics (log-only during resize, fatal otherwise).
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* Both sides are __le64, so raw comparison is fine for equality. */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}
237 
238 #undef do_error
239 
/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.  Failures are still reported to the caller
 * via the return code.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}
270 
/*
 * Buffer validation callback for ocfs2_read_group_descriptor(): a
 * non-fatal ecc check followed by fatal (resize == 0) self checks.
 */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}
297 
/*
 * Read a group descriptor block, running both the self checks (via
 * the read validator) and the parent checks against @di.  *bh may be
 * NULL on entry, in which case the freshly read buffer is handed back
 * through it on success.
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		goto out;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		/*
		 * NOTE(review): when the caller passed in a non-NULL
		 * *bh, tmp aliases it and this drops the caller's
		 * reference without clearing *bh — verify callers
		 * don't reuse it after a failure.
		 */
		brelse(tmp);
		goto out;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

out:
	return rc;
}
322 
/*
 * Append a physical cluster run to a discontiguous block group's
 * embedded extent list and grow the group's bit counts to match.
 * Callers ensure there is a free record (l_next_free_rec < l_count);
 * the list is lazily sized on first use.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent: initialize the embedded list's capacity. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* Logical offset = bits accumulated so far, in clusters. */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}
344 
/*
 * Initialize a freshly claimed group descriptor block at @group_blkno
 * under journal CREATE access: signature, generation, chain linkage
 * onto chain @my_chain, and the group bitmap with bit 0 set for the
 * descriptor block itself.  If @group_clusters is smaller than a full
 * group (cl_cpg), the bits are accounted through a discontig extent
 * record instead of bg_bits directly.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	/* Link ahead of the current chain head; the dinode is updated
	 * by the caller. */
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}
406 
ocfs2_find_smallest_chain(struct ocfs2_chain_list * cl)407 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
408 {
409 	u16 curr, best;
410 
411 	best = curr = 0;
412 	while (curr < le16_to_cpu(cl->cl_count)) {
413 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
414 		    le32_to_cpu(cl->cl_recs[curr].c_total))
415 			best = curr;
416 		curr++;
417 	}
418 	return best;
419 }
420 
/*
 * Claim one contiguous run of cl_cpg clusters and initialize it as a
 * new block group on the smallest chain.  Returns the descriptor's
 * buffer_head on success or an ERR_PTR (-ENOSPC when no contiguous
 * run of that size is available).
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}
465 
/*
 * Claim a run of clusters, halving the requested minimum each time
 * the allocator reports -ENOSPC.  Gives up when the minimum reaches
 * zero, returning the last -ENOSPC; any other status (success or a
 * different error) is returned immediately.
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	for (; min_bits; min_bits >>= 1) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;
	}

	return status;
}
485 
/*
 * Grow a partially built discontiguous group to the full cl_cpg
 * clusters by repeatedly claiming runs (shrinking the minimum as the
 * allocator fragments) and appending them as extents.  Fails with
 * -ENOSPC when the embedded extent list fills up first; the caller is
 * responsible for cleaning up the partial group.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	/* Clusters still missing from a full group. */
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		/* Never ask for more than we still need. */
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}
547 
/*
 * Undo a failed discontiguous group allocation: free every cluster
 * run recorded in the group's embedded extent list, then drop the
 * descriptor block from the metadata cache and release its buffer.
 * Frees are best-effort; errors are only logged so all extents get a
 * chance to be returned.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}
577 
/*
 * Build a block group out of several smaller cluster runs when a
 * contiguous cl_cpg-sized run could not be found.  Requires the
 * discontig-bg feature.  The transaction is extended for the extra
 * group-descriptor credits and chain relinking is disabled for the
 * life of this context.  Returns the descriptor bh or an ERR_PTR;
 * any partially built group is torn down before returning an error.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	/* Start by asking for half a full group. */
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}
652 
/*
 * Allocate and initialize a brand-new block group for @alloc_inode's
 * chain allocator, preferring a contiguous run of cl_cpg clusters and
 * falling back to a discontiguous group.  The allocator dinode is
 * updated (chain record, bitmap totals, i_clusters/i_size) in the
 * same transaction.
 *
 * We expect the block group allocator to already be locked.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* The global cluster bitmap never grows via this path. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Resume searching where the last group allocation succeeded. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Link the new group at the head of its chain record ... */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	/* ... and fold its bits into the allocator-wide counts. */
	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}
768 
/*
 * Take the allocator system inode of @type/@slot (i_rwsem + cluster
 * lock) and ensure it has at least ac->ac_bits_wanted free bits,
 * growing it with a new block group when @flags allows.  On success
 * ac->ac_inode and ac->ac_bh are set; both locks stay held until
 * ocfs2_free_ac_resource() releases them.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}
860 
/* Forget the cached inode-steal victim slot and reset its counter. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}
868 
/* Forget the cached metadata-steal victim slot and reset its counter. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}
876 
/* Reset both inode and metadata steal state (e.g. at mount time). */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
882 
__ocfs2_set_steal_slot(struct ocfs2_super * osb,int slot,int type)883 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
884 {
885 	spin_lock(&osb->osb_lock);
886 	if (type == INODE_ALLOC_SYSTEM_INODE)
887 		osb->s_inode_steal_slot = (u16)slot;
888 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
889 		osb->s_meta_steal_slot = (u16)slot;
890 	spin_unlock(&osb->osb_lock);
891 }
892 
__ocfs2_get_steal_slot(struct ocfs2_super * osb,int type)893 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
894 {
895 	int slot = OCFS2_INVALID_SLOT;
896 
897 	spin_lock(&osb->osb_lock);
898 	if (type == INODE_ALLOC_SYSTEM_INODE)
899 		slot = osb->s_inode_steal_slot;
900 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
901 		slot = osb->s_meta_steal_slot;
902 	spin_unlock(&osb->osb_lock);
903 
904 	return slot;
905 }
906 
/* Cached steal slot for inode allocation, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}
911 
/* Cached steal slot for metadata allocation, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}
916 
/*
 * Try to reserve bits from other slots' allocators, starting at the
 * cached steal slot (or just after our own slot when none is cached)
 * and wrapping around, skipping our own slot.  Stealing never grows
 * the victim's allocator (NOT_ALLOC_NEW_GROUP).  The slot that
 * satisfied us is remembered for next time.  Returns -ENOSPC when
 * every other slot fails.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		/* Drop the failed slot's locks before trying the next one. */
		ocfs2_free_ac_resource(ac);
	}

	return status;
}
949 
/* Steal inode allocator bits from another slot. */
static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}
955 
/* Steal extent (metadata) allocator bits from another slot. */
static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}
961 
/*
 * Reserve @blocks bits of extent metadata.
 *
 * First tries our own slot's extent allocator (allowed to grow new
 * groups from the global bitmap).  On -ENOSPC it falls back to
 * stealing from other slots, and once stealing has started it keeps
 * stealing (up to OCFS2_MAX_TO_STEAL times) before giving our own
 * slot another chance, to avoid hammering an exhausted allocator.
 *
 * On success *ac holds the reservation and must be released with
 * ocfs2_free_alloc_context(); on failure *ac is freed and set NULL.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Keep stealing while a steal slot is set and the cap isn't hit. */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* Our own slot works again -- stop stealing. */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1023 
/*
 * Reserve enough metadata blocks for extending the tree rooted at
 * @root_el.  Thin wrapper around ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}
1032 
/*
 * Reserve one bit in an inode allocator for a new inode.
 *
 * Tries our own slot's inode allocator first (allowed to grow new
 * groups from the global bitmap), falling back to stealing from other
 * slots on -ENOSPC; see the comment on the steal slot below for the
 * stealing policy.
 *
 * On success *ac holds the reservation and must be released with
 * ocfs2_free_alloc_context(); on failure *ac is freed and set NULL.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	/* alloc_group is an in/out hint for which group to search first. */
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Remember the group hint for the next allocation. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1124 
1125 /* local alloc code has to do the same thing, so rather than do this
1126  * twice.. */
ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super * osb,struct ocfs2_alloc_context * ac)1127 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1128 				      struct ocfs2_alloc_context *ac)
1129 {
1130 	int status;
1131 
1132 	ac->ac_which = OCFS2_AC_USE_MAIN;
1133 	ac->ac_group_search = ocfs2_cluster_group_search;
1134 
1135 	status = ocfs2_reserve_suballoc_bits(osb, ac,
1136 					     GLOBAL_BITMAP_SYSTEM_INODE,
1137 					     OCFS2_INVALID_SLOT, NULL,
1138 					     ALLOC_NEW_GROUP);
1139 	if (status < 0 && status != -ENOSPC)
1140 		mlog_errno(status);
1141 
1142 	return status;
1143 }
1144 
/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit. */
/*
 * Reserve @bits_wanted clusters, preferring the node's local alloc
 * window and falling back to the global cluster bitmap.  @max_block,
 * when non-zero, caps how far out on disk the allocation may land.
 * If the global bitmap is full we flush the truncate log once -- it
 * may be pinning freed clusters -- and retry, dropping and re-taking
 * the allocator inode locks around the flush.
 *
 * On success *ac must be released with ocfs2_free_alloc_context();
 * on failure *ac is freed and set to NULL.
 */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/*
			 * (*ac)->ac_inode is the global bitmap inode --
			 * presumably locked by the reservation attempt above.
			 * Its locks must be dropped before flushing the
			 * truncate log to avoid deadlock.
			 */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				/* Clusters were freed; retry with a fresh
				 * reservation (which re-takes the inode). */
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			/* Nothing freed -- re-take the locks we dropped. */
			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1225 
/* Reserve @bits_wanted clusters with no upper block-number limit. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}
1233 
1234 /*
1235  * More or less lifted from ext3. I'll leave their description below:
1236  *
1237  * "For ext3 allocations, we must not reuse any blocks which are
1238  * allocated in the bitmap buffer's "last committed data" copy.  This
1239  * prevents deletes from freeing up the page for reuse until we have
1240  * committed the delete transaction.
1241  *
1242  * If we didn't do this, then deleting something and reallocating it as
1243  * data would allow the old block to be overwritten before the
1244  * transaction committed (because we force data to disk before commit).
1245  * This would lead to corruption if we crashed between overwriting the
1246  * data and committing the delete.
1247  *
1248  * @@@ We may want to make this allocation behaviour conditional on
1249  * data-writes at some point, and disable it for metadata allocations or
1250  * sync-data inodes."
1251  *
1252  * Note: OCFS2 already does this differently for metadata vs data
1253  * allocations, as those bitmaps are separate and undo access is never
1254  * called on a metadata group descriptor.
1255  */
ocfs2_test_bg_bit_allocatable(struct buffer_head * bg_bh,int nr)1256 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1257 					 int nr)
1258 {
1259 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1260 	struct journal_head *jh;
1261 	int ret;
1262 
1263 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1264 		return 0;
1265 
1266 	jh = jbd2_journal_grab_journal_head(bg_bh);
1267 	if (!jh)
1268 		return 1;
1269 
1270 	spin_lock(&jh->b_state_lock);
1271 	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
1272 	if (bg)
1273 		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1274 	else
1275 		ret = 1;
1276 	spin_unlock(&jh->b_state_lock);
1277 	jbd2_journal_put_journal_head(jh);
1278 
1279 	return ret;
1280 }
1281 
/*
 * Return the length of the longest run of clear (free) bits in
 * @bitmap, scanning from @start up to @total_bits.
 */
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
			 u16 total_bits, u16 start)
{
	u16 longest = 0;
	u16 run_start, run_end = start;

	while (run_end < total_bits) {
		run_start = ocfs2_find_next_zero_bit(bitmap, total_bits,
						     run_end);
		if (run_start == total_bits)
			break;

		run_end = ocfs2_find_next_bit(bitmap, total_bits, run_start);
		if (run_end - run_start > longest)
			longest = run_end - run_start;
	}

	return longest;
}
1301 
/*
 * Scan group @bg_bh for the best run of allocatable clear bits.
 *
 * A clear bit counts only if ocfs2_test_bg_bit_allocatable() also
 * approves it against the journal's committed copy.  The scan stops as
 * soon as a run of @bits_wanted bits is found; otherwise the longest
 * run seen wins.  On success res->sr_bit_offset/sr_bits describe the
 * chosen run (possibly shorter than @bits_wanted) and
 * res->sr_max_contig_bits holds the best run length seen *before* the
 * chosen one -- the chosen run itself is about to be consumed.
 * Returns 0 or -ENOSPC if no allocatable bits were found.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			/* a new run begins; snapshot the best so far */
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}
1363 
/*
 * Mark @num_bits bits used, starting at @bit_off, in group @bg and
 * journal the group buffer.
 *
 * Also maintains bg_free_bits_count and, for cluster groups, the
 * cached bg_contig_free_bits.  With @fastpath set, the caller already
 * knows the resulting contiguous-free length and passes it in
 * @max_contig_bits, so the bitmap re-scan is skipped.  Returns 0 or a
 * negative error; a free-bits-count underflow marks the fs corrupt via
 * ocfs2_error().
 */
int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits,
					     unsigned int max_contig_bits,
					     int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Cluster bitmap writes need undo access so a crash mid-delete
	 * can't leak freed-but-uncommitted clusters into new files. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/* u16 underflow: the count went below zero. */
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * this is optimize path, caller set old contig value
	 * in max_contig_bits to bypass finding action.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
1438 
1439 /* find the one with the most empty bits */
ocfs2_find_victim_chain(struct ocfs2_chain_list * cl)1440 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1441 {
1442 	u16 curr, best;
1443 
1444 	BUG_ON(!cl->cl_next_free_rec);
1445 
1446 	best = curr = 0;
1447 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1448 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1449 		    le32_to_cpu(cl->cl_recs[best].c_free))
1450 			best = curr;
1451 		curr++;
1452 	}
1453 
1454 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1455 	return best;
1456 }
1457 
/*
 * Move group @bg_bh to the head of chain @chain.
 *
 * Three journaled updates: the predecessor @prev_bg_bh is pointed past
 * @bg, @bg inherits the old chain head as its next group, and the
 * chain record in @fe_bh is pointed at @bg.  If journal access fails
 * partway, the in-memory next-group pointers already modified are
 * rolled back so the cached blocks stay consistent with disk.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save the original pointers in case we need to roll back. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}
1522 
/*
 * A group is "reasonably empty" when it has strictly more free bits
 * than the allocation being made -- worth relinking to the chain head.
 */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	u32 free_bits = le16_to_cpu(bg->bg_free_bits_count);

	return free_bits > wanted ? 1 : 0;
}
1528 
/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error. */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Fast reject: the cached longest free run is too short. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may un-initialized, so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Enforce the caller's upper disk-block limit. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}
1599 
/*
 * Group-search callback for the block suballocators (inode/extent
 * metadata).  Returns 0 with res filled in on success, -ENOSPC to keep
 * searching, or another negative error.
 */
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res)
{
	int status;
	u64 last_block;
	struct ocfs2_group_desc *gd =
		(struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON(min_bits != 1);
	BUG_ON(ocfs2_is_cluster_bitmap(inode));

	if (le16_to_cpu(gd->bg_free_bits_count) < bits_wanted)
		return -ENOSPC;

	status = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
						   group_bh, bits_wanted,
						   le16_to_cpu(gd->bg_bits),
						   res);
	if (status || !max_block)
		return status;

	/* Honor the caller's upper block limit (e.g. 32-bit i_ino). */
	last_block = le64_to_cpu(gd->bg_blkno) +
		     res->sr_bit_offset + res->sr_bits;
	trace_ocfs2_block_group_search_max_block(
		(unsigned long long)last_block,
		(unsigned long long)max_block);
	if (last_block > max_block)
		return -ENOSPC;

	return status;
}
1631 
/*
 * Account @num_bits newly-allocated bits in the allocator dinode
 * @di_bh: bump the bitmap's used count and shrink chain @chain's free
 * count.  The dinode buffer is journaled here; the caller must already
 * hold a transaction.  Returns 0 or a negative journal error.
 */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				       handle_t *handle,
				       struct buffer_head *di_bh,
				       u32 num_bits,
				       u16 chain)
{
	int status;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl =
		(struct ocfs2_chain_list *) &di->id2.i_chain;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	le32_add_cpu(&di->id1.bitmap1.i_used, num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

	return 0;
}
1658 
/*
 * Undo ocfs2_alloc_dinode_update_counts() after a failed bitmap
 * update: give @num_bits back to chain @chain and drop the used
 * count.  Pure in-memory fixup -- no journaling is done here.
 */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
				       struct buffer_head *di_bh,
				       u32 num_bits,
				       u16 chain)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl =
		(struct ocfs2_chain_list *) &di->id2.i_chain;

	le32_add_cpu(&di->id1.bitmap1.i_used, -num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}
1673 
ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result * res,struct ocfs2_extent_rec * rec,struct ocfs2_chain_list * cl)1674 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1675 					 struct ocfs2_extent_rec *rec,
1676 					 struct ocfs2_chain_list *cl)
1677 {
1678 	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1679 	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1680 	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1681 
1682 	if (res->sr_bit_offset < bitoff)
1683 		return 0;
1684 	if (res->sr_bit_offset >= (bitoff + bitcount))
1685 		return 0;
1686 	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1687 		(res->sr_bit_offset - bitoff);
1688 	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1689 		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1690 	return 1;
1691 }
1692 
/*
 * Turn a group search result (bit offset within the group) into a
 * disk block number in res->sr_blkno.
 *
 * For the global cluster bitmap sr_blkno stays 0 (those callers work
 * in clusters, not blocks).  For a contiguous group the block is just
 * group + offset and sr_bg_blkno is cleared; for a discontiguous
 * group we locate the extent record containing the offset and restore
 * sr_bg_blkno.
 */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}
1722 
/*
 * Search the single group at res->sr_bg_blkno and, on success, claim
 * the bits found there.
 *
 * Runs ac->ac_group_search on the group, fixes up the result for
 * discontiguous groups, then -- unless ac->ac_find_loc_only -- updates
 * the allocator dinode counts and sets the bits in the group bitmap,
 * rolling the counts back if setting the bits fails.  *bits_left gets
 * the group's remaining free bits.  Returns 0, -ENOSPC, or another
 * negative error.
 */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_group_descriptor(alloc_inode, di,
					  res->sr_bg_blkno, &group_bh);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		/* Bitmap update failed; give the bits back to the dinode. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}
1790 
/*
 * Walk chain ac->ac_chain looking for a group with @bits_wanted free
 * bits (at least @min_bits) and claim them.
 *
 * Groups are visited head-to-tail via bg_next_group.  On success the
 * result is fixed up for discontiguous groups, the winning group may
 * be relinked to the head of its chain (see the comment below), and --
 * unless ac->ac_find_loc_only -- the bits are claimed by journaling
 * the dinode counts and the group bitmap.  *bits_left returns the
 * winning group's remaining free bits.  Returns 0, -ENOSPC when the
 * whole chain is too full, or another negative error.
 */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			/*
			 * Discontiguous requests may shrink to this
			 * group's largest contiguous run, as long as it
			 * still covers min_bits.
			 */
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
				bits_wanted, min_bits,
				ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		/* Bitmap update failed; give the bits back to the dinode. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
			(unsigned long long)le64_to_cpu(fe->i_blkno),
			res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}
1944 
1945 /* will give out up to bits_wanted contiguous bits. */
/*
 * Claim up to @bits_wanted bits (at least @min_bits contiguous) from
 * the suballocator described by @ac.  The search order is:
 *
 *   1. The hinted block group (ac->ac_last_group), if any.
 *   2. The "victim" chain with the most free bits.
 *   3. Every other chain, in order, with relinking disabled.
 *
 * On success @res describes the allocation and the group hint is
 * updated for the next caller.  Returns -ENOSPC when no chain can
 * satisfy the request.
 */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	/* A full allocator can never satisfy us; flag on-disk
	 * inconsistency rather than searching. */
	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left);
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		/* The cluster bitmap hints by group block directly;
		 * other suballocators derive the group from the
		 * result (it may have been fixed up for discontig
		 * groups). */
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2077 
/*
 * Claim up to @bits_wanted metadata blocks from the suballocator on
 * @ac.  On success the group and starting bit of the allocation are
 * returned through @suballoc_loc / @suballoc_bit_start / @blkno_start,
 * and @num_bits reports how many bits were actually claimed.
 */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	/* Metadata never has to be contiguous, hence min_bits == 1. */
	status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	*num_bits = res.sr_bits;
	*blkno_start = res.sr_blkno;
	*suballoc_bit_start = res.sr_bit_offset;
	*suballoc_loc = res.sr_bg_blkno;
	ac->ac_bits_given += res.sr_bits;
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}
2115 
ocfs2_init_inode_ac_group(struct inode * dir,struct buffer_head * parent_di_bh,struct ocfs2_alloc_context * ac)2116 static void ocfs2_init_inode_ac_group(struct inode *dir,
2117 				      struct buffer_head *parent_di_bh,
2118 				      struct ocfs2_alloc_context *ac)
2119 {
2120 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2121 	/*
2122 	 * Try to allocate inodes from some specific group.
2123 	 *
2124 	 * If the parent dir has recorded the last group used in allocation,
2125 	 * cool, use it. Otherwise if we try to allocate new inode from the
2126 	 * same slot the parent dir belongs to, use the same chunk.
2127 	 *
2128 	 * We are very careful here to avoid the mistake of setting
2129 	 * ac_last_group to a group descriptor from a different (unlocked) slot.
2130 	 */
2131 	if (OCFS2_I(dir)->ip_last_used_group &&
2132 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2133 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2134 	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2135 		if (di->i_suballoc_loc)
2136 			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2137 		else
2138 			ac->ac_last_group = ocfs2_which_suballoc_group(
2139 					le64_to_cpu(di->i_blkno),
2140 					le16_to_cpu(di->i_suballoc_bit));
2141 	}
2142 }
2143 
ocfs2_save_inode_ac_group(struct inode * dir,struct ocfs2_alloc_context * ac)2144 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2145 					     struct ocfs2_alloc_context *ac)
2146 {
2147 	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2148 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2149 }
2150 
/*
 * Find (but do not yet claim) a location for a new inode under @dir.
 *
 * The search result is stashed in ac->ac_find_loc_priv and the chosen
 * inode block number is returned via @fe_blkno.  The actual bitmap
 * update happens later, in ocfs2_claim_new_inode_at_loc().
 */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	res = kzalloc(sizeof(*res), GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Ownership of @res passes to the context on success. */
	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	/* On error nothing points at @res yet, so drop it here. */
	if (ret)
		kfree(res);

	return ret;
}
2211 
/*
 * Claim the inode location previously discovered by
 * ocfs2_find_new_inode_loc().  @di_blkno must be the block number that
 * call handed back; the saved search result in ac->ac_find_loc_priv
 * describes which group and bit to set.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	/* sr_bg_stable_blkno is the descriptor block we searched; it is
	 * not fixed up for discontig groups, so it is safe to re-read. */
	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		/* Undo the dinode counters updated just above. */
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	/* An inode allocation is always exactly one bit. */
	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}
2284 
/*
 * Allocate exactly one inode bit for a new inode under @dir.  The
 * group, bit offset, and inode block number are returned through
 * @suballoc_loc / @suballoc_bit / @fe_blkno.
 */
int ocfs2_claim_new_inode(handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u64 *suballoc_loc,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	struct ocfs2_suballoc_result res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	/* Seed the group hint from the parent directory. */
	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, &res);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	BUG_ON(res.sr_bits != 1);

	*fe_blkno = res.sr_blkno;
	*suballoc_bit = res.sr_bit_offset;
	*suballoc_loc = res.sr_bg_blkno;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);
	status = 0;
bail:
	if (status)
		mlog_errno(status);
	return status;
}
2327 
2328 /* translate a group desc. blkno and it's bitmap offset into
2329  * disk cluster offset. */
ocfs2_desc_bitmap_to_cluster_off(struct inode * inode,u64 bg_blkno,u16 bg_bit_off)2330 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2331 						   u64 bg_blkno,
2332 						   u16 bg_bit_off)
2333 {
2334 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2335 	u32 cluster = 0;
2336 
2337 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2338 
2339 	if (bg_blkno != osb->first_cluster_group_blkno)
2340 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2341 	cluster += (u32) bg_bit_off;
2342 	return cluster;
2343 }
2344 
2345 /* given a cluster offset, calculate which block group it belongs to
2346  * and return that block offset. */
ocfs2_which_cluster_group(struct inode * inode,u32 cluster)2347 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2348 {
2349 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2350 	u32 group_no;
2351 
2352 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2353 
2354 	group_no = cluster / osb->bitmap_cpg;
2355 	if (!group_no)
2356 		return osb->first_cluster_group_blkno;
2357 	return ocfs2_clusters_to_blocks(inode->i_sb,
2358 					group_no * osb->bitmap_cpg);
2359 }
2360 
2361 /* given the block number of a cluster start, calculate which cluster
2362  * group and descriptor bitmap offset that corresponds to. */
/* given the block number of a cluster start, calculate which cluster
 * group and descriptor bitmap offset that corresponds to. */
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	*bg_blkno = ocfs2_which_cluster_group(inode, data_cluster);

	/* Inside the first group the bit offset is the cluster number
	 * itself; otherwise it is relative to the group's start. */
	if (*bg_blkno == osb->first_cluster_group_blkno) {
		*bg_bit_off = (u16) data_cluster;
		return;
	}
	*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
						     data_blkno - *bg_blkno);
}
2382 
2383 /*
2384  * min_bits - minimum contiguous chunk from this total allocation we
2385  * can handle. set to what we asked for originally for a full
2386  * contig. allocation, set to '1' to indicate we can deal with extents
2387  * of any size.
2388  */
/*
 * Claim up to @max_clusters clusters (at least @min_clusters
 * contiguous) from the allocator on @ac — either the local alloc or
 * the global bitmap.  On success @cluster_start and @num_clusters
 * describe what was allocated.
 */
int __ocfs2_claim_clusters(handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN
	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		/* Contiguity guarantees beyond one cluster are not
		 * expected of the local alloc path. */
		WARN_ON(min_clusters > 1);

		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already. */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &res);
		if (!status) {
			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 res.sr_bg_blkno,
								 res.sr_bit_offset);
			atomic_inc(&osb->alloc_stats.bitmap_data);
			*num_clusters = res.sr_bits;
		}
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2460 
/* Claim everything still outstanding on @ac in one request. */
int ocfs2_claim_clusters(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	return __ocfs2_claim_clusters(handle, ac, min_clusters,
				      ac->ac_bits_wanted - ac->ac_bits_given,
				      cluster_start, num_clusters);
}
2472 
/*
 * Clear @num_bits bits starting at @bit_off in the group bitmap @bg.
 *
 * When @undo_fn is non-NULL the buffer is journalled with UNDO access
 * and @undo_fn is also applied to the jbd2 committed-data copy of the
 * descriptor.  Only the cluster bitmap uses undo access.  The group's
 * bg_contig_free_bits hint is recomputed for cluster bitmaps (never
 * smaller than @max_contig_bits) and zeroed for other allocators.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_state_lock guards b_committed_data against jbd2. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/* Drop the lock before the early return below. */
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
				    le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}
2553 
2554 /*
2555  * expects the suballoc inode to already be locked.
2556  */
/*
 * Free @count bits starting at @start_bit in group @bg_blkno, then
 * update the chain record and dinode used-bit counts accordingly.
 * @undo_fn, when non-NULL, is forwarded to
 * ocfs2_block_group_clear_bits() for undo-buffer protection.
 */
static int _ocfs2_free_suballoc_bits(handle_t *handle,
				     struct inode *alloc_inode,
				     struct buffer_head *alloc_bh,
				     unsigned int start_bit,
				     u64 bg_blkno,
				     unsigned int count,
				     void (*undo_fn)(unsigned int bit,
						     unsigned long *bitmap))
{
	int status = 0;
	u32 tmp_used;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *group;
	__le16 old_bg_contig_free_bits = 0;

	/* The alloc_bh comes from ocfs2_free_dinode() or
	 * ocfs2_free_clusters().  The callers have all locked the
	 * allocator and gotten alloc_bh from the lock call.  This
	 * validates the dinode buffer.  Any corruption that has happened
	 * is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	trace_ocfs2_free_suballoc_bits(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		(unsigned long long)bg_blkno,
		start_bit, count);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Save the contig-free hint so the rollback below can restore
	 * it if the dinode journal access fails. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, 0, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Put the group bitmap back the way it was. */
		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
				start_bit, count,
				le16_to_cpu(old_bg_contig_free_bits), 1);
		goto bail;
	}

	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
		     count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, alloc_bh);

bail:
	brelse(group_bh);
	return status;
}
2627 
/*
 * Free @count suballocator bits with no undo function — the freed
 * bits are not protected in the jbd2 committed copy.
 */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count, NULL);
}
2638 
/*
 * Return @di's single suballocator bit to the inode allocator.  For
 * discontiguous block groups the dinode records its group block
 * explicitly in i_suballoc_loc; otherwise the group is derived from
 * the inode's block number and bit.
 */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno;

	if (di->i_suballoc_loc)
		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
	else
		bg_blkno = ocfs2_which_suballoc_group(le64_to_cpu(di->i_blkno),
						      bit);

	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}
2653 
/*
 * Free @num_clusters clusters starting at block @start_blk in the
 * global bitmap, applying @undo_fn to the jbd2 committed copy, then
 * tell the local alloc that space became available.
 */
static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 bg_start_bit;
	u64 bg_blkno;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them.
	 * This is expensive. We can safely remove once this stuff has
	 * gotten tested really well. */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
							 start_blk)));

	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
				     &bg_start_bit);

	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
			(unsigned long long)start_blk,
			bg_start_bit, num_clusters);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   bg_start_bit, bg_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);
	return status;
}
2697 
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	/* Re-set the freed bits in the jbd2 committed copy so they
	 * cannot be reallocated before this transaction commits. */
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}
2708 
2709 /*
2710  * Give never-used clusters back to the global bitmap.  We don't need
2711  * to protect these bits in the undo buffer.
2712  */
int ocfs2_release_clusters(handle_t *handle,
			   struct inode *bitmap_inode,
			   struct buffer_head *bitmap_bh,
			   u64 start_blk,
			   unsigned int num_clusters)
{
	/* Clear the bits in the committed copy as well — these
	 * clusters were never used, so no undo protection is needed. */
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_clear_bit);
}
2723 
2724 /*
2725  * For a given allocation, determine which allocators will need to be
2726  * accessed, and lock them, reserving the appropriate number of bits.
2727  *
2728  * Sparse file systems call this from ocfs2_write_begin_nolock()
2729  * and ocfs2_allocate_unwritten_extents().
2730  *
2731  * File systems which don't support holes call this from
2732  * ocfs2_extend_allocation().
2733  */
int ocfs2_lock_allocators(struct inode *inode,
			  struct ocfs2_extent_tree *et,
			  u32 clusters_to_add, u32 extents_to_split,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	int ret = 0, num_free_extents;
	/* Each split may need up to two extra extent records. */
	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	*meta_ac = NULL;
	if (data_ac)
		*data_ac = NULL;

	BUG_ON(clusters_to_add != 0 && data_ac == NULL);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 *
	 * Always lock for any unwritten extents - we might want to
	 * add blocks during a split.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	if (clusters_to_add == 0)
		goto out;

	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

out:
	if (ret) {
		/* Release anything reserved so far; the caller sees
		 * either full success or no contexts at all. */
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}

		/*
		 * We cannot have an error and a non null *data_ac.
		 */
	}

	return ret;
}
2804 
2805 /*
2806  * Read the inode specified by blkno to get suballoc_slot and
2807  * suballoc_bit.
2808  */
ocfs2_get_suballoc_slot_bit(struct ocfs2_super * osb,u64 blkno,u16 * suballoc_slot,u64 * group_blkno,u16 * suballoc_bit)2809 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2810 				       u16 *suballoc_slot, u64 *group_blkno,
2811 				       u16 *suballoc_bit)
2812 {
2813 	int status;
2814 	struct buffer_head *inode_bh = NULL;
2815 	struct ocfs2_dinode *inode_fe;
2816 
2817 	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2818 
2819 	/* dirty read disk */
2820 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2821 	if (status < 0) {
2822 		mlog(ML_ERROR, "read block %llu failed %d\n",
2823 		     (unsigned long long)blkno, status);
2824 		goto bail;
2825 	}
2826 
2827 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2828 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2829 		mlog(ML_ERROR, "invalid inode %llu requested\n",
2830 		     (unsigned long long)blkno);
2831 		status = -EINVAL;
2832 		goto bail;
2833 	}
2834 
2835 	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2836 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2837 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2838 		     (unsigned long long)blkno,
2839 		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2840 		status = -EINVAL;
2841 		goto bail;
2842 	}
2843 
2844 	if (suballoc_slot)
2845 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2846 	if (suballoc_bit)
2847 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2848 	if (group_blkno)
2849 		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2850 
2851 bail:
2852 	brelse(inode_bh);
2853 
2854 	if (status)
2855 		mlog_errno(status);
2856 	return status;
2857 }
2858 
2859 /*
2860  * test whether bit is SET in allocator bitmap or not.  on success, 0
2861  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2862  * is returned and *res is meaningless.  Call this after you have
2863  * cluster locked against suballoc, or you may get a result based on
2864  * non-up2date contents
2865  */
ocfs2_test_suballoc_bit(struct ocfs2_super * osb,struct inode * suballoc,struct buffer_head * alloc_bh,u64 group_blkno,u64 blkno,u16 bit,int * res)2866 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2867 				   struct inode *suballoc,
2868 				   struct buffer_head *alloc_bh,
2869 				   u64 group_blkno, u64 blkno,
2870 				   u16 bit, int *res)
2871 {
2872 	struct ocfs2_dinode *alloc_di;
2873 	struct ocfs2_group_desc *group;
2874 	struct buffer_head *group_bh = NULL;
2875 	u64 bg_blkno;
2876 	int status;
2877 
2878 	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2879 				      (unsigned int)bit);
2880 
2881 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2882 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2883 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2884 		     (unsigned int)bit,
2885 		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2886 		status = -EINVAL;
2887 		goto bail;
2888 	}
2889 
2890 	bg_blkno = group_blkno ? group_blkno :
2891 		   ocfs2_which_suballoc_group(blkno, bit);
2892 	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2893 					     &group_bh);
2894 	if (status < 0) {
2895 		mlog(ML_ERROR, "read group %llu failed %d\n",
2896 		     (unsigned long long)bg_blkno, status);
2897 		goto bail;
2898 	}
2899 
2900 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2901 	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2902 
2903 bail:
2904 	brelse(group_bh);
2905 
2906 	if (status)
2907 		mlog_errno(status);
2908 	return status;
2909 }
2910 
2911 /*
2912  * Test if the bit representing this inode (blkno) is set in the
2913  * suballocator.
2914  *
2915  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2916  *
2917  * In the event of failure, a negative value is returned and *res is
2918  * meaningless.
2919  *
2920  * Callers must make sure to hold nfs_sync_lock to prevent
2921  * ocfs2_delete_inode() on another node from accessing the same
2922  * suballocator concurrently.
2923  */
ocfs2_test_inode_bit(struct ocfs2_super * osb,u64 blkno,int * res)2924 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2925 {
2926 	int status;
2927 	u64 group_blkno = 0;
2928 	u16 suballoc_bit = 0, suballoc_slot = 0;
2929 	struct inode *inode_alloc_inode;
2930 	struct buffer_head *alloc_bh = NULL;
2931 
2932 	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2933 
2934 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2935 					     &group_blkno, &suballoc_bit);
2936 	if (status < 0) {
2937 		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2938 		goto bail;
2939 	}
2940 
2941 	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
2942 		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
2943 			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
2944 	else
2945 		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
2946 			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
2947 	if (!inode_alloc_inode) {
2948 		/* the error code could be inaccurate, but we are not able to
2949 		 * get the correct one. */
2950 		status = -EINVAL;
2951 		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2952 		     (u32)suballoc_slot);
2953 		goto bail;
2954 	}
2955 
2956 	inode_lock(inode_alloc_inode);
2957 	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2958 	if (status < 0) {
2959 		inode_unlock(inode_alloc_inode);
2960 		iput(inode_alloc_inode);
2961 		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2962 		     (u32)suballoc_slot, status);
2963 		goto bail;
2964 	}
2965 
2966 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2967 					 group_blkno, blkno, suballoc_bit, res);
2968 	if (status < 0)
2969 		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2970 
2971 	ocfs2_inode_unlock(inode_alloc_inode, 0);
2972 	inode_unlock(inode_alloc_inode);
2973 
2974 	iput(inode_alloc_inode);
2975 	brelse(alloc_bh);
2976 bail:
2977 	if (status)
2978 		mlog_errno(status);
2979 	return status;
2980 }
2981