1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * suballoc.c
4 *
5 * metadata alloc and free
6 * Inspired by ext3 block groups.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 */
10
11 #include <linux/fs.h>
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <linux/string.h>
15 #include <linux/highmem.h>
16
17 #include <cluster/masklog.h>
18
19 #include "ocfs2.h"
20
21 #include "alloc.h"
22 #include "blockcheck.h"
23 #include "dlmglue.h"
24 #include "inode.h"
25 #include "journal.h"
26 #include "localalloc.h"
27 #include "suballoc.h"
28 #include "super.h"
29 #include "sysfile.h"
30 #include "uptodate.h"
31 #include "ocfs2_trace.h"
32
33 #include "buffer_head_io.h"
34
35 #define NOT_ALLOC_NEW_GROUP 0
36 #define ALLOC_NEW_GROUP 0x1
37 #define ALLOC_GROUPS_FROM_GLOBAL 0x2
38
39 #define OCFS2_MAX_TO_STEAL 1024
40
/* Result of a single suballocator claim, filled in by the group search. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};
59
ocfs2_group_from_res(struct ocfs2_suballoc_result * res)60 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
61 {
62 if (res->sr_blkno == 0)
63 return 0;
64
65 if (res->sr_bg_blkno)
66 return res->sr_bg_blkno;
67
68 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
69 }
70
/* Forward declarations for the static helpers defined later in this file. */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);
122
ocfs2_free_ac_resource(struct ocfs2_alloc_context * ac)123 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
124 {
125 struct inode *inode = ac->ac_inode;
126
127 if (inode) {
128 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
129 ocfs2_inode_unlock(inode, 1);
130
131 inode_unlock(inode);
132
133 iput(inode);
134 ac->ac_inode = NULL;
135 }
136 brelse(ac->ac_bh);
137 ac->ac_bh = NULL;
138 ac->ac_resv = NULL;
139 kfree(ac->ac_find_loc_priv);
140 ac->ac_find_loc_priv = NULL;
141 }
142
/* Release an allocation context's resources and free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}
148
ocfs2_bits_per_group(struct ocfs2_chain_list * cl)149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153
/*
 * Report a corrupt group descriptor.  During resize the problem is only
 * logged; otherwise the enclosing function returns the ocfs2_error()
 * result directly — note the hidden 'return' inside this macro, which
 * relies on local variables 'resize' and 'sb' being in scope.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)
161
/*
 * Validate the self-contained fields of a group descriptor (no owning
 * allocator needed): signature, self block number, fs generation,
 * free-bit count vs. total bits, and total bits vs. bitmap capacity.
 *
 * 'resize' selects do_error()'s behaviour: during resize failures are
 * merely logged and we continue; otherwise do_error() returns the
 * ocfs2_error() result from this function.
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	/* The descriptor must claim to live where we read it from. */
	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	/* A group can never have more free bits than total bits. */
	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	/* bg_size bytes of bitmap must be able to hold bg_bits bits. */
	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}
202
/*
 * Validate a group descriptor against its owning chain allocator 'di':
 * parent pointer, bit count vs. the allocator's per-group maximum, and
 * the chain index.  Uses the same do_error()/'resize' convention as
 * ocfs2_validate_gd_self().
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* Both sides are raw __le64, so direct comparison is fine. */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}
237
238 #undef do_error
239
/*
 * This version only prints errors.  It does not fail the filesystem, and
 * exists only for resize.
 *
 * Runs the ecc check, then the self and parent validators with
 * resize == 1, so corruption is logged but never fatal here.
 */
int ocfs2_check_group_descriptor(struct super_block *sb,
				 struct ocfs2_dinode *di,
				 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc) {
		mlog(ML_ERROR,
		     "Checksum failed for group descriptor %llu\n",
		     (unsigned long long)bh->b_blocknr);
	} else
		rc = ocfs2_validate_gd_self(sb, bh, 1);
	if (!rc)
		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);

	return rc;
}
270
/*
 * Read-time validator for group descriptor blocks (used as the
 * ocfs2_read_block() callback): ecc check first, then the fatal
 * self-validation with resize == 0.
 */
static int ocfs2_validate_group_descriptor(struct super_block *sb,
					   struct buffer_head *bh)
{
	int rc;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	trace_ocfs2_validate_group_descriptor(
					(unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
	if (rc)
		return rc;

	/*
	 * Errors after here are fatal.
	 */

	return ocfs2_validate_gd_self(sb, bh, 0);
}
297
/*
 * The hint group descriptor (gd) may already have been released
 * in _ocfs2_free_suballoc_bits().  We first check the gd signature,
 * then perform the standard ocfs2_read_group_descriptor() jobs.
 *
 * If the gd signature is invalid, we return 'rc=0' and set
 * '*released=1'.  The caller is expected to handle this specific case.
 * Otherwise, we return the actual error code.
 *
 * We treat gd signature corruption case as a release case.  The
 * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain()
 * to search each gd block.  The code will eventually find this
 * corrupted gd block - Late, but not missed.
 *
 * Note:
 * The caller is responsible for initializing the '*released' status.
 */
static int ocfs2_read_hint_group_descriptor(struct inode *inode,
					struct ocfs2_dinode *di, u64 gd_blkno,
					struct buffer_head **bh, int *released)
{
	int rc;
	struct buffer_head *tmp = *bh;
	struct ocfs2_group_desc *gd;

	/* Read without a validator; we check the signature ourselves. */
	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL);
	if (rc)
		goto out;

	gd = (struct ocfs2_group_desc *) tmp->b_data;
	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		/*
		 * Invalid gd cache was set in ocfs2_read_block(),
		 * which will affect block_group allocation.
		 * Path:
		 * ocfs2_reserve_suballoc_bits
		 *  ocfs2_block_group_alloc
		 *   ocfs2_block_group_alloc_contig
		 *    ocfs2_set_new_buffer_uptodate
		 */
		ocfs2_remove_from_cache(INODE_CACHE(inode), tmp);
		*released = 1; /* we return 'rc=0' for this case */
		goto free_bh;
	}

	/* below jobs same with ocfs2_read_group_descriptor() */
	if (!buffer_jbd(tmp)) {
		rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp);
		if (rc)
			goto free_bh;
	}

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc)
		goto free_bh;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return rc;

free_bh:
	brelse(tmp);
out:
	return rc;
}
365
/*
 * Read and fully validate a group descriptor block, checking it against
 * its owning allocator dinode 'di'.  On success a referenced bh is
 * returned through *bh (reused if the caller supplied one).
 */
int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
				u64 gd_blkno, struct buffer_head **bh)
{
	struct buffer_head *tmp = *bh;
	int rc;

	/* Self-validation runs inside the read via the callback. */
	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
			      ocfs2_validate_group_descriptor);
	if (rc)
		return rc;

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc) {
		brelse(tmp);
		return rc;
	}

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return 0;
}
390
/*
 * Append a cluster extent (p_blkno, clusters) to a discontiguous block
 * group's embedded extent list and grow the group's bit counts to
 * match.  Only valid when the discontig-bg feature is enabled.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent ever: initialize the list's record capacity. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* cpos inside the group == clusters recorded so far (bits/bpc),
	 * so this must be computed before bg_bits is bumped below. */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}
412
/*
 * Initialize a freshly claimed group descriptor block under journal
 * protection: zero it, fill in signature/generation/size, link it at
 * the head of chain 'my_chain', and seed its bitmap (bit 0 accounts
 * for the descriptor block itself).  A group smaller than cl_cpg
 * clusters is recorded as a discontig extent rather than a flat bit
 * count.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	/* Link in front of the chain's current head. */
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}
474
ocfs2_find_smallest_chain(struct ocfs2_chain_list * cl)475 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
476 {
477 u16 curr, best;
478
479 best = curr = 0;
480 while (curr < le16_to_cpu(cl->cl_count)) {
481 if (le32_to_cpu(cl->cl_recs[best].c_total) >
482 le32_to_cpu(cl->cl_recs[curr].c_total))
483 best = curr;
484 curr++;
485 }
486 return best;
487 }
488
/*
 * Allocate one fully contiguous block group (cl_cpg clusters) from the
 * cluster allocator, initialize its descriptor and return a referenced
 * buffer head.  Returns an ERR_PTR on failure; -ENOSPC means no
 * contiguous run of cl_cpg clusters was available (the caller may then
 * fall back to a discontiguous group).
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
				   struct inode *alloc_inode,
				   struct ocfs2_alloc_context *ac,
				   struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	/* The block is brand new, so no disk read is needed. */
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}
533
/*
 * Claim a run of clusters, retrying with a halved request size on each
 * -ENOSPC until a claim succeeds, a different error occurs, or the
 * request shrinks to nothing (in which case the last -ENOSPC, or 0 for
 * an initially-zero request, is returned).
 */
static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac,
					unsigned int min_bits,
					u32 *bit_off, u32 *num_bits)
{
	int status = 0;

	for (; min_bits; min_bits >>= 1) {
		status = ocfs2_claim_clusters(handle, ac, min_bits,
					      bit_off, num_bits);
		if (status != -ENOSPC)
			break;
	}

	return status;
}
553
/*
 * Keep claiming (progressively smaller) cluster extents and appending
 * them to the discontiguous block group in bg_bh until the group holds
 * cl_cpg clusters or we run out of extent records.  Returns -ENOSPC if
 * the group could not be filled up to cl_cpg, 0 on success.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	/* Clusters still missing from a full cl_cpg-sized group. */
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		/* Never ask for more than we still need. */
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		/* Next attempt starts from the size we actually got. */
		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}
615
/*
 * Undo a failed discontiguous group allocation: give every extent
 * recorded in the group back to the cluster allocator (best effort —
 * individual failures are only logged), drop the block from the
 * metadata cache and release the buffer head.  A NULL bg_bh is a no-op.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	int i;

	if (bg_bh == NULL)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		struct ocfs2_extent_rec *rec = &el->l_recs[i];
		int ret;

		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Keep going — try to free every cluster we claimed. */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}
645
/*
 * Build a new block group out of several smaller cluster extents when a
 * contiguous cl_cpg-cluster run is not available.  Claims a first
 * region (starting at half a group and shrinking), fills in the
 * descriptor, then grows the group extent by extent.  Chain relinking
 * is disabled on the context because we lack journal credits for it.
 * Returns the group's bh, or an ERR_PTR (any claimed clusters are given
 * back on failure).
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	/* Brand-new block, no disk read needed. */
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	/* On any failure, return every extent we managed to claim. */
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}
720
/*
 * We expect the block group allocator to already be locked.
 *
 * Grow a (non-cluster-bitmap) chain allocator by one block group of
 * cl_cpg clusters: reserve the clusters from the global bitmap,
 * initialize the new group descriptor (contiguously if possible,
 * otherwise discontiguously), link it into the emptiest chain, and
 * update the allocator dinode's totals and size — all in one
 * transaction.
 *
 * *last_alloc_group, when non-NULL, carries a goal group in and the
 * newly used goal out so the caller can cache it.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* The global cluster bitmap never grows through this path. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the cluster search with the caller's cached goal group. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	/* No contiguous cl_cpg run available — try a discontig group. */
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Hook the new group in at the head of its chain and account
	 * for its bits in the chain record and the dinode totals. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	/* Keep the in-memory inode size in sync with the dinode. */
	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}
836
/*
 * Lock the suballocator inode of the given system-file 'type' in 'slot'
 * and make sure it has at least ac->ac_bits_wanted free bits, growing
 * it by a new block group when 'flags' allow.  On success the inode and
 * the dinode bh are stashed in 'ac' (released later through
 * ocfs2_free_ac_resource()).
 *
 * Returns 0, -ENOSPC when the allocator cannot satisfy the request, or
 * another negative error code.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	/* From here on, ocfs2_free_ac_resource() drops the locks/refs. */
	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock(). Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}
928
/* Reset inode stealing: forget the victim slot and zero the counter. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}
936
/* Reset metadata stealing: forget the victim slot and zero the counter. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}
944
/* Reset both inode and metadata steal-slot state (mount-time init). */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
950
__ocfs2_set_steal_slot(struct ocfs2_super * osb,int slot,int type)951 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
952 {
953 spin_lock(&osb->osb_lock);
954 if (type == INODE_ALLOC_SYSTEM_INODE)
955 osb->s_inode_steal_slot = (u16)slot;
956 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
957 osb->s_meta_steal_slot = (u16)slot;
958 spin_unlock(&osb->osb_lock);
959 }
960
__ocfs2_get_steal_slot(struct ocfs2_super * osb,int type)961 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
962 {
963 int slot = OCFS2_INVALID_SLOT;
964
965 spin_lock(&osb->osb_lock);
966 if (type == INODE_ALLOC_SYSTEM_INODE)
967 slot = osb->s_inode_steal_slot;
968 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
969 slot = osb->s_meta_steal_slot;
970 spin_unlock(&osb->osb_lock);
971
972 return slot;
973 }
974
/* Remembered victim slot for inode stealing, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}
979
/* Remembered victim slot for metadata stealing, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}
984
/*
 * Try to reserve ac->ac_bits_wanted bits from some OTHER slot's
 * allocator of the given type.  The walk starts at the slot after the
 * last successful steal (or after our own slot), visits every slot
 * except our own, and remembers the winner for next time.  Victims are
 * never grown (NOT_ALLOC_NEW_GROUP).  Returns 0 or -ENOSPC / the last
 * reservation error.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		/* Wrap around past the last slot. */
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		/* Drop this victim's locks/refs before trying the next. */
		ocfs2_free_ac_resource(ac);
	}

	return status;
}
1017
/* Steal inode-allocator bits from another slot. */
static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}
1023
/* Steal extent(metadata)-allocator bits from another slot. */
static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}
1029
/*
 * Reserve @blocks bits from the extent (metadata) allocator.
 *
 * Normally we allocate from our own slot's allocator; once that runs out
 * we fall back to stealing from other slots, and keep stealing (up to
 * OCFS2_MAX_TO_STEAL times, tracked in s_num_meta_stolen) before trying
 * our own slot again.  On success *ac holds the reservation and must be
 * released with ocfs2_free_alloc_context(); on error *ac is NULL.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc_obj(struct ocfs2_alloc_context, GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * A valid steal slot means our own allocator was recently full;
	 * keep stealing until we have stolen OCFS2_MAX_TO_STEAL times.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* Our own slot worked again -- stop stealing. */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* Our slot is out of space (-ENOSPC); drop its resources and steal. */
	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1091
/*
 * Reserve enough metadata blocks for an extent-tree extension rooted at
 * @root_el.  Thin convenience wrapper around
 * ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1100
/*
 * Reserve one bit from an inode allocator, preferring our own slot and
 * falling back to stealing from other slots (same scheme as the metadata
 * reservation above, tracked via s_num_inodes_stolen).  On success *ac
 * holds the reservation; on error *ac is NULL.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc_obj(struct ocfs2_alloc_context, GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit. The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	/* Hint the search with the last group we allocated an inode from. */
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Remember the group that satisfied us for the next hint. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* Our slot is out of space; drop its resources before stealing. */
	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1192
1193 /* local alloc code has to do the same thing, so rather than do this
1194 * twice.. */
ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super * osb,struct ocfs2_alloc_context * ac)1195 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1196 struct ocfs2_alloc_context *ac)
1197 {
1198 int status;
1199
1200 ac->ac_which = OCFS2_AC_USE_MAIN;
1201 ac->ac_group_search = ocfs2_cluster_group_search;
1202
1203 status = ocfs2_reserve_suballoc_bits(osb, ac,
1204 GLOBAL_BITMAP_SYSTEM_INODE,
1205 OCFS2_INVALID_SLOT, NULL,
1206 ALLOC_NEW_GROUP);
1207 if (status < 0 && status != -ENOSPC)
1208 mlog_errno(status);
1209
1210 return status;
1211 }
1212
1213 /* Callers don't need to care which bitmap (local alloc or main) to
1214 * use so we figure it out for them, but unfortunately this clutters
1215 * things a bit. */
/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit.
 *
 * Try the local alloc first (unless the caller forces global groups),
 * then fall back to the main bitmap.  If the main bitmap reports
 * -ENOSPC we flush the truncate log once — which may return freed
 * clusters to the bitmap — and retry.  On success *ac must be released
 * with ocfs2_free_alloc_context(); on error *ac is NULL. */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc_obj(struct ocfs2_alloc_context, GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Local alloc was skipped or full -- go to the main bitmap. */
	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/*
			 * Drop the allocator inode's cluster lock and mutex
			 * before flushing the truncate log; the flush takes
			 * its own locks.
			 */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				/*
				 * Clusters were freed; drop our reference and
				 * retry -- the reserve path re-acquires the
				 * allocator inode.
				 */
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			/* Nothing freed: re-take the locks in mutex-then-
			 * cluster order before returning -ENOSPC below. */
			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1293
/* Reserve @bits_wanted clusters with no block-number limit, allowing new
 * group allocation. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	const u64 no_limit = 0;

	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, no_limit,
						 ALLOC_NEW_GROUP, ac);
}
1301
1302 /*
1303 * More or less lifted from ext3. I'll leave their description below:
1304 *
1305 * "For ext3 allocations, we must not reuse any blocks which are
1306 * allocated in the bitmap buffer's "last committed data" copy. This
1307 * prevents deletes from freeing up the page for reuse until we have
1308 * committed the delete transaction.
1309 *
1310 * If we didn't do this, then deleting something and reallocating it as
1311 * data would allow the old block to be overwritten before the
1312 * transaction committed (because we force data to disk before commit).
1313 * This would lead to corruption if we crashed between overwriting the
1314 * data and committing the delete.
1315 *
1316 * @@@ We may want to make this allocation behaviour conditional on
1317 * data-writes at some point, and disable it for metadata allocations or
1318 * sync-data inodes."
1319 *
1320 * Note: OCFS2 already does this differently for metadata vs data
1321 * allocations, as those bitmaps are separate and undo access is never
1322 * called on a metadata group descriptor.
1323 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	/* Set in the live bitmap -> definitely not allocatable. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* No journal head means no committed-data copy to check against. */
	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	/*
	 * If jbd2 holds a "last committed data" copy of this descriptor,
	 * the bit must also be clear there, or reusing it could clobber
	 * data from a not-yet-committed delete (see comment above).
	 */
	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}
1349
/*
 * Return the length of the longest run of zero (free) bits in @bitmap,
 * scanning from @start up to @total_bits.  Returns 0 when no zero bit
 * is found in that range.
 */
u16 ocfs2_find_max_contig_free_bits(void *bitmap,
			 u16 total_bits, u16 start)
{
	u16 best = 0;

	/* Walk alternating free/used runs, remembering the longest free one. */
	while (start < total_bits) {
		u16 run_start, run_len;

		run_start = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
		if (run_start == total_bits)
			break;

		/* The run ends at the next set bit (or the bitmap end). */
		start = ocfs2_find_next_bit(bitmap, total_bits, run_start);
		run_len = start - run_start;
		if (run_len > best)
			best = run_len;
	}

	return best;
}
1369
/*
 * Scan the group bitmap for up to @bits_wanted contiguous allocatable
 * bits (clear both on disk and in the journal's committed copy).  The
 * best run found is recorded in @res; -ENOSPC when no run exists.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	/* found counts the current contiguous run; best_size the longest. */
	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	      total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			/* the previous run just ended; remember its best */
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}
1431
/*
 * Mark @num_bits bits starting at @bit_off as allocated in the group
 * descriptor @bg, journaling the change.  @max_contig_bits carries the
 * caller's knowledge of the group's longest free run; with @fastpath
 * set it is stored directly instead of being recomputed.
 */
int ocfs2_block_group_set_bits(handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_group_desc *bg,
			       struct buffer_head *group_bh,
			       unsigned int bit_off,
			       unsigned int num_bits,
			       unsigned int max_contig_bits,
			       int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Cluster bitmap writes need undo access -- see the big comment
	 * above ocfs2_test_bg_bit_allocatable. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* u16 underflow here means the descriptor was corrupt. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * this is optimize path, caller set old contig value
	 * in max_contig_bits to bypass finding action.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
1506
1507 /* find the one with the most empty bits */
ocfs2_find_victim_chain(struct ocfs2_chain_list * cl)1508 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1509 {
1510 u16 curr, best;
1511
1512 BUG_ON(!cl->cl_next_free_rec);
1513
1514 best = curr = 0;
1515 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1516 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1517 le32_to_cpu(cl->cl_recs[best].c_free))
1518 best = curr;
1519 curr++;
1520 }
1521
1522 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1523 return best;
1524 }
1525
/*
 * Move the group @bg to the head of chain @chain, unlinking it from
 * after @prev_bg.  All three buffers are journaled; if a later journal
 * access fails, the earlier in-memory pointer edits are rolled back so
 * no inconsistent links are left behind.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save the old links for rollback on journal failure. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	/* Step 1: unlink bg by pointing prev_bg past it. */
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	/* Step 2: point bg at the current chain head. */
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	/* Step 3: make bg the new chain head in the dinode. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}
1590
/* A group is "reasonably empty" when it has strictly more free bits
 * than the request being satisfied. */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	u32 free_bits = le16_to_cpu(bg->bg_free_bits_count);

	return free_bits > wanted;
}
1596
1597 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1598 * value on error. */
ocfs2_cluster_group_search(struct inode * inode,struct buffer_head * group_bh,u32 bits_wanted,u32 min_bits,u64 max_block,struct ocfs2_suballoc_result * res)1599 static int ocfs2_cluster_group_search(struct inode *inode,
1600 struct buffer_head *group_bh,
1601 u32 bits_wanted, u32 min_bits,
1602 u64 max_block,
1603 struct ocfs2_suballoc_result *res)
1604 {
1605 int search = -ENOSPC;
1606 int ret;
1607 u64 blkoff;
1608 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1609 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1610 unsigned int max_bits, gd_cluster_off;
1611
1612 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1613
1614 if (le16_to_cpu(gd->bg_contig_free_bits) &&
1615 le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
1616 return -ENOSPC;
1617
1618 /* ->bg_contig_free_bits may un-initialized, so compare again */
1619 if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
1620 max_bits = le16_to_cpu(gd->bg_bits);
1621
1622 /* Tail groups in cluster bitmaps which aren't cpg
1623 * aligned are prone to partial extension by a failed
1624 * fs resize. If the file system resize never got to
1625 * update the dinode cluster count, then we don't want
1626 * to trust any clusters past it, regardless of what
1627 * the group descriptor says. */
1628 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1629 le64_to_cpu(gd->bg_blkno));
1630 if ((gd_cluster_off + max_bits) >
1631 OCFS2_I(inode)->ip_clusters) {
1632 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1633 trace_ocfs2_cluster_group_search_wrong_max_bits(
1634 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1635 le16_to_cpu(gd->bg_bits),
1636 OCFS2_I(inode)->ip_clusters, max_bits);
1637 }
1638
1639 ret = ocfs2_block_group_find_clear_bits(osb,
1640 group_bh, bits_wanted,
1641 max_bits, res);
1642 if (ret)
1643 return ret;
1644
1645 if (max_block) {
1646 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1647 gd_cluster_off +
1648 res->sr_bit_offset +
1649 res->sr_bits);
1650 trace_ocfs2_cluster_group_search_max_block(
1651 (unsigned long long)blkoff,
1652 (unsigned long long)max_block);
1653 if (blkoff > max_block)
1654 return -ENOSPC;
1655 }
1656
1657 /* ocfs2_block_group_find_clear_bits() might
1658 * return success, but we still want to return
1659 * -ENOSPC unless it found the minimum number
1660 * of bits. */
1661 if (min_bits <= res->sr_bits)
1662 search = 0; /* success */
1663 }
1664
1665 return search;
1666 }
1667
ocfs2_block_group_search(struct inode * inode,struct buffer_head * group_bh,u32 bits_wanted,u32 min_bits,u64 max_block,struct ocfs2_suballoc_result * res)1668 static int ocfs2_block_group_search(struct inode *inode,
1669 struct buffer_head *group_bh,
1670 u32 bits_wanted, u32 min_bits,
1671 u64 max_block,
1672 struct ocfs2_suballoc_result *res)
1673 {
1674 int ret = -ENOSPC;
1675 u64 blkoff;
1676 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1677
1678 BUG_ON(min_bits != 1);
1679 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1680
1681 if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
1682 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1683 group_bh, bits_wanted,
1684 le16_to_cpu(bg->bg_bits),
1685 res);
1686 if (!ret && max_block) {
1687 blkoff = le64_to_cpu(bg->bg_blkno) +
1688 res->sr_bit_offset + res->sr_bits;
1689 trace_ocfs2_block_group_search_max_block(
1690 (unsigned long long)blkoff,
1691 (unsigned long long)max_block);
1692 if (blkoff > max_block)
1693 ret = -ENOSPC;
1694 }
1695 }
1696
1697 return ret;
1698 }
1699
/*
 * Journal and apply the dinode-side accounting for an allocation of
 * @num_bits from chain @chain: bump the bitmap's used count and debit
 * the chain record's free count.
 */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				     handle_t *handle,
				     struct buffer_head *di_bh,
				     u32 num_bits,
				     u16 chain)
{
	int status;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	/* Take journal access before touching the allocator dinode. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	le32_add_cpu(&di->id1.bitmap1.i_used, num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

	return status;
}
1726
/*
 * Undo ocfs2_alloc_dinode_update_counts(): return @num_bits to chain
 * @chain and drop them from the bitmap's used count.  In-memory only;
 * the caller is responsible for the journaled buffer.
 */
void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
					struct buffer_head *di_bh,
					u32 num_bits,
					u16 chain)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	le32_add_cpu(&di->id1.bitmap1.i_used, -num_bits);
	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
}
1741
ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result * res,struct ocfs2_extent_rec * rec,struct ocfs2_chain_list * cl)1742 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1743 struct ocfs2_extent_rec *rec,
1744 struct ocfs2_chain_list *cl)
1745 {
1746 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1747 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1748 unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1749
1750 if (res->sr_bit_offset < bitoff)
1751 return 0;
1752 if (res->sr_bit_offset >= (bitoff + bitcount))
1753 return 0;
1754 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1755 (res->sr_bit_offset - bitoff);
1756 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1757 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1758 return 1;
1759 }
1760
ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context * ac,struct ocfs2_group_desc * bg,struct ocfs2_suballoc_result * res)1761 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1762 struct ocfs2_group_desc *bg,
1763 struct ocfs2_suballoc_result *res)
1764 {
1765 int i;
1766 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1767 struct ocfs2_extent_rec *rec;
1768 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1769 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1770
1771 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1772 res->sr_blkno = 0;
1773 return;
1774 }
1775
1776 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1777 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1778 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1779 !bg->bg_list.l_next_free_rec)
1780 return;
1781
1782 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1783 rec = &bg->bg_list.l_recs[i];
1784 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1785 res->sr_bg_blkno = bg_blkno; /* Restore */
1786 break;
1787 }
1788 }
1789 }
1790
ocfs2_search_one_group(struct ocfs2_alloc_context * ac,handle_t * handle,u32 bits_wanted,u32 min_bits,struct ocfs2_suballoc_result * res,u16 * bits_left,int * released)1791 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1792 handle_t *handle,
1793 u32 bits_wanted,
1794 u32 min_bits,
1795 struct ocfs2_suballoc_result *res,
1796 u16 *bits_left, int *released)
1797 {
1798 int ret;
1799 struct buffer_head *group_bh = NULL;
1800 struct ocfs2_group_desc *gd;
1801 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1802 struct inode *alloc_inode = ac->ac_inode;
1803
1804 ret = ocfs2_read_hint_group_descriptor(alloc_inode, di,
1805 res->sr_bg_blkno, &group_bh, released);
1806 if (*released) {
1807 return 0;
1808 } else if (ret < 0) {
1809 mlog_errno(ret);
1810 return ret;
1811 }
1812
1813 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1814 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1815 ac->ac_max_block, res);
1816 if (ret < 0) {
1817 if (ret != -ENOSPC)
1818 mlog_errno(ret);
1819 goto out;
1820 }
1821
1822 if (!ret)
1823 ocfs2_bg_discontig_fix_result(ac, gd, res);
1824
1825 /*
1826 * sr_bg_blkno might have been changed by
1827 * ocfs2_bg_discontig_fix_result
1828 */
1829 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1830
1831 if (ac->ac_find_loc_only)
1832 goto out_loc_only;
1833
1834 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1835 res->sr_bits,
1836 le16_to_cpu(gd->bg_chain));
1837 if (ret < 0) {
1838 mlog_errno(ret);
1839 goto out;
1840 }
1841
1842 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1843 res->sr_bit_offset, res->sr_bits,
1844 res->sr_max_contig_bits, 0);
1845 if (ret < 0) {
1846 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1847 res->sr_bits,
1848 le16_to_cpu(gd->bg_chain));
1849 mlog_errno(ret);
1850 }
1851
1852 out_loc_only:
1853 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1854
1855 out:
1856 brelse(group_bh);
1857
1858 return ret;
1859 }
1860
ocfs2_search_chain(struct ocfs2_alloc_context * ac,handle_t * handle,u32 bits_wanted,u32 min_bits,struct ocfs2_suballoc_result * res,u16 * bits_left)1861 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1862 handle_t *handle,
1863 u32 bits_wanted,
1864 u32 min_bits,
1865 struct ocfs2_suballoc_result *res,
1866 u16 *bits_left)
1867 {
1868 int status;
1869 u16 chain;
1870 u32 contig_bits;
1871 u64 next_group;
1872 struct inode *alloc_inode = ac->ac_inode;
1873 struct buffer_head *group_bh = NULL;
1874 struct buffer_head *prev_group_bh = NULL;
1875 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1876 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1877 struct ocfs2_group_desc *bg;
1878
1879 chain = ac->ac_chain;
1880 trace_ocfs2_search_chain_begin(
1881 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1882 bits_wanted, chain);
1883
1884 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1885 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1886 &group_bh);
1887 if (status < 0) {
1888 mlog_errno(status);
1889 goto bail;
1890 }
1891 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1892
1893 status = -ENOSPC;
1894 /* for now, the chain search is a bit simplistic. We just use
1895 * the 1st group with any empty bits. */
1896 while (1) {
1897 if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
1898 contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
1899 if (!contig_bits)
1900 contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
1901 le16_to_cpu(bg->bg_bits), 0);
1902 if (bits_wanted > contig_bits && contig_bits >= min_bits)
1903 bits_wanted = contig_bits;
1904 }
1905
1906 status = ac->ac_group_search(alloc_inode, group_bh,
1907 bits_wanted, min_bits,
1908 ac->ac_max_block, res);
1909 if (status != -ENOSPC)
1910 break;
1911 if (!bg->bg_next_group)
1912 break;
1913
1914 brelse(prev_group_bh);
1915 prev_group_bh = NULL;
1916
1917 next_group = le64_to_cpu(bg->bg_next_group);
1918 prev_group_bh = group_bh;
1919 group_bh = NULL;
1920 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1921 next_group, &group_bh);
1922 if (status < 0) {
1923 mlog_errno(status);
1924 goto bail;
1925 }
1926 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1927 }
1928 if (status < 0) {
1929 if (status != -ENOSPC)
1930 mlog_errno(status);
1931 goto bail;
1932 }
1933
1934 trace_ocfs2_search_chain_succ(
1935 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1936
1937 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1938
1939 BUG_ON(res->sr_bits == 0);
1940 if (!status)
1941 ocfs2_bg_discontig_fix_result(ac, bg, res);
1942
1943 /*
1944 * sr_bg_blkno might have been changed by
1945 * ocfs2_bg_discontig_fix_result
1946 */
1947 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1948
1949 /*
1950 * Keep track of previous block descriptor read. When
1951 * we find a target, if we have read more than X
1952 * number of descriptors, and the target is reasonably
1953 * empty, relink him to top of his chain.
1954 *
1955 * We've read 0 extra blocks and only send one more to
1956 * the transaction, yet the next guy to search has a
1957 * much easier time.
1958 *
1959 * Do this *after* figuring out how many bits we're taking out
1960 * of our target group.
1961 */
1962 if (!ac->ac_disable_chain_relink &&
1963 (prev_group_bh) &&
1964 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1965 status = ocfs2_relink_block_group(handle, alloc_inode,
1966 ac->ac_bh, group_bh,
1967 prev_group_bh, chain);
1968 if (status < 0) {
1969 mlog_errno(status);
1970 goto bail;
1971 }
1972 }
1973
1974 if (ac->ac_find_loc_only)
1975 goto out_loc_only;
1976
1977 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1978 ac->ac_bh, res->sr_bits,
1979 chain);
1980 if (status) {
1981 mlog_errno(status);
1982 goto bail;
1983 }
1984
1985 status = ocfs2_block_group_set_bits(handle,
1986 alloc_inode,
1987 bg,
1988 group_bh,
1989 res->sr_bit_offset,
1990 res->sr_bits,
1991 res->sr_max_contig_bits,
1992 0);
1993 if (status < 0) {
1994 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1995 ac->ac_bh, res->sr_bits, chain);
1996 mlog_errno(status);
1997 goto bail;
1998 }
1999
2000 trace_ocfs2_search_chain_end(
2001 (unsigned long long)le64_to_cpu(fe->i_blkno),
2002 res->sr_bits);
2003
2004 out_loc_only:
2005 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
2006 bail:
2007 brelse(group_bh);
2008 brelse(prev_group_bh);
2009
2010 if (status)
2011 mlog_errno(status);
2012 return status;
2013 }
2014
2015 /* will give out up to bits_wanted contiguous bits. */
/*
 * Claim up to @bits_wanted bits, at least @min_bits of them contiguous,
 * from the chain allocator backing @ac.  The allocation location is
 * returned through @res.
 *
 * On success, ac->ac_last_group is updated so that the next claim can
 * short-circuit the chain search by probing the same block group first.
 *
 * Returns 0 on success, -ENOSPC when no chain can satisfy the request,
 * or another negative error code.
 */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	int released = 0;
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	/* The hint bg may already have been released; in that case the
	 * search below reports it via 'released' and we fall back to a
	 * regular chain search. */
	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left,
						&released);
		if (released) {
			res->sr_bg_blkno = 0;
			goto chain_search;
		}
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}
chain_search:
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
		if (i == victim)
			continue;
		/*
		 * Only skip completely empty chains.  A chain with
		 * fewer than bits_wanted free bits can still satisfy
		 * the request: ocfs2_search_chain() may hand back as
		 * few as min_bits bits.  Filtering on bits_wanted here
		 * would return a premature -ENOSPC.
		 */
		if (!cl->cl_recs[i].c_free)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2154
/*
 * Claim up to @bits_wanted bits of metadata from the suballocator
 * attached to @ac.  The owning group, first bit, allocated bit count
 * and starting block are handed back through the out parameters.
 */
int ocfs2_claim_metadata(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 bits_wanted,
			 u64 *suballoc_loc,
			 u16 *suballoc_bit_start,
			 unsigned int *num_bits,
			 u64 *blkno_start)
{
	int status;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);

	status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, &res);
	if (status < 0) {
		mlog_errno(status);
	} else {
		atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

		/* Publish the allocation and account for it on the context. */
		*suballoc_loc = res.sr_bg_blkno;
		*suballoc_bit_start = res.sr_bit_offset;
		*blkno_start = res.sr_blkno;
		*num_bits = res.sr_bits;
		ac->ac_bits_given += res.sr_bits;
		status = 0;
	}

	if (status)
		mlog_errno(status);
	return status;
}
2192
/*
 * Since ocfs2 gained the ability to release unused block group space,
 * the cached ->ip_last_used_group may refer to a group that no longer
 * exists, so the ac->ac_last_group value set here must be verified
 * before it is trusted.  See the 'hint' handling in
 * ocfs2_claim_suballoc_bits() for details.
 */
ocfs2_init_inode_ac_group(struct inode * dir,struct buffer_head * parent_di_bh,struct ocfs2_alloc_context * ac)2199 static void ocfs2_init_inode_ac_group(struct inode *dir,
2200 struct buffer_head *parent_di_bh,
2201 struct ocfs2_alloc_context *ac)
2202 {
2203 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2204 /*
2205 * Try to allocate inodes from some specific group.
2206 *
2207 * If the parent dir has recorded the last group used in allocation,
2208 * cool, use it. Otherwise if we try to allocate new inode from the
2209 * same slot the parent dir belongs to, use the same chunk.
2210 *
2211 * We are very careful here to avoid the mistake of setting
2212 * ac_last_group to a group descriptor from a different (unlocked) slot.
2213 */
2214 if (OCFS2_I(dir)->ip_last_used_group &&
2215 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2216 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2217 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2218 if (di->i_suballoc_loc)
2219 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2220 else
2221 ac->ac_last_group = ocfs2_which_suballoc_group(
2222 le64_to_cpu(di->i_blkno),
2223 le16_to_cpu(di->i_suballoc_bit));
2224 }
2225 }
2226
ocfs2_save_inode_ac_group(struct inode * dir,struct ocfs2_alloc_context * ac)2227 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2228 struct ocfs2_alloc_context *ac)
2229 {
2230 OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2231 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2232 }
2233
/*
 * Phase one of a two-phase inode allocation: search the suballocator
 * for a suitable inode slot but do NOT mark it allocated yet.  The
 * found location is stashed in ac->ac_find_loc_priv and its block
 * number returned via @fe_blkno; ocfs2_claim_new_inode_at_loc() later
 * performs the actual bitmap update.
 *
 * Returns 0 on success or a negative error code.  On success,
 * ownership of the ocfs2_suballoc_result passes to @ac (freed with the
 * context); on failure it is freed here.
 */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	/* NOTE(review): kzalloc_obj() presumably allocates a zeroed
	 * object sized from *res — confirm this helper exists in this
	 * tree (older code used kzalloc(sizeof(*res), GFP_NOFS)). */
	res = kzalloc_obj(*res, GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Seed ac->ac_last_group from the parent dir's hints. */
	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Hand the found location to the context for phase two. */
	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	/* Only free res on failure; on success it lives on in ac. */
	if (ret)
		kfree(res);

	return ret;
}
2294
/*
 * Phase two of a two-phase inode allocation: actually claim the bit
 * that ocfs2_find_new_inode_loc() reserved in ac->ac_find_loc_priv.
 * @di_blkno must be the block number returned by phase one; any
 * mismatch is a code bug.
 *
 * On success, *suballoc_loc and *suballoc_bit describe where the new
 * inode was allocated.  Returns 0 or a negative error code.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	/* Re-read the group descriptor; sr_bg_stable_blkno never moves
	 * even if the group was relinked since phase one. */
	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	/* Bump the dinode counters first; rolled back below if the
	 * bitmap update fails. */
	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		/* Undo the counter bump so dinode and bitmap stay in sync. */
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	/* Inode allocations are always exactly one bit. */
	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}
2367
/*
 * Single-phase inode allocation: find and claim one inode bit in the
 * suballocator attached to @ac, returning its group, bit and block
 * number via the out parameters.
 */
int ocfs2_claim_new_inode(handle_t *handle,
			  struct inode *dir,
			  struct buffer_head *parent_fe_bh,
			  struct ocfs2_alloc_context *ac,
			  u64 *suballoc_loc,
			  u16 *suballoc_bit,
			  u64 *fe_blkno)
{
	int status;
	struct ocfs2_suballoc_result res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	/* Seed the group hint from the parent directory. */
	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, &res);
	if (status < 0) {
		mlog_errno(status);
	} else {
		atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

		/* Inode allocations are always exactly one bit. */
		BUG_ON(res.sr_bits != 1);

		*suballoc_loc = res.sr_bg_blkno;
		*suballoc_bit = res.sr_bit_offset;
		*fe_blkno = res.sr_blkno;
		ac->ac_bits_given++;
		ocfs2_save_inode_ac_group(dir, ac);
		status = 0;
	}

	if (status)
		mlog_errno(status);
	return status;
}
2410
2411 /* translate a group desc. blkno and it's bitmap offset into
2412 * disk cluster offset. */
ocfs2_desc_bitmap_to_cluster_off(struct inode * inode,u64 bg_blkno,u16 bg_bit_off)2413 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2414 u64 bg_blkno,
2415 u16 bg_bit_off)
2416 {
2417 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2418 u32 cluster = 0;
2419
2420 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2421
2422 if (bg_blkno != osb->first_cluster_group_blkno)
2423 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2424 cluster += (u32) bg_bit_off;
2425 return cluster;
2426 }
2427
2428 /* given a cluster offset, calculate which block group it belongs to
2429 * and return that block offset. */
ocfs2_which_cluster_group(struct inode * inode,u32 cluster)2430 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2431 {
2432 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2433 u32 group_no;
2434
2435 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2436
2437 group_no = cluster / osb->bitmap_cpg;
2438 if (!group_no)
2439 return osb->first_cluster_group_blkno;
2440 return ocfs2_clusters_to_blocks(inode->i_sb,
2441 group_no * osb->bitmap_cpg);
2442 }
2443
2444 /* given the block number of a cluster start, calculate which cluster
2445 * group and descriptor bitmap offset that corresponds to. */
/* given the block number of a cluster start, calculate which cluster
 * group and descriptor bitmap offset that corresponds to. */
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	u32 cluster;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
	*bg_blkno = ocfs2_which_cluster_group(inode, cluster);

	/* In the first group the bit offset is the raw cluster number;
	 * otherwise it is the distance from the group's own block. */
	if (*bg_blkno == osb->first_cluster_group_blkno)
		*bg_bit_off = (u16) cluster;
	else
		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
						data_blkno - *bg_blkno);
}
2465
2466 /*
2467 * min_bits - minimum contiguous chunk from this total allocation we
2468 * can handle. set to what we asked for originally for a full
2469 * contig. allocation, set to '1' to indicate we can deal with extents
2470 * of any size.
2471 */
/*
 * Claim up to @max_clusters clusters, of which at least @min_clusters
 * must be contiguous, from either the local allocator or the global
 * bitmap depending on ac->ac_which.  On success *cluster_start and
 * *num_clusters describe the allocated range.
 *
 * Returns 0, -ENOSPC, or another negative error code.
 */
int __ocfs2_claim_clusters(handle_t *handle,
			   struct ocfs2_alloc_context *ac,
			   u32 min_clusters,
			   u32 max_clusters,
			   u32 *cluster_start,
			   u32 *num_clusters)
{
	int status;
	unsigned int bits_wanted = max_clusters;
	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);

	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
	       && ac->ac_which != OCFS2_AC_USE_MAIN
	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);

	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
		/* Local alloc callers shouldn't demand contiguity. */
		WARN_ON(min_clusters > 1);

		status = ocfs2_claim_local_alloc_bits(osb,
						      handle,
						      ac,
						      bits_wanted,
						      cluster_start,
						      num_clusters);
		if (!status)
			atomic_inc(&osb->alloc_stats.local_data);
	} else {
		/* A contiguous run can never exceed one group's bitmap
		 * (minus the descriptor bit itself). */
		if (min_clusters > (osb->bitmap_cpg - 1)) {
			/* The only paths asking for contiguousness
			 * should know about this already. */
			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
			     "group bitmap size %u!\n", min_clusters,
			     osb->bitmap_cpg);
			status = -ENOSPC;
			goto bail;
		}
		/* clamp the current request down to a realistic size. */
		if (bits_wanted > (osb->bitmap_cpg - 1))
			bits_wanted = osb->bitmap_cpg - 1;

		status = ocfs2_claim_suballoc_bits(ac,
						   handle,
						   bits_wanted,
						   min_clusters,
						   &res);
		if (!status) {
			/* Cluster allocations report group+bit, never a
			 * direct block number. */
			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
			*cluster_start =
				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
								 res.sr_bg_blkno,
								 res.sr_bit_offset);
			atomic_inc(&osb->alloc_stats.bitmap_data);
			*num_clusters = res.sr_bits;
		}
	}
	if (status < 0) {
		/* -ENOSPC is an expected outcome, not an error to log. */
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	ac->ac_bits_given += *num_clusters;

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2543
/*
 * Convenience wrapper: claim everything still outstanding on @ac in
 * one request.
 */
int ocfs2_claim_clusters(handle_t *handle,
			 struct ocfs2_alloc_context *ac,
			 u32 min_clusters,
			 u32 *cluster_start,
			 u32 *num_clusters)
{
	return __ocfs2_claim_clusters(handle, ac, min_clusters,
				      ac->ac_bits_wanted - ac->ac_bits_given,
				      cluster_start, num_clusters);
}
2555
/*
 * Clear @num_bits bits starting at @bit_off in @bg's bitmap and bump
 * bg_free_bits_count accordingly.  When @undo_fn is given (cluster
 * bitmap only), the same operation is mirrored into the jbd2
 * committed-data copy under b_state_lock, so freed clusters cannot be
 * reallocated before the freeing transaction commits.
 *
 * Returns 0 or a negative error code.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo logging only makes sense for the global cluster bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_committed_data is only stable under b_state_lock. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	/* Clear the range, highest bit first, mirroring into the
	 * committed-data copy when undo logging is active. */
	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		/* Free count overflowed the group size: on-disk corruption.
		 * Drop the lock before erroring out. */
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/* Keep the cached longest-free-run hint up to date;
		 * never let it shrink below the caller's known value. */
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
					le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}
2636
/*
 * Reclaim suballocator-managed space back to the main bitmap.
 * This function first works on the suballocator, performing the
 * rec/alloc_inode cleanup, then switches to the main bitmap to
 * reclaim the released space.
 *
 * handle: The transaction handle
 * alloc_inode: The suballoc inode
 * alloc_bh: The buffer_head of the suballoc inode
 * group_bh: The buffer_head of the group descriptor managed by the
 *           suballocator.  The caller is responsible for releasing
 *           the input group_bh.
 */
static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle,
					   struct inode *alloc_inode,
					   struct buffer_head *alloc_bh,
					   struct buffer_head *group_bh)
{
	int idx, status = 0;
	int i, next_free_rec, len = 0;
	__le16 old_bg_contig_free_bits = 0;
	u16 start_bit;
	u32 tmp_used;
	u64 bg_blkno, start_blk;
	unsigned int count;
	struct ocfs2_chain_rec *rec;
	struct buffer_head *main_bm_bh = NULL;
	struct inode *main_bm_inode = NULL;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data;

	idx = le16_to_cpu(group->bg_chain);
	rec = &(cl->cl_recs[idx]);

	/* Make room in the transaction for the group-sized update below. */
	status = ocfs2_extend_trans(handle,
			ocfs2_calc_group_alloc_credits(osb->sb,
						       le16_to_cpu(cl->cl_cpg)));
	if (status) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Only clear the suballocator rec item in-place.
	 *
	 * If idx is not the last, we don't compress (remove the empty item)
	 * the cl_recs[]. If not, we need to do lots jobs.
	 *
	 * Compress cl_recs[] code example:
	 * if (idx != cl->cl_next_free_rec - 1)
	 *	memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1],
	 *		sizeof(struct ocfs2_chain_rec) *
	 *		(cl->cl_next_free_rec - idx - 1));
	 * for(i = idx; i < cl->cl_next_free_rec-1; i++) {
	 *	group->bg_chain = "later group->bg_chain";
	 *	group->bg_blkno = xxx;
	 *	... ...
	 * }
	 */

	/* Drop the whole group's bits from the allocator's totals. */
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total);
	fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total));

	/* Subtract 1 for the block group descriptor itself */
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1);

	tmp_used = le32_to_cpu(fe->i_clusters);
	fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg));

	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	/*
	 * fe->i_clusters now holds the shrunk on-disk total; mirror it
	 * into the in-memory inode.  This must be an assignment --
	 * subtracting the new total (the previous "-=") would leave
	 * ip_clusters wildly wrong.
	 */
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);

	ocfs2_journal_dirty(handle, alloc_bh);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	start_blk = le64_to_cpu(rec->c_blkno);
	count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc);

	/*
	 * If the rec is the last one, let's compress the chain list by
	 * removing the empty cl_recs[] at the end.  Record 0 is never
	 * removed, matching the reclaim bypass rules.
	 */
	next_free_rec = le16_to_cpu(cl->cl_next_free_rec);
	if (idx == (next_free_rec - 1)) {
		len++; /* the last item should be counted first */
		for (i = (next_free_rec - 2); i > 0; i--) {
			if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total)
				len++;
			else
				break;
		}
	}
	le16_add_cpu(&cl->cl_next_free_rec, -len);

	rec->c_free = 0;
	rec->c_total = 0;
	rec->c_blkno = 0;
	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh);
	memset(group, 0, sizeof(struct ocfs2_group_desc));

	/* prepare job for reclaim clusters */
	main_bm_inode = ocfs2_get_system_file_inode(osb,
						    GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
	if (!main_bm_inode)
		goto bail; /* ignore the error in reclaim path */

	inode_lock(main_bm_inode);

	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
	if (status < 0)
		goto free_bm_inode; /* ignore the error in reclaim path */

	ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno,
				     &start_bit);
	fe = (struct ocfs2_dinode *) main_bm_bh->b_data;
	cl = &fe->id2.i_chain;
	/* reuse group_bh, caller will release the input group_bh */
	group_bh = NULL;

	/* reclaim clusters to global_bitmap */
	status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto free_bm_bh;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	if ((count + start_bit) > le16_to_cpu(group->bg_bits)) {
		ocfs2_error(alloc_inode->i_sb,
			    "reclaim length (%d) beyands block group length (%d)",
			    count + start_bit, le16_to_cpu(group->bg_bits));
		goto free_group_bh;
	}

	old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, main_bm_inode,
					      group, group_bh,
					      start_bit, count, 0,
					      _ocfs2_clear_bit);
	if (status < 0) {
		mlog_errno(status);
		goto free_group_bh;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
					 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Re-set the bits we just cleared so bitmap and dinode
		 * stay consistent. */
		ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh,
					   start_bit, count,
					   le16_to_cpu(old_bg_contig_free_bits), 1);
		goto free_group_bh;
	}

	idx = le16_to_cpu(group->bg_chain);
	rec = &(cl->cl_recs[idx]);

	le32_add_cpu(&rec->c_free, count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, main_bm_bh);

free_group_bh:
	brelse(group_bh);

free_bm_bh:
	ocfs2_inode_unlock(main_bm_inode, 1);
	brelse(main_bm_bh);

free_bm_inode:
	inode_unlock(main_bm_inode);
	iput(main_bm_inode);

bail:
	return status;
}
2828
2829 /*
2830 * expects the suballoc inode to already be locked.
2831 */
/*
 * Free @count bits starting at @start_bit within the group at
 * @bg_blkno, updating both the group descriptor and the allocator
 * dinode.  @undo_fn, if given, mirrors the clears into the jbd2
 * committed-data copy (cluster bitmap only).  When this frees the
 * last used bit of a non-global group, the group's space is handed
 * back to the main bitmap.
 *
 * Expects the suballoc inode to already be locked.  Returns 0 or a
 * negative error code.
 */
static int _ocfs2_free_suballoc_bits(handle_t *handle,
				     struct inode *alloc_inode,
				     struct buffer_head *alloc_bh,
				     unsigned int start_bit,
				     u64 bg_blkno,
				     unsigned int count,
				     void (*undo_fn)(unsigned int bit,
						     unsigned long *bitmap))
{
	int idx, status = 0;
	u32 tmp_used;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *group;
	struct ocfs2_chain_rec *rec;
	__le16 old_bg_contig_free_bits = 0;

	/* The alloc_bh comes from ocfs2_free_dinode() or
	 * ocfs2_free_clusters(). The callers have all locked the
	 * allocator and gotten alloc_bh from the lock call. This
	 * validates the dinode buffer. Any corruption that has happened
	 * is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));

	trace_ocfs2_free_suballoc_bits(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		(unsigned long long)bg_blkno,
		start_bit, count);

	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	group = (struct ocfs2_group_desc *) group_bh->b_data;

	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));

	/* Snapshot the contig hint so a failed dinode update below can
	 * restore it when re-setting the bits. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		old_bg_contig_free_bits = group->bg_contig_free_bits;
	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
					      group, group_bh,
					      start_bit, count, 0, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		/* Roll back the bitmap clear so group and dinode agree. */
		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
				start_bit, count,
				le16_to_cpu(old_bg_contig_free_bits), 1);
		goto bail;
	}

	/* Credit the freed bits to the group's chain record and the
	 * allocator-wide used count. */
	idx = le16_to_cpu(group->bg_chain);
	rec = &(cl->cl_recs[idx]);

	le32_add_cpu(&rec->c_free, count);
	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
	ocfs2_journal_dirty(handle, alloc_bh);

	/*
	 * Reclaim suballocator free space.
	 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[]
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode) ||
	    (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) ||
	    (le16_to_cpu(cl->cl_next_free_rec) == 1)) {
		goto bail;
	}

	/* Best-effort: reclaim errors are deliberately not propagated. */
	_ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh);

bail:
	brelse(group_bh);
	return status;
}
2917
/*
 * Return @count bits to the suballocator without jbd2 undo logging;
 * thin wrapper around _ocfs2_free_suballoc_bits().
 */
int ocfs2_free_suballoc_bits(handle_t *handle,
			     struct inode *alloc_inode,
			     struct buffer_head *alloc_bh,
			     unsigned int start_bit,
			     u64 bg_blkno,
			     unsigned int count)
{
	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
					 start_bit, bg_blkno, count,
					 NULL);
}
2928
/* Free the single suballocator bit backing the inode described by @di. */
int ocfs2_free_dinode(handle_t *handle,
		      struct inode *inode_alloc_inode,
		      struct buffer_head *inode_alloc_bh,
		      struct ocfs2_dinode *di)
{
	u16 bit = le16_to_cpu(di->i_suballoc_bit);
	u64 bg_blkno;

	/* i_suballoc_loc, when set, records the exact (possibly
	 * discontiguous) group; otherwise derive it from the block. */
	if (di->i_suballoc_loc)
		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
	else
		bg_blkno = ocfs2_which_suballoc_group(
					le64_to_cpu(di->i_blkno), bit);

	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
					inode_alloc_bh, bit, bg_blkno, 1);
}
2943
/*
 * Free @num_clusters clusters starting at block @start_blk back to the
 * global bitmap, optionally mirroring the clears into the jbd2
 * committed-data copy via @undo_fn.
 */
static int _ocfs2_free_clusters(handle_t *handle,
				struct inode *bitmap_inode,
				struct buffer_head *bitmap_bh,
				u64 start_blk,
				unsigned int num_clusters,
				void (*undo_fn)(unsigned int bit,
						unsigned long *bitmap))
{
	int status;
	u16 group_bit;
	u64 group_blkno;

	/* You can't ever have a contiguous set of clusters
	 * bigger than a block group bitmap so we never have to worry
	 * about looping on them.
	 * This is expensive. We can safely remove once this stuff has
	 * gotten tested really well.
	 * (Checks that start_blk is cluster-aligned.) */
	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
			ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
						 start_blk)));

	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &group_blkno,
				     &group_bit);

	trace_ocfs2_free_clusters((unsigned long long)group_blkno,
				  (unsigned long long)start_blk,
				  group_bit, num_clusters);

	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
					   group_bit, group_blkno,
					   num_clusters, undo_fn);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	/* Let the local allocator know space came free. */
	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
					 num_clusters);
	return status;
}
2987
/*
 * Free previously-used clusters back to the global bitmap, protecting
 * the freed bits in the journal's undo buffer until commit.
 */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	int ret;

	ret = _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				   start_blk, num_clusters,
				   _ocfs2_set_bit);
	return ret;
}
2998
2999 /*
3000 * Give never-used clusters back to the global bitmap. We don't need
3001 * to protect these bits in the undo buffer.
3002 */
int ocfs2_release_clusters(handle_t *handle,
			   struct inode *bitmap_inode,
			   struct buffer_head *bitmap_bh,
			   u64 start_blk,
			   unsigned int num_clusters)
{
	int ret;

	/* Clear-bit undo: these clusters were never used, so there is
	 * nothing worth preserving in the undo buffer. */
	ret = _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				   start_blk, num_clusters,
				   _ocfs2_clear_bit);
	return ret;
}
3013
3014 /*
3015 * For a given allocation, determine which allocators will need to be
3016 * accessed, and lock them, reserving the appropriate number of bits.
3017 *
3018 * Sparse file systems call this from ocfs2_write_begin_nolock()
3019 * and ocfs2_allocate_unwritten_extents().
3020 *
3021 * File systems which don't support holes call this from
3022 * ocfs2_extend_allocation().
3023 */
/*
 * Reserve metadata and (optionally) data allocation contexts for an
 * upcoming extent insert of @clusters_to_add clusters plus up to
 * @extents_to_split record splits.
 *
 * On success, *meta_ac is set when new extent blocks might be needed and
 * *data_ac is set when clusters_to_add != 0.  On error both are NULL;
 * any context reserved before the failure is torn down here.
 */
int ocfs2_lock_allocators(struct inode *inode,
			  struct ocfs2_extent_tree *et,
			  u32 clusters_to_add, u32 extents_to_split,
			  struct ocfs2_alloc_context **data_ac,
			  struct ocfs2_alloc_context **meta_ac)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	/* Each split may consume two extra records. */
	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
	int num_free_extents;
	int ret = 0;

	*meta_ac = NULL;
	if (data_ac)
		*data_ac = NULL;

	BUG_ON(clusters_to_add != 0 && data_ac == NULL);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Sparse allocation file systems need to be more conservative
	 * with reserving room for expansion - the actual allocation
	 * happens while we've got a journal handle open so re-taking
	 * a cluster lock (because we ran out of room for another
	 * extent) will violate ordering rules.
	 *
	 * Most of the time we'll only be seeing this 1 cluster at a time
	 * anyway.
	 *
	 * Always lock for any unwritten extents - we might want to
	 * add blocks during a split.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}
	}

	if (clusters_to_add) {
		ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
		if (ret < 0 && ret != -ENOSPC)
			mlog_errno(ret);
	}

out:
	if (ret && *meta_ac) {
		ocfs2_free_alloc_context(*meta_ac);
		*meta_ac = NULL;
	}

	/*
	 * On error *data_ac is guaranteed NULL (the data reservation is
	 * the last step), so there is nothing more to unwind here.
	 */
	return ret;
}
3094
3095 /*
3096 * Read the inode specified by blkno to get suballoc_slot and
3097 * suballoc_bit.
3098 */
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
				       u16 *suballoc_slot, u64 *group_blkno,
				       u16 *suballoc_bit)
{
	struct ocfs2_dinode *fe;
	struct buffer_head *bh = NULL;
	int status;

	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);

	/* dirty read disk */
	status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
	if (status < 0) {
		mlog(ML_ERROR, "read block %llu failed %d\n",
		     (unsigned long long)blkno, status);
		goto bail;
	}

	fe = (struct ocfs2_dinode *) bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, "invalid inode %llu requested\n",
		     (unsigned long long)blkno);
		status = -EINVAL;
		goto bail;
	}

	/* The slot is either INVALID (global allocator) or must fall
	 * within [0, max_slots). */
	if (le16_to_cpu(fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
	    (u32)le16_to_cpu(fe->i_suballoc_slot) > osb->max_slots - 1) {
		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
		     (unsigned long long)blkno,
		     (u32)le16_to_cpu(fe->i_suballoc_slot));
		status = -EINVAL;
		goto bail;
	}

	/* All output pointers are optional. */
	if (suballoc_slot)
		*suballoc_slot = le16_to_cpu(fe->i_suballoc_slot);
	if (suballoc_bit)
		*suballoc_bit = le16_to_cpu(fe->i_suballoc_bit);
	if (group_blkno)
		*group_blkno = le64_to_cpu(fe->i_suballoc_loc);

bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}
3148
3149 /*
3150 * test whether bit is SET in allocator bitmap or not. on success, 0
3151 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
3152 * is returned and *res is meaningless. Call this after you have
3153 * cluster locked against suballoc, or you may get a result based on
3154 * non-up2date contents
3155 */
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
				   struct inode *suballoc,
				   struct buffer_head *alloc_bh,
				   u64 group_blkno, u64 blkno,
				   u16 bit, int *res)
{
	struct ocfs2_dinode *di;
	struct ocfs2_group_desc *gd;
	struct buffer_head *bg_bh = NULL;
	u64 bg_blkno;
	int status, quiet = 0, released = 0;

	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
				      (unsigned int)bit);

	/* Reject bit numbers beyond the allocator's group size. */
	di = (struct ocfs2_dinode *)alloc_bh->b_data;
	if ((bit + 1) > ocfs2_bits_per_group(&di->id2.i_chain)) {
		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
		     (unsigned int)bit,
		     ocfs2_bits_per_group(&di->id2.i_chain));
		status = -EINVAL;
		goto bail;
	}

	/* A caller-supplied group (discontiguous groups) wins over the
	 * group computed from the block number. */
	if (group_blkno)
		bg_blkno = group_blkno;
	else
		bg_blkno = ocfs2_which_suballoc_group(blkno, bit);

	status = ocfs2_read_hint_group_descriptor(suballoc, di, bg_blkno,
						  &bg_bh, &released);
	if (released) {
		/* The hinted group is gone; report -ESTALE without the
		 * usual error logging. */
		quiet = 1;
		status = -ESTALE;
		goto bail;
	} else if (status < 0) {
		mlog(ML_ERROR, "read group %llu failed %d\n",
		     (unsigned long long)bg_blkno, status);
		goto bail;
	}

	gd = (struct ocfs2_group_desc *) bg_bh->b_data;
	*res = ocfs2_test_bit(bit, (unsigned long *)gd->bg_bitmap);

bail:
	brelse(bg_bh);

	if (status && !quiet)
		mlog_errno(status);
	return status;
}
3204
3205 /*
3206 * Test if the bit representing this inode (blkno) is set in the
3207 * suballocator.
3208 *
3209 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
3210 *
3211 * In the event of failure, a negative value is returned and *res is
3212 * meaningless.
3213 *
3214 * Callers must make sure to hold nfs_sync_lock to prevent
3215 * ocfs2_delete_inode() on another node from accessing the same
3216 * suballocator concurrently.
3217 */
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
	int status, quiet = 0;
	u64 group_blkno = 0;
	u16 suballoc_bit = 0, suballoc_slot = 0;
	struct inode *inode_alloc_inode;
	struct buffer_head *alloc_bh = NULL;

	trace_ocfs2_test_inode_bit((unsigned long long)blkno);

	/* Read the target inode off disk to learn which allocator slot,
	 * group and bit describe it. */
	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
					     &group_blkno, &suballoc_bit);
	if (status < 0) {
		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
		goto bail;
	}

	/* An INVALID slot means the inode lives in the global allocator;
	 * otherwise use the per-slot inode allocator. */
	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	else
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	if (!inode_alloc_inode) {
		/* the error code could be inaccurate, but we are not able to
		 * get the correct one. */
		status = -EINVAL;
		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
		     (u32)suballoc_slot);
		goto bail;
	}

	/* Lock order: VFS inode_lock first, then the cluster lock. */
	inode_lock(inode_alloc_inode);
	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
	if (status < 0) {
		/* Cluster lock failed: drop the VFS lock and the inode
		 * reference taken above before bailing. */
		inode_unlock(inode_alloc_inode);
		iput(inode_alloc_inode);
		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
		     (u32)suballoc_slot, status);
		goto bail;
	}

	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
					 group_blkno, blkno, suballoc_bit, res);
	if (status < 0) {
		/* -ESTALE (group went away) is expected; don't log it. */
		if (status == -ESTALE)
			quiet = 1;
		else
			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
	}

	/* Release in reverse order of acquisition. */
	ocfs2_inode_unlock(inode_alloc_inode, 0);
	inode_unlock(inode_alloc_inode);

	iput(inode_alloc_inode);
	brelse(alloc_bh);
bail:
	if (status && !quiet)
		mlog_errno(status);
	return status;
}
3279