1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * dir.c 4 * 5 * Creates, reads, walks and deletes directory-nodes 6 * 7 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * 9 * Portions of this code from linux/fs/ext3/dir.c 10 * 11 * Copyright (C) 1992, 1993, 1994, 1995 12 * Remy Card (card@masi.ibp.fr) 13 * Laboratoire MASI - Institut Blaise pascal 14 * Universite Pierre et Marie Curie (Paris VI) 15 * 16 * from 17 * 18 * linux/fs/minix/dir.c 19 * 20 * Copyright (C) 1991, 1992 Linus Torvalds 21 */ 22 23 #include <linux/fs.h> 24 #include <linux/types.h> 25 #include <linux/slab.h> 26 #include <linux/highmem.h> 27 #include <linux/quotaops.h> 28 #include <linux/sort.h> 29 #include <linux/iversion.h> 30 31 #include <cluster/masklog.h> 32 33 #include "ocfs2.h" 34 35 #include "alloc.h" 36 #include "blockcheck.h" 37 #include "dir.h" 38 #include "dlmglue.h" 39 #include "extent_map.h" 40 #include "file.h" 41 #include "inode.h" 42 #include "journal.h" 43 #include "namei.h" 44 #include "suballoc.h" 45 #include "super.h" 46 #include "sysfile.h" 47 #include "uptodate.h" 48 #include "ocfs2_trace.h" 49 50 #include "buffer_head_io.h" 51 52 #define NAMEI_RA_CHUNKS 2 53 #define NAMEI_RA_BLOCKS 4 54 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) 55 56 static int ocfs2_do_extend_dir(struct super_block *sb, 57 handle_t *handle, 58 struct inode *dir, 59 struct buffer_head *parent_fe_bh, 60 struct ocfs2_alloc_context *data_ac, 61 struct ocfs2_alloc_context *meta_ac, 62 struct buffer_head **new_bh); 63 static int ocfs2_dir_indexed(struct inode *inode); 64 65 /* 66 * These are distinct checks because future versions of the file system will 67 * want to have a trailing dirent structure independent of indexing. 
68 */ 69 static int ocfs2_supports_dir_trailer(struct inode *dir) 70 { 71 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 72 73 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 74 return 0; 75 76 return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir); 77 } 78 79 /* 80 * "new' here refers to the point at which we're creating a new 81 * directory via "mkdir()", but also when we're expanding an inline 82 * directory. In either case, we don't yet have the indexing bit set 83 * on the directory, so the standard checks will fail in when metaecc 84 * is turned off. Only directory-initialization type functions should 85 * use this then. Everything else wants ocfs2_supports_dir_trailer() 86 */ 87 static int ocfs2_new_dir_wants_trailer(struct inode *dir) 88 { 89 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 90 91 return ocfs2_meta_ecc(osb) || 92 ocfs2_supports_indexed_dirs(osb); 93 } 94 95 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) 96 { 97 return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); 98 } 99 100 #define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) 101 102 /* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make 103 * them more consistent? */ 104 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, 105 void *data) 106 { 107 char *p = data; 108 109 p += blocksize - sizeof(struct ocfs2_dir_block_trailer); 110 return (struct ocfs2_dir_block_trailer *)p; 111 } 112 113 /* 114 * XXX: This is executed once on every dirent. We should consider optimizing 115 * it. 
116 */ 117 static int ocfs2_skip_dir_trailer(struct inode *dir, 118 struct ocfs2_dir_entry *de, 119 unsigned long offset, 120 unsigned long blklen) 121 { 122 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); 123 124 if (!ocfs2_supports_dir_trailer(dir)) 125 return 0; 126 127 if (offset != toff) 128 return 0; 129 130 return 1; 131 } 132 133 static void ocfs2_init_dir_trailer(struct inode *inode, 134 struct buffer_head *bh, u16 rec_len) 135 { 136 struct ocfs2_dir_block_trailer *trailer; 137 138 trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); 139 strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); 140 trailer->db_compat_rec_len = 141 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); 142 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 143 trailer->db_blkno = cpu_to_le64(bh->b_blocknr); 144 trailer->db_free_rec_len = cpu_to_le16(rec_len); 145 } 146 /* 147 * Link an unindexed block with a dir trailer structure into the index free 148 * list. This function will modify dirdata_bh, but assumes you've already 149 * passed it to the journal. 
 *
 * The dx root block is journalled for write here. The data block becomes
 * the new head of the free list: its trailer's db_free_next takes the old
 * head and the root's dr_free_blk is pointed at the data block.
 *
 * Returns 0 on success or the error from ocfs2_journal_access_dr().
 */
static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
				     struct buffer_head *dx_root_bh,
				     struct buffer_head *dirdata_bh)
{
	int ret;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dir_block_trailer *trailer;

	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;

	/* Push onto the head of the singly linked free list. */
	trailer->db_free_next = dx_root->dr_free_blk;
	dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);

	ocfs2_journal_dirty(handle, dx_root_bh);

out:
	return ret;
}

/* True when the lookup result carries no previous-leaf buffer. */
static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
{
	return res->dl_prev_leaf_bh == NULL;
}

/* Drop the references on all four buffer_heads held by a lookup result. */
void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
{
	brelse(res->dl_dx_root_bh);
	brelse(res->dl_leaf_bh);
	brelse(res->dl_dx_leaf_bh);
	brelse(res->dl_prev_leaf_bh);
}

/* Returns 1 if the inode has OCFS2_INDEXED_DIR_FL set, 0 otherwise. */
static int ocfs2_dir_indexed(struct inode *inode)
{
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
		return 1;
	return 0;
}

/* Non-zero when the dx root has OCFS2_DX_FLAG_INLINE set in dr_flags. */
static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
{
	return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
}

/*
 * Hashing code adapted from ext3
 */
#define DELTA 0x9E3779B9

/*
 * Run 16 rounds that mix the four words in[0..3] into buf[0] and buf[1].
 * Only the first two words of buf are read and updated.
 */
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
	__u32	sum = 0;
	__u32	b0 = buf[0], b1 = buf[1];
	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
	int	n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

/*
 * Pack up to num*4 bytes of msg into num 32-bit words of buf, four bytes
 * per word. The padding word is built from the low byte(s) of the original
 * len; it seeds each word and fills any words left over after the input
 * is consumed.
 */
static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
{
	__u32	pad, val;
	int	i;

	pad = (__u32)len | ((__u32)len << 8);
	pad |= pad << 16;

	val = pad;
	/* Never consume more input than fits in the output words. */
	if (len > num*4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	/* Flush the partially filled word, then pad out the remainder. */
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

/*
 * Compute the index hash of 'name' (length 'len') and store it in hinfo
 * as major_hash/minor_hash.
 *
 * "." and ".." always hash to 0/0. Every other name is hashed in 16-byte
 * chunks with TEA_transform(), starting from the per-superblock seed
 * osb_dx_seed.
 */
static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
				   struct ocfs2_dx_hinfo *hinfo)
{
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	const char	*p;
	__u32		in[8], buf[4];

	/*
	 * XXX: Is this really necessary, if the index is never looked
	 * at by readdir? Is a hash value of '0' a bad idea?
	 */
	if ((len == 1 && !strncmp(".", name, 1)) ||
	    (len == 2 && !strncmp("..", name, 2))) {
		buf[0] = buf[1] = 0;
		goto out;
	}

#ifdef OCFS2_DEBUG_DX_DIRS
	/*
	 * This makes it very easy to debug indexing problems. We
	 * should never allow this to be selected without hand editing
	 * this file though.
	 */
	buf[0] = buf[1] = len;
	goto out;
#endif

	memcpy(buf, osb->osb_dx_seed, sizeof(buf));

	p = name;
	/* str2hashbuf() is asked for 4 words, i.e. 16 name bytes per pass. */
	while (len > 0) {
		str2hashbuf(p, len, in, 4);
		TEA_transform(buf, in);
		len -= 16;
		p += 16;
	}

out:
	hinfo->major_hash = buf[0];
	hinfo->minor_hash = buf[1];
}

/*
 * bh passed here can be an inode block or a dir data block, depending
 * on the inode inline data flag.
296 */ 297 static int ocfs2_check_dir_entry(struct inode * dir, 298 struct ocfs2_dir_entry * de, 299 struct buffer_head * bh, 300 unsigned long offset) 301 { 302 const char *error_msg = NULL; 303 const int rlen = le16_to_cpu(de->rec_len); 304 305 if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) 306 error_msg = "rec_len is smaller than minimal"; 307 else if (unlikely(rlen % 4 != 0)) 308 error_msg = "rec_len % 4 != 0"; 309 else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) 310 error_msg = "rec_len is too small for name_len"; 311 else if (unlikely( 312 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)) 313 error_msg = "directory entry across blocks"; 314 315 if (unlikely(error_msg != NULL)) 316 mlog(ML_ERROR, "bad entry in directory #%llu: %s - " 317 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", 318 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, 319 offset, (unsigned long long)le64_to_cpu(de->inode), rlen, 320 de->name_len); 321 322 return error_msg == NULL ? 1 : 0; 323 } 324 325 static inline int ocfs2_match(int len, 326 const char * const name, 327 struct ocfs2_dir_entry *de) 328 { 329 if (len != de->name_len) 330 return 0; 331 if (!de->inode) 332 return 0; 333 return !memcmp(name, de->name, len); 334 } 335 336 /* 337 * Returns 0 if not found, -1 on failure, and 1 on success 338 */ 339 static inline int ocfs2_search_dirblock(struct buffer_head *bh, 340 struct inode *dir, 341 const char *name, int namelen, 342 unsigned long offset, 343 char *first_de, 344 unsigned int bytes, 345 struct ocfs2_dir_entry **res_dir) 346 { 347 struct ocfs2_dir_entry *de; 348 char *dlimit, *de_buf; 349 int de_len; 350 int ret = 0; 351 352 de_buf = first_de; 353 dlimit = de_buf + bytes; 354 355 while (de_buf < dlimit) { 356 /* this code is executed quadratically often */ 357 /* do minimal checking `by hand' */ 358 359 de = (struct ocfs2_dir_entry *) de_buf; 360 361 if (de_buf + namelen <= dlimit && 362 ocfs2_match(namelen, name, de)) { 363 /* found a match - just 
to be sure, do a full check */ 364 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { 365 ret = -1; 366 goto bail; 367 } 368 *res_dir = de; 369 ret = 1; 370 goto bail; 371 } 372 373 /* prevent looping on a bad block */ 374 de_len = le16_to_cpu(de->rec_len); 375 if (de_len <= 0) { 376 ret = -1; 377 goto bail; 378 } 379 380 de_buf += de_len; 381 offset += de_len; 382 } 383 384 bail: 385 trace_ocfs2_search_dirblock(ret); 386 return ret; 387 } 388 389 static struct buffer_head *ocfs2_find_entry_id(const char *name, 390 int namelen, 391 struct inode *dir, 392 struct ocfs2_dir_entry **res_dir) 393 { 394 int ret, found; 395 struct buffer_head *di_bh = NULL; 396 struct ocfs2_dinode *di; 397 struct ocfs2_inline_data *data; 398 399 ret = ocfs2_read_inode_block(dir, &di_bh); 400 if (ret) { 401 mlog_errno(ret); 402 goto out; 403 } 404 405 di = (struct ocfs2_dinode *)di_bh->b_data; 406 data = &di->id2.i_data; 407 408 found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0, 409 data->id_data, i_size_read(dir), res_dir); 410 if (found == 1) 411 return di_bh; 412 413 brelse(di_bh); 414 out: 415 return NULL; 416 } 417 418 static int ocfs2_validate_dir_block(struct super_block *sb, 419 struct buffer_head *bh) 420 { 421 int rc; 422 struct ocfs2_dir_block_trailer *trailer = 423 ocfs2_trailer_from_bh(bh, sb); 424 425 426 /* 427 * We don't validate dirents here, that's handled 428 * in-place when the code walks them. 429 */ 430 trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); 431 432 BUG_ON(!buffer_uptodate(bh)); 433 434 /* 435 * If the ecc fails, we return the error but otherwise 436 * leave the filesystem running. We know any error is 437 * local to this block. 438 * 439 * Note that we are safe to call this even if the directory 440 * doesn't have a trailer. Filesystems without metaecc will do 441 * nothing, and filesystems with it will have one. 
442 */ 443 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); 444 if (rc) 445 mlog(ML_ERROR, "Checksum failed for dinode %llu\n", 446 (unsigned long long)bh->b_blocknr); 447 448 return rc; 449 } 450 451 /* 452 * Validate a directory trailer. 453 * 454 * We check the trailer here rather than in ocfs2_validate_dir_block() 455 * because that function doesn't have the inode to test. 456 */ 457 static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) 458 { 459 int rc = 0; 460 struct ocfs2_dir_block_trailer *trailer; 461 462 trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); 463 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { 464 rc = ocfs2_error(dir->i_sb, 465 "Invalid dirblock #%llu: signature = %.*s\n", 466 (unsigned long long)bh->b_blocknr, 7, 467 trailer->db_signature); 468 goto out; 469 } 470 if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { 471 rc = ocfs2_error(dir->i_sb, 472 "Directory block #%llu has an invalid db_blkno of %llu\n", 473 (unsigned long long)bh->b_blocknr, 474 (unsigned long long)le64_to_cpu(trailer->db_blkno)); 475 goto out; 476 } 477 if (le64_to_cpu(trailer->db_parent_dinode) != 478 OCFS2_I(dir)->ip_blkno) { 479 rc = ocfs2_error(dir->i_sb, 480 "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", 481 (unsigned long long)bh->b_blocknr, 482 (unsigned long long)OCFS2_I(dir)->ip_blkno, 483 (unsigned long long)le64_to_cpu(trailer->db_blkno)); 484 goto out; 485 } 486 out: 487 return rc; 488 } 489 490 /* 491 * This function forces all errors to -EIO for consistency with its 492 * predecessor, ocfs2_bread(). We haven't audited what returning the 493 * real error codes would do to callers. We log the real codes with 494 * mlog_errno() before we squash them. 
495 */ 496 static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, 497 struct buffer_head **bh, int flags) 498 { 499 int rc = 0; 500 struct buffer_head *tmp = *bh; 501 502 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, 503 ocfs2_validate_dir_block); 504 if (rc) { 505 mlog_errno(rc); 506 goto out; 507 } 508 509 if (!(flags & OCFS2_BH_READAHEAD) && 510 ocfs2_supports_dir_trailer(inode)) { 511 rc = ocfs2_check_dir_trailer(inode, tmp); 512 if (rc) { 513 if (!*bh) 514 brelse(tmp); 515 mlog_errno(rc); 516 goto out; 517 } 518 } 519 520 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ 521 if (!*bh) 522 *bh = tmp; 523 524 out: 525 return rc ? -EIO : 0; 526 } 527 528 /* 529 * Read the block at 'phys' which belongs to this directory 530 * inode. This function does no virtual->physical block translation - 531 * what's passed in is assumed to be a valid directory block. 532 */ 533 static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, 534 struct buffer_head **bh) 535 { 536 int ret; 537 struct buffer_head *tmp = *bh; 538 539 ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, 540 ocfs2_validate_dir_block); 541 if (ret) { 542 mlog_errno(ret); 543 goto out; 544 } 545 546 if (ocfs2_supports_dir_trailer(dir)) { 547 ret = ocfs2_check_dir_trailer(dir, tmp); 548 if (ret) { 549 if (!*bh) 550 brelse(tmp); 551 mlog_errno(ret); 552 goto out; 553 } 554 } 555 556 if (!ret && !*bh) 557 *bh = tmp; 558 out: 559 return ret; 560 } 561 562 static int ocfs2_validate_dx_root(struct super_block *sb, 563 struct buffer_head *bh) 564 { 565 int ret; 566 struct ocfs2_dx_root_block *dx_root; 567 568 BUG_ON(!buffer_uptodate(bh)); 569 570 dx_root = (struct ocfs2_dx_root_block *) bh->b_data; 571 572 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); 573 if (ret) { 574 mlog(ML_ERROR, 575 "Checksum failed for dir index root block %llu\n", 576 (unsigned long long)bh->b_blocknr); 577 return ret; 578 } 579 580 if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) 
{ 581 ret = ocfs2_error(sb, 582 "Dir Index Root # %llu has bad signature %.*s\n", 583 (unsigned long long)le64_to_cpu(dx_root->dr_blkno), 584 7, dx_root->dr_signature); 585 } 586 587 return ret; 588 } 589 590 static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, 591 struct buffer_head **dx_root_bh) 592 { 593 int ret; 594 u64 blkno = le64_to_cpu(di->i_dx_root); 595 struct buffer_head *tmp = *dx_root_bh; 596 597 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, 598 ocfs2_validate_dx_root); 599 600 /* If ocfs2_read_block() got us a new bh, pass it up. */ 601 if (!ret && !*dx_root_bh) 602 *dx_root_bh = tmp; 603 604 return ret; 605 } 606 607 static int ocfs2_validate_dx_leaf(struct super_block *sb, 608 struct buffer_head *bh) 609 { 610 int ret; 611 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; 612 613 BUG_ON(!buffer_uptodate(bh)); 614 615 ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); 616 if (ret) { 617 mlog(ML_ERROR, 618 "Checksum failed for dir index leaf block %llu\n", 619 (unsigned long long)bh->b_blocknr); 620 return ret; 621 } 622 623 if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { 624 ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", 625 7, dx_leaf->dl_signature); 626 } 627 628 return ret; 629 } 630 631 static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, 632 struct buffer_head **dx_leaf_bh) 633 { 634 int ret; 635 struct buffer_head *tmp = *dx_leaf_bh; 636 637 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, 638 ocfs2_validate_dx_leaf); 639 640 /* If ocfs2_read_block() got us a new bh, pass it up. */ 641 if (!ret && !*dx_leaf_bh) 642 *dx_leaf_bh = tmp; 643 644 return ret; 645 } 646 647 /* 648 * Read a series of dx_leaf blocks. This expects all buffer_head 649 * pointers to be NULL on function entry. 
650 */ 651 static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, 652 struct buffer_head **dx_leaf_bhs) 653 { 654 int ret; 655 656 ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, 657 ocfs2_validate_dx_leaf); 658 if (ret) 659 mlog_errno(ret); 660 661 return ret; 662 } 663 664 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 665 struct inode *dir, 666 struct ocfs2_dir_entry **res_dir) 667 { 668 struct super_block *sb; 669 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 670 struct buffer_head *bh, *ret = NULL; 671 unsigned long start, block, b; 672 int ra_max = 0; /* Number of bh's in the readahead 673 buffer, bh_use[] */ 674 int ra_ptr = 0; /* Current index into readahead 675 buffer */ 676 int num = 0; 677 int nblocks, i; 678 679 sb = dir->i_sb; 680 681 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 682 start = OCFS2_I(dir)->ip_dir_start_lookup; 683 if (start >= nblocks) 684 start = 0; 685 block = start; 686 687 restart: 688 do { 689 /* 690 * We deal with the read-ahead logic here. 691 */ 692 if (ra_ptr >= ra_max) { 693 /* Refill the readahead buffer */ 694 ra_ptr = 0; 695 b = block; 696 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { 697 /* 698 * Terminate if we reach the end of the 699 * directory and must wrap, or if our 700 * search has finished at this block. 701 */ 702 if (b >= nblocks || (num && block == start)) { 703 bh_use[ra_max] = NULL; 704 break; 705 } 706 num++; 707 708 bh = NULL; 709 ocfs2_read_dir_block(dir, b++, &bh, 710 OCFS2_BH_READAHEAD); 711 bh_use[ra_max] = bh; 712 } 713 } 714 if ((bh = bh_use[ra_ptr++]) == NULL) 715 goto next; 716 if (ocfs2_read_dir_block(dir, block, &bh, 0)) { 717 /* read error, skip block & hope for the best. 718 * ocfs2_read_dir_block() has released the bh. 
*/ 719 mlog(ML_ERROR, "reading directory %llu, " 720 "offset %lu\n", 721 (unsigned long long)OCFS2_I(dir)->ip_blkno, 722 block); 723 goto next; 724 } 725 i = ocfs2_search_dirblock(bh, dir, name, namelen, 726 block << sb->s_blocksize_bits, 727 bh->b_data, sb->s_blocksize, 728 res_dir); 729 if (i == 1) { 730 OCFS2_I(dir)->ip_dir_start_lookup = block; 731 ret = bh; 732 goto cleanup_and_exit; 733 } else { 734 brelse(bh); 735 if (i < 0) 736 goto cleanup_and_exit; 737 } 738 next: 739 if (++block >= nblocks) 740 block = 0; 741 } while (block != start); 742 743 /* 744 * If the directory has grown while we were searching, then 745 * search the last part of the directory before giving up. 746 */ 747 block = nblocks; 748 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 749 if (block < nblocks) { 750 start = 0; 751 goto restart; 752 } 753 754 cleanup_and_exit: 755 /* Clean up the read-ahead blocks */ 756 for (; ra_ptr < ra_max; ra_ptr++) 757 brelse(bh_use[ra_ptr]); 758 759 trace_ocfs2_find_entry_el(ret); 760 return ret; 761 } 762 763 static int ocfs2_dx_dir_lookup_rec(struct inode *inode, 764 struct ocfs2_extent_list *el, 765 u32 major_hash, 766 u32 *ret_cpos, 767 u64 *ret_phys_blkno, 768 unsigned int *ret_clen) 769 { 770 int ret = 0, i, found; 771 struct buffer_head *eb_bh = NULL; 772 struct ocfs2_extent_block *eb; 773 struct ocfs2_extent_rec *rec = NULL; 774 775 if (el->l_tree_depth) { 776 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, 777 &eb_bh); 778 if (ret) { 779 mlog_errno(ret); 780 goto out; 781 } 782 783 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 784 el = &eb->h_list; 785 786 if (el->l_tree_depth) { 787 ret = ocfs2_error(inode->i_sb, 788 "Inode %lu has non zero tree depth in btree tree block %llu\n", 789 inode->i_ino, 790 (unsigned long long)eb_bh->b_blocknr); 791 goto out; 792 } 793 } 794 795 found = 0; 796 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 797 rec = &el->l_recs[i]; 798 799 if (le32_to_cpu(rec->e_cpos) <= major_hash) { 
800 found = 1; 801 break; 802 } 803 } 804 805 if (!found) { 806 ret = ocfs2_error(inode->i_sb, 807 "Inode %lu has bad extent record (%u, %u, 0) in btree\n", 808 inode->i_ino, 809 le32_to_cpu(rec->e_cpos), 810 ocfs2_rec_clusters(el, rec)); 811 goto out; 812 } 813 814 if (ret_phys_blkno) 815 *ret_phys_blkno = le64_to_cpu(rec->e_blkno); 816 if (ret_cpos) 817 *ret_cpos = le32_to_cpu(rec->e_cpos); 818 if (ret_clen) 819 *ret_clen = le16_to_cpu(rec->e_leaf_clusters); 820 821 out: 822 brelse(eb_bh); 823 return ret; 824 } 825 826 /* 827 * Returns the block index, from the start of the cluster which this 828 * hash belongs too. 829 */ 830 static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, 831 u32 minor_hash) 832 { 833 return minor_hash & osb->osb_dx_mask; 834 } 835 836 static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, 837 struct ocfs2_dx_hinfo *hinfo) 838 { 839 return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); 840 } 841 842 static int ocfs2_dx_dir_lookup(struct inode *inode, 843 struct ocfs2_extent_list *el, 844 struct ocfs2_dx_hinfo *hinfo, 845 u32 *ret_cpos, 846 u64 *ret_phys_blkno) 847 { 848 int ret = 0; 849 unsigned int cend, clen; 850 u32 cpos; 851 u64 blkno; 852 u32 name_hash = hinfo->major_hash; 853 854 ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, 855 &clen); 856 if (ret) { 857 mlog_errno(ret); 858 goto out; 859 } 860 861 cend = cpos + clen; 862 if (name_hash >= cend) { 863 /* We want the last cluster */ 864 blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); 865 cpos += clen - 1; 866 } else { 867 blkno += ocfs2_clusters_to_blocks(inode->i_sb, 868 name_hash - cpos); 869 cpos = name_hash; 870 } 871 872 /* 873 * We now have the cluster which should hold our entry. To 874 * find the exact block from the start of the cluster to 875 * search, we take the lower bits of the hash. 
876 */ 877 blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); 878 879 if (ret_phys_blkno) 880 *ret_phys_blkno = blkno; 881 if (ret_cpos) 882 *ret_cpos = cpos; 883 884 out: 885 886 return ret; 887 } 888 889 static int ocfs2_dx_dir_search(const char *name, int namelen, 890 struct inode *dir, 891 struct ocfs2_dx_root_block *dx_root, 892 struct ocfs2_dir_lookup_result *res) 893 { 894 int ret, i, found; 895 u64 phys; 896 struct buffer_head *dx_leaf_bh = NULL; 897 struct ocfs2_dx_leaf *dx_leaf; 898 struct ocfs2_dx_entry *dx_entry = NULL; 899 struct buffer_head *dir_ent_bh = NULL; 900 struct ocfs2_dir_entry *dir_ent = NULL; 901 struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; 902 struct ocfs2_extent_list *dr_el; 903 struct ocfs2_dx_entry_list *entry_list; 904 905 ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); 906 907 if (ocfs2_dx_root_inline(dx_root)) { 908 entry_list = &dx_root->dr_entries; 909 goto search; 910 } 911 912 dr_el = &dx_root->dr_list; 913 914 ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); 915 if (ret) { 916 mlog_errno(ret); 917 goto out; 918 } 919 920 trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, 921 namelen, name, hinfo->major_hash, 922 hinfo->minor_hash, (unsigned long long)phys); 923 924 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); 925 if (ret) { 926 mlog_errno(ret); 927 goto out; 928 } 929 930 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; 931 932 trace_ocfs2_dx_dir_search_leaf_info( 933 le16_to_cpu(dx_leaf->dl_list.de_num_used), 934 le16_to_cpu(dx_leaf->dl_list.de_count)); 935 936 entry_list = &dx_leaf->dl_list; 937 938 search: 939 /* 940 * Empty leaf is legal, so no need to check for that. 
941 */ 942 found = 0; 943 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { 944 dx_entry = &entry_list->de_entries[i]; 945 946 if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) 947 || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) 948 continue; 949 950 /* 951 * Search unindexed leaf block now. We're not 952 * guaranteed to find anything. 953 */ 954 ret = ocfs2_read_dir_block_direct(dir, 955 le64_to_cpu(dx_entry->dx_dirent_blk), 956 &dir_ent_bh); 957 if (ret) { 958 mlog_errno(ret); 959 goto out; 960 } 961 962 /* 963 * XXX: We should check the unindexed block here, 964 * before using it. 965 */ 966 967 found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, 968 0, dir_ent_bh->b_data, 969 dir->i_sb->s_blocksize, &dir_ent); 970 if (found == 1) 971 break; 972 973 if (found == -1) { 974 /* This means we found a bad directory entry. */ 975 ret = -EIO; 976 mlog_errno(ret); 977 goto out; 978 } 979 980 brelse(dir_ent_bh); 981 dir_ent_bh = NULL; 982 } 983 984 if (found <= 0) { 985 ret = -ENOENT; 986 goto out; 987 } 988 989 res->dl_leaf_bh = dir_ent_bh; 990 res->dl_entry = dir_ent; 991 res->dl_dx_leaf_bh = dx_leaf_bh; 992 res->dl_dx_entry = dx_entry; 993 994 ret = 0; 995 out: 996 if (ret) { 997 brelse(dx_leaf_bh); 998 brelse(dir_ent_bh); 999 } 1000 return ret; 1001 } 1002 1003 static int ocfs2_find_entry_dx(const char *name, int namelen, 1004 struct inode *dir, 1005 struct ocfs2_dir_lookup_result *lookup) 1006 { 1007 int ret; 1008 struct buffer_head *di_bh = NULL; 1009 struct ocfs2_dinode *di; 1010 struct buffer_head *dx_root_bh = NULL; 1011 struct ocfs2_dx_root_block *dx_root; 1012 1013 ret = ocfs2_read_inode_block(dir, &di_bh); 1014 if (ret) { 1015 mlog_errno(ret); 1016 goto out; 1017 } 1018 1019 di = (struct ocfs2_dinode *)di_bh->b_data; 1020 1021 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); 1022 if (ret) { 1023 mlog_errno(ret); 1024 goto out; 1025 } 1026 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 1027 1028 ret = 
ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); 1029 if (ret) { 1030 if (ret != -ENOENT) 1031 mlog_errno(ret); 1032 goto out; 1033 } 1034 1035 lookup->dl_dx_root_bh = dx_root_bh; 1036 dx_root_bh = NULL; 1037 out: 1038 brelse(di_bh); 1039 brelse(dx_root_bh); 1040 return ret; 1041 } 1042 1043 /* 1044 * Try to find an entry of the provided name within 'dir'. 1045 * 1046 * If nothing was found, -ENOENT is returned. Otherwise, zero is 1047 * returned and the struct 'res' will contain information useful to 1048 * other directory manipulation functions. 1049 * 1050 * Caller can NOT assume anything about the contents of the 1051 * buffer_heads - they are passed back only so that it can be passed 1052 * into any one of the manipulation functions (add entry, delete 1053 * entry, etc). As an example, bh in the extent directory case is a 1054 * data block, in the inline-data case it actually points to an inode, 1055 * in the indexed directory case, multiple buffers are involved. 1056 */ 1057 int ocfs2_find_entry(const char *name, int namelen, 1058 struct inode *dir, struct ocfs2_dir_lookup_result *lookup) 1059 { 1060 struct buffer_head *bh; 1061 struct ocfs2_dir_entry *res_dir = NULL; 1062 1063 if (ocfs2_dir_indexed(dir)) 1064 return ocfs2_find_entry_dx(name, namelen, dir, lookup); 1065 1066 /* 1067 * The unindexed dir code only uses part of the lookup 1068 * structure, so there's no reason to push it down further 1069 * than this. 1070 */ 1071 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1072 bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); 1073 else 1074 bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); 1075 1076 if (bh == NULL) 1077 return -ENOENT; 1078 1079 lookup->dl_leaf_bh = bh; 1080 lookup->dl_entry = res_dir; 1081 return 0; 1082 } 1083 1084 /* 1085 * Update inode number and type of a previously found directory entry. 
 *
 * The entry and its buffer come from 'res' (dl_entry / dl_leaf_bh). The
 * buffer is journalled for write, the entry's inode number and type are
 * set from 'new_entry_inode', and the buffer is marked dirty.
 *
 * Returns 0 on success or the error from the journal access call.
 */
int ocfs2_update_entry(struct inode *dir, handle_t *handle,
		       struct ocfs2_dir_lookup_result *res,
		       struct inode *new_entry_inode)
{
	int ret;
	ocfs2_journal_access_func access = ocfs2_journal_access_db;
	struct ocfs2_dir_entry *de = res->dl_entry;
	struct buffer_head *de_bh = res->dl_leaf_bh;

	/*
	 * The same code works fine for both inline-data and extent
	 * based directories, so no need to split this up.  The only
	 * difference is the journal_access function.
	 */

	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		access = ocfs2_journal_access_di;

	ret = access(handle, INODE_CACHE(dir), de_bh,
		     OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
	ocfs2_set_de_type(de, new_entry_inode->i_mode);

	ocfs2_journal_dirty(handle, de_bh);

out:
	return ret;
}

/*
 * __ocfs2_delete_entry deletes a directory entry by merging it with the
 * previous entry
 *
 * The dirents in [first_de, first_de + bytes) are walked, each one being
 * checked with ocfs2_check_dir_entry(), until de_del is reached. Its
 * rec_len is then added to the preceding entry (if there is one), its
 * inode number is zeroed and the directory's i_version is bumped.
 *
 * Returns -ENOENT if de_del is not reached, -EIO on a bad entry or a
 * journal access failure, otherwise the (non-negative) result of the
 * journal access call.
 */
static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
				struct ocfs2_dir_entry *de_del,
				struct buffer_head *bh, char *first_de,
				unsigned int bytes)
{
	struct ocfs2_dir_entry *de, *pde;
	int i, status = -ENOENT;
	ocfs2_journal_access_func access = ocfs2_journal_access_db;

	/* Inline-data directories live in the inode block. */
	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		access = ocfs2_journal_access_di;

	i = 0;
	pde = NULL;
	de = (struct ocfs2_dir_entry *) first_de;
	while (i < bytes) {
		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
			status = -EIO;
			mlog_errno(status);
			goto bail;
		}
		if (de == de_del)  {
			status = access(handle, INODE_CACHE(dir), bh,
					OCFS2_JOURNAL_ACCESS_WRITE);
			if (status < 0) {
				/* The real error code is squashed to -EIO. */
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			/* Fold the deleted record's space into its predecessor. */
			if (pde)
				le16_add_cpu(&pde->rec_len,
						le16_to_cpu(de->rec_len));
			de->inode = 0;
			inode_inc_iversion(dir);
			ocfs2_journal_dirty(handle, bh);
			goto bail;
		}
		i += le16_to_cpu(de->rec_len);
		pde = de;
		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
	}
bail:
	return status;
}

/*
 * Number of bytes in this dirent's record that are not needed by the
 * entry itself: the whole record if the entry is unused (inode == 0),
 * otherwise rec_len minus the space required for its name.
 */
static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
{
	unsigned int hole;

	if (le64_to_cpu(de->inode) == 0)
		hole = le16_to_cpu(de->rec_len);
	else
		hole = le16_to_cpu(de->rec_len) -
			OCFS2_DIR_REC_LEN(de->name_len);

	return hole;
}

/*
 * Walk every dirent in the block up to the trailer and return the size
 * of the largest hole found, or 0 if that hole is smaller than
 * OCFS2_DIR_MIN_REC_LEN.
 *
 * NOTE(review): the walk advances by each entry's rec_len without
 * checking it; a zero rec_len would keep the loop from advancing -
 * confirm callers only pass blocks whose entries have been validated.
 */
static int ocfs2_find_max_rec_len(struct super_block *sb,
				  struct buffer_head *dirblock_bh)
{
	int size, this_hole, largest_hole = 0;
	char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
	struct ocfs2_dir_entry *de;

	trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
	size = ocfs2_dir_trailer_blk_off(sb);
	limit = start + size;
	de_buf = start;
	de = (struct ocfs2_dir_entry *)de_buf;
	do {
		if (de_buf != trailer) {
			this_hole = ocfs2_figure_dirent_hole(de);
			if (this_hole > largest_hole)
				largest_hole = this_hole;
		}

		de_buf += le16_to_cpu(de->rec_len);
		de = (struct ocfs2_dir_entry *)de_buf;
	} while (de_buf < limit);

	if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
		return largest_hole;
	return 0;
}

/*
 * Remove the entry at 'index' from an index entry list: later entries
 * are shifted down by one, the vacated last slot is zeroed and
 * de_num_used is decremented.
 */
static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
				       int index)
{
	int num_used = le16_to_cpu(entry_list->de_num_used);

	/* Removing the last used entry needs no shifting. */
	if (num_used == 1 || index == (num_used - 1))
		goto clear;

	memmove(&entry_list->de_entries[index],
		&entry_list->de_entries[index + 1],
		(num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
clear:
	num_used--;
	memset(&entry_list->de_entries[num_used], 0,
	       sizeof(struct ocfs2_dx_entry));
	entry_list->de_num_used = cpu_to_le16(num_used);
}

static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
				 struct ocfs2_dir_lookup_result *lookup)
{
	int ret, index, max_rec_len, add_to_free_list = 0;
	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
	struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
	struct ocfs2_dx_leaf *dx_leaf;
	struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
	struct ocfs2_dir_block_trailer *trailer;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_entry_list *entry_list;

	/*
	 * This function gets a bit messy because we might have to
	 * modify the root block, regardless of whether the indexed
	 * entries are stored inline.
	 */

	/*
	 * *Only* set 'entry_list' here, based on where we're looking
	 * for the indexed entries. Later, we might still want to
	 * journal both blocks, based on free list state.
	 */
	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
	if (ocfs2_dx_root_inline(dx_root)) {
		entry_list = &dx_root->dr_entries;
	} else {
		dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
		entry_list = &dx_leaf->dl_list;
	}

	/* Neither of these are a disk corruption - that should have
	 * been caught by lookup, before we got here.
*/ 1262 BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); 1263 BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); 1264 1265 index = (char *)dx_entry - (char *)entry_list->de_entries; 1266 index /= sizeof(*dx_entry); 1267 1268 if (index >= le16_to_cpu(entry_list->de_num_used)) { 1269 mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", 1270 (unsigned long long)OCFS2_I(dir)->ip_blkno, index, 1271 entry_list, dx_entry); 1272 return -EIO; 1273 } 1274 1275 /* 1276 * We know that removal of this dirent will leave enough room 1277 * for a new one, so add this block to the free list if it 1278 * isn't already there. 1279 */ 1280 trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); 1281 if (trailer->db_free_rec_len == 0) 1282 add_to_free_list = 1; 1283 1284 /* 1285 * Add the block holding our index into the journal before 1286 * removing the unindexed entry. If we get an error return 1287 * from __ocfs2_delete_entry(), then it hasn't removed the 1288 * entry yet. Likewise, successful return means we *must* 1289 * remove the indexed entry. 1290 * 1291 * We're also careful to journal the root tree block here as 1292 * the entry count needs to be updated. Also, we might be 1293 * adding to the start of the free list. 
1294 */ 1295 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, 1296 OCFS2_JOURNAL_ACCESS_WRITE); 1297 if (ret) { 1298 mlog_errno(ret); 1299 goto out; 1300 } 1301 1302 if (!ocfs2_dx_root_inline(dx_root)) { 1303 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 1304 lookup->dl_dx_leaf_bh, 1305 OCFS2_JOURNAL_ACCESS_WRITE); 1306 if (ret) { 1307 mlog_errno(ret); 1308 goto out; 1309 } 1310 } 1311 1312 trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, 1313 index); 1314 1315 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, 1316 leaf_bh, leaf_bh->b_data, leaf_bh->b_size); 1317 if (ret) { 1318 mlog_errno(ret); 1319 goto out; 1320 } 1321 1322 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); 1323 trailer->db_free_rec_len = cpu_to_le16(max_rec_len); 1324 if (add_to_free_list) { 1325 trailer->db_free_next = dx_root->dr_free_blk; 1326 dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); 1327 ocfs2_journal_dirty(handle, dx_root_bh); 1328 } 1329 1330 /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ 1331 ocfs2_journal_dirty(handle, leaf_bh); 1332 1333 le32_add_cpu(&dx_root->dr_num_entries, -1); 1334 ocfs2_journal_dirty(handle, dx_root_bh); 1335 1336 ocfs2_dx_list_remove_entry(entry_list, index); 1337 1338 if (!ocfs2_dx_root_inline(dx_root)) 1339 ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); 1340 1341 out: 1342 return ret; 1343 } 1344 1345 static inline int ocfs2_delete_entry_id(handle_t *handle, 1346 struct inode *dir, 1347 struct ocfs2_dir_entry *de_del, 1348 struct buffer_head *bh) 1349 { 1350 int ret; 1351 struct buffer_head *di_bh = NULL; 1352 struct ocfs2_dinode *di; 1353 struct ocfs2_inline_data *data; 1354 1355 ret = ocfs2_read_inode_block(dir, &di_bh); 1356 if (ret) { 1357 mlog_errno(ret); 1358 goto out; 1359 } 1360 1361 di = (struct ocfs2_dinode *)di_bh->b_data; 1362 data = &di->id2.i_data; 1363 1364 ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data, 1365 
i_size_read(dir)); 1366 1367 brelse(di_bh); 1368 out: 1369 return ret; 1370 } 1371 1372 static inline int ocfs2_delete_entry_el(handle_t *handle, 1373 struct inode *dir, 1374 struct ocfs2_dir_entry *de_del, 1375 struct buffer_head *bh) 1376 { 1377 return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data, 1378 bh->b_size); 1379 } 1380 1381 /* 1382 * Delete a directory entry. Hide the details of directory 1383 * implementation from the caller. 1384 */ 1385 int ocfs2_delete_entry(handle_t *handle, 1386 struct inode *dir, 1387 struct ocfs2_dir_lookup_result *res) 1388 { 1389 if (ocfs2_dir_indexed(dir)) 1390 return ocfs2_delete_entry_dx(handle, dir, res); 1391 1392 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1393 return ocfs2_delete_entry_id(handle, dir, res->dl_entry, 1394 res->dl_leaf_bh); 1395 1396 return ocfs2_delete_entry_el(handle, dir, res->dl_entry, 1397 res->dl_leaf_bh); 1398 } 1399 1400 /* 1401 * Check whether 'de' has enough room to hold an entry of 1402 * 'new_rec_len' bytes. 1403 */ 1404 static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, 1405 unsigned int new_rec_len) 1406 { 1407 unsigned int de_really_used; 1408 1409 /* Check whether this is an empty record with enough space */ 1410 if (le64_to_cpu(de->inode) == 0 && 1411 le16_to_cpu(de->rec_len) >= new_rec_len) 1412 return 1; 1413 1414 /* 1415 * Record might have free space at the end which we can 1416 * use. 
1417 */ 1418 de_really_used = OCFS2_DIR_REC_LEN(de->name_len); 1419 if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len)) 1420 return 1; 1421 1422 return 0; 1423 } 1424 1425 static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, 1426 struct ocfs2_dx_entry *dx_new_entry) 1427 { 1428 int i; 1429 1430 i = le16_to_cpu(dx_leaf->dl_list.de_num_used); 1431 dx_leaf->dl_list.de_entries[i] = *dx_new_entry; 1432 1433 le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); 1434 } 1435 1436 static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, 1437 struct ocfs2_dx_hinfo *hinfo, 1438 u64 dirent_blk) 1439 { 1440 int i; 1441 struct ocfs2_dx_entry *dx_entry; 1442 1443 i = le16_to_cpu(entry_list->de_num_used); 1444 dx_entry = &entry_list->de_entries[i]; 1445 1446 memset(dx_entry, 0, sizeof(*dx_entry)); 1447 dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); 1448 dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); 1449 dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); 1450 1451 le16_add_cpu(&entry_list->de_num_used, 1); 1452 } 1453 1454 static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, 1455 struct ocfs2_dx_hinfo *hinfo, 1456 u64 dirent_blk, 1457 struct buffer_head *dx_leaf_bh) 1458 { 1459 int ret; 1460 struct ocfs2_dx_leaf *dx_leaf; 1461 1462 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 1463 OCFS2_JOURNAL_ACCESS_WRITE); 1464 if (ret) { 1465 mlog_errno(ret); 1466 goto out; 1467 } 1468 1469 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 1470 ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); 1471 ocfs2_journal_dirty(handle, dx_leaf_bh); 1472 1473 out: 1474 return ret; 1475 } 1476 1477 static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, 1478 struct ocfs2_dx_hinfo *hinfo, 1479 u64 dirent_blk, 1480 struct ocfs2_dx_root_block *dx_root) 1481 { 1482 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); 1483 } 1484 1485 static 
int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, 1486 struct ocfs2_dir_lookup_result *lookup) 1487 { 1488 int ret = 0; 1489 struct ocfs2_dx_root_block *dx_root; 1490 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1491 1492 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, 1493 OCFS2_JOURNAL_ACCESS_WRITE); 1494 if (ret) { 1495 mlog_errno(ret); 1496 goto out; 1497 } 1498 1499 dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; 1500 if (ocfs2_dx_root_inline(dx_root)) { 1501 ocfs2_dx_inline_root_insert(dir, handle, 1502 &lookup->dl_hinfo, 1503 lookup->dl_leaf_bh->b_blocknr, 1504 dx_root); 1505 } else { 1506 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, 1507 lookup->dl_leaf_bh->b_blocknr, 1508 lookup->dl_dx_leaf_bh); 1509 if (ret) 1510 goto out; 1511 } 1512 1513 le32_add_cpu(&dx_root->dr_num_entries, 1); 1514 ocfs2_journal_dirty(handle, dx_root_bh); 1515 1516 out: 1517 return ret; 1518 } 1519 1520 static void ocfs2_remove_block_from_free_list(struct inode *dir, 1521 handle_t *handle, 1522 struct ocfs2_dir_lookup_result *lookup) 1523 { 1524 struct ocfs2_dir_block_trailer *trailer, *prev; 1525 struct ocfs2_dx_root_block *dx_root; 1526 struct buffer_head *bh; 1527 1528 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); 1529 1530 if (ocfs2_free_list_at_root(lookup)) { 1531 bh = lookup->dl_dx_root_bh; 1532 dx_root = (struct ocfs2_dx_root_block *)bh->b_data; 1533 dx_root->dr_free_blk = trailer->db_free_next; 1534 } else { 1535 bh = lookup->dl_prev_leaf_bh; 1536 prev = ocfs2_trailer_from_bh(bh, dir->i_sb); 1537 prev->db_free_next = trailer->db_free_next; 1538 } 1539 1540 trailer->db_free_rec_len = cpu_to_le16(0); 1541 trailer->db_free_next = cpu_to_le64(0); 1542 1543 ocfs2_journal_dirty(handle, bh); 1544 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); 1545 } 1546 1547 /* 1548 * This expects that a journal write has been reserved on 1549 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh 
1550 */ 1551 static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, 1552 struct ocfs2_dir_lookup_result *lookup) 1553 { 1554 int max_rec_len; 1555 struct ocfs2_dir_block_trailer *trailer; 1556 1557 /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ 1558 max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); 1559 if (max_rec_len) { 1560 /* 1561 * There's still room in this block, so no need to remove it 1562 * from the free list. In this case, we just want to update 1563 * the rec len accounting. 1564 */ 1565 trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); 1566 trailer->db_free_rec_len = cpu_to_le16(max_rec_len); 1567 ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); 1568 } else { 1569 ocfs2_remove_block_from_free_list(dir, handle, lookup); 1570 } 1571 } 1572 1573 /* we don't always have a dentry for what we want to add, so people 1574 * like orphan dir can call this instead. 1575 * 1576 * The lookup context must have been filled from 1577 * ocfs2_prepare_dir_for_insert. 1578 */ 1579 int __ocfs2_add_entry(handle_t *handle, 1580 struct inode *dir, 1581 const char *name, int namelen, 1582 struct inode *inode, u64 blkno, 1583 struct buffer_head *parent_fe_bh, 1584 struct ocfs2_dir_lookup_result *lookup) 1585 { 1586 unsigned long offset; 1587 unsigned short rec_len; 1588 struct ocfs2_dir_entry *de, *de1; 1589 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1590 struct super_block *sb = dir->i_sb; 1591 int retval; 1592 unsigned int size = sb->s_blocksize; 1593 struct buffer_head *insert_bh = lookup->dl_leaf_bh; 1594 char *data_start = insert_bh->b_data; 1595 1596 if (ocfs2_dir_indexed(dir)) { 1597 struct buffer_head *bh; 1598 1599 /* 1600 * An indexed dir may require that we update the free space 1601 * list. Reserve a write to the previous node in the list so 1602 * that we don't fail later. 
1603 * 1604 * XXX: This can be either a dx_root_block, or an unindexed 1605 * directory tree leaf block. 1606 */ 1607 if (ocfs2_free_list_at_root(lookup)) { 1608 bh = lookup->dl_dx_root_bh; 1609 retval = ocfs2_journal_access_dr(handle, 1610 INODE_CACHE(dir), bh, 1611 OCFS2_JOURNAL_ACCESS_WRITE); 1612 } else { 1613 bh = lookup->dl_prev_leaf_bh; 1614 retval = ocfs2_journal_access_db(handle, 1615 INODE_CACHE(dir), bh, 1616 OCFS2_JOURNAL_ACCESS_WRITE); 1617 } 1618 if (retval) { 1619 mlog_errno(retval); 1620 return retval; 1621 } 1622 } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1623 data_start = di->id2.i_data.id_data; 1624 size = i_size_read(dir); 1625 1626 BUG_ON(insert_bh != parent_fe_bh); 1627 } 1628 1629 rec_len = OCFS2_DIR_REC_LEN(namelen); 1630 offset = 0; 1631 de = (struct ocfs2_dir_entry *) data_start; 1632 while (1) { 1633 BUG_ON((char *)de >= (size + data_start)); 1634 1635 /* These checks should've already been passed by the 1636 * prepare function, but I guess we can leave them 1637 * here anyway. */ 1638 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { 1639 retval = -ENOENT; 1640 goto bail; 1641 } 1642 if (ocfs2_match(namelen, name, de)) { 1643 retval = -EEXIST; 1644 goto bail; 1645 } 1646 1647 /* We're guaranteed that we should have space, so we 1648 * can't possibly have hit the trailer...right? */ 1649 mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), 1650 "Hit dir trailer trying to insert %.*s " 1651 "(namelen %d) into directory %llu. 
" 1652 "offset is %lu, trailer offset is %d\n", 1653 namelen, name, namelen, 1654 (unsigned long long)parent_fe_bh->b_blocknr, 1655 offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); 1656 1657 if (ocfs2_dirent_would_fit(de, rec_len)) { 1658 inode_set_mtime_to_ts(dir, 1659 inode_set_ctime_current(dir)); 1660 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 1661 if (retval < 0) { 1662 mlog_errno(retval); 1663 goto bail; 1664 } 1665 1666 if (insert_bh == parent_fe_bh) 1667 retval = ocfs2_journal_access_di(handle, 1668 INODE_CACHE(dir), 1669 insert_bh, 1670 OCFS2_JOURNAL_ACCESS_WRITE); 1671 else { 1672 retval = ocfs2_journal_access_db(handle, 1673 INODE_CACHE(dir), 1674 insert_bh, 1675 OCFS2_JOURNAL_ACCESS_WRITE); 1676 1677 if (!retval && ocfs2_dir_indexed(dir)) 1678 retval = ocfs2_dx_dir_insert(dir, 1679 handle, 1680 lookup); 1681 } 1682 1683 if (retval) { 1684 mlog_errno(retval); 1685 goto bail; 1686 } 1687 1688 /* By now the buffer is marked for journaling */ 1689 offset += le16_to_cpu(de->rec_len); 1690 if (le64_to_cpu(de->inode)) { 1691 de1 = (struct ocfs2_dir_entry *)((char *) de + 1692 OCFS2_DIR_REC_LEN(de->name_len)); 1693 de1->rec_len = 1694 cpu_to_le16(le16_to_cpu(de->rec_len) - 1695 OCFS2_DIR_REC_LEN(de->name_len)); 1696 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); 1697 de = de1; 1698 } 1699 de->file_type = FT_UNKNOWN; 1700 if (blkno) { 1701 de->inode = cpu_to_le64(blkno); 1702 ocfs2_set_de_type(de, inode->i_mode); 1703 } else 1704 de->inode = 0; 1705 de->name_len = namelen; 1706 memcpy(de->name, name, namelen); 1707 1708 if (ocfs2_dir_indexed(dir)) 1709 ocfs2_recalc_free_list(dir, handle, lookup); 1710 1711 inode_inc_iversion(dir); 1712 ocfs2_journal_dirty(handle, insert_bh); 1713 retval = 0; 1714 goto bail; 1715 } 1716 1717 offset += le16_to_cpu(de->rec_len); 1718 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); 1719 } 1720 1721 /* when you think about it, the assert above should prevent us 1722 * from ever 
getting here. */ 1723 retval = -ENOSPC; 1724 bail: 1725 if (retval) 1726 mlog_errno(retval); 1727 1728 return retval; 1729 } 1730 1731 static int ocfs2_dir_foreach_blk_id(struct inode *inode, 1732 u64 *f_version, 1733 struct dir_context *ctx) 1734 { 1735 int ret, i; 1736 unsigned long offset = ctx->pos; 1737 struct buffer_head *di_bh = NULL; 1738 struct ocfs2_dinode *di; 1739 struct ocfs2_inline_data *data; 1740 struct ocfs2_dir_entry *de; 1741 1742 ret = ocfs2_read_inode_block(inode, &di_bh); 1743 if (ret) { 1744 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 1745 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1746 goto out; 1747 } 1748 1749 di = (struct ocfs2_dinode *)di_bh->b_data; 1750 data = &di->id2.i_data; 1751 1752 while (ctx->pos < i_size_read(inode)) { 1753 /* If the dir block has changed since the last call to 1754 * readdir(2), then we might be pointing to an invalid 1755 * dirent right now. Scan from the start of the block 1756 * to make sure. */ 1757 if (!inode_eq_iversion(inode, *f_version)) { 1758 for (i = 0; i < i_size_read(inode) && i < offset; ) { 1759 de = (struct ocfs2_dir_entry *) 1760 (data->id_data + i); 1761 /* It's too expensive to do a full 1762 * dirent test each time round this 1763 * loop, but we do have to test at 1764 * least that it is non-zero. A 1765 * failure will be detected in the 1766 * dirent test below. */ 1767 if (le16_to_cpu(de->rec_len) < 1768 OCFS2_DIR_REC_LEN(1)) 1769 break; 1770 i += le16_to_cpu(de->rec_len); 1771 } 1772 ctx->pos = offset = i; 1773 *f_version = inode_query_iversion(inode); 1774 } 1775 1776 de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); 1777 if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { 1778 /* On error, skip the f_pos to the end. 
*/ 1779 ctx->pos = i_size_read(inode); 1780 break; 1781 } 1782 offset += le16_to_cpu(de->rec_len); 1783 if (le64_to_cpu(de->inode)) { 1784 if (!dir_emit(ctx, de->name, de->name_len, 1785 le64_to_cpu(de->inode), 1786 fs_ftype_to_dtype(de->file_type))) 1787 goto out; 1788 } 1789 ctx->pos += le16_to_cpu(de->rec_len); 1790 } 1791 out: 1792 brelse(di_bh); 1793 return 0; 1794 } 1795 1796 /* 1797 * NOTE: This function can be called against unindexed directories, 1798 * and indexed ones. 1799 */ 1800 static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1801 u64 *f_version, 1802 struct dir_context *ctx, 1803 bool persist) 1804 { 1805 unsigned long offset, blk, last_ra_blk = 0; 1806 int i; 1807 struct buffer_head * bh, * tmp; 1808 struct ocfs2_dir_entry * de; 1809 struct super_block * sb = inode->i_sb; 1810 unsigned int ra_sectors = 16; 1811 int stored = 0; 1812 1813 bh = NULL; 1814 1815 offset = ctx->pos & (sb->s_blocksize - 1); 1816 1817 while (ctx->pos < i_size_read(inode)) { 1818 blk = ctx->pos >> sb->s_blocksize_bits; 1819 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { 1820 /* Skip the corrupt dirblock and keep trying */ 1821 ctx->pos += sb->s_blocksize - offset; 1822 continue; 1823 } 1824 1825 /* The idea here is to begin with 8k read-ahead and to stay 1826 * 4k ahead of our current position. 1827 * 1828 * TODO: Use the pagecache for this. We just need to 1829 * make sure it's cluster-safe... */ 1830 if (!last_ra_blk 1831 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { 1832 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 1833 i > 0; i--) { 1834 tmp = NULL; 1835 if (!ocfs2_read_dir_block(inode, ++blk, &tmp, 1836 OCFS2_BH_READAHEAD)) 1837 brelse(tmp); 1838 } 1839 last_ra_blk = blk; 1840 ra_sectors = 8; 1841 } 1842 1843 /* If the dir block has changed since the last call to 1844 * readdir(2), then we might be pointing to an invalid 1845 * dirent right now. Scan from the start of the block 1846 * to make sure. 
*/ 1847 if (!inode_eq_iversion(inode, *f_version)) { 1848 for (i = 0; i < sb->s_blocksize && i < offset; ) { 1849 de = (struct ocfs2_dir_entry *) (bh->b_data + i); 1850 /* It's too expensive to do a full 1851 * dirent test each time round this 1852 * loop, but we do have to test at 1853 * least that it is non-zero. A 1854 * failure will be detected in the 1855 * dirent test below. */ 1856 if (le16_to_cpu(de->rec_len) < 1857 OCFS2_DIR_REC_LEN(1)) 1858 break; 1859 i += le16_to_cpu(de->rec_len); 1860 } 1861 offset = i; 1862 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) 1863 | offset; 1864 *f_version = inode_query_iversion(inode); 1865 } 1866 1867 while (ctx->pos < i_size_read(inode) 1868 && offset < sb->s_blocksize) { 1869 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 1870 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1871 /* On error, skip the f_pos to the 1872 next block. */ 1873 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; 1874 break; 1875 } 1876 if (le64_to_cpu(de->inode)) { 1877 if (!dir_emit(ctx, de->name, 1878 de->name_len, 1879 le64_to_cpu(de->inode), 1880 fs_ftype_to_dtype(de->file_type))) { 1881 brelse(bh); 1882 return 0; 1883 } 1884 stored++; 1885 } 1886 offset += le16_to_cpu(de->rec_len); 1887 ctx->pos += le16_to_cpu(de->rec_len); 1888 } 1889 offset = 0; 1890 brelse(bh); 1891 bh = NULL; 1892 if (!persist && stored) 1893 break; 1894 } 1895 return 0; 1896 } 1897 1898 static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, 1899 struct dir_context *ctx, 1900 bool persist) 1901 { 1902 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1903 return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); 1904 return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); 1905 } 1906 1907 /* 1908 * This is intended to be called from inside other kernel functions, 1909 * so we fake some arguments. 
1910 */ 1911 int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) 1912 { 1913 u64 version = inode_query_iversion(inode); 1914 ocfs2_dir_foreach_blk(inode, &version, ctx, true); 1915 return 0; 1916 } 1917 1918 /* 1919 * ocfs2_readdir() 1920 * 1921 */ 1922 int ocfs2_readdir(struct file *file, struct dir_context *ctx) 1923 { 1924 int error = 0; 1925 struct inode *inode = file_inode(file); 1926 int lock_level = 0; 1927 1928 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); 1929 1930 error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); 1931 if (lock_level && error >= 0) { 1932 /* We release EX lock which used to update atime 1933 * and get PR lock again to reduce contention 1934 * on commonly accessed directories. */ 1935 ocfs2_inode_unlock(inode, 1); 1936 lock_level = 0; 1937 error = ocfs2_inode_lock(inode, NULL, 0); 1938 } 1939 if (error < 0) { 1940 if (error != -ENOENT) 1941 mlog_errno(error); 1942 /* we haven't got any yet, so propagate the error. */ 1943 goto bail_nolock; 1944 } 1945 1946 error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); 1947 1948 ocfs2_inode_unlock(inode, lock_level); 1949 if (error) 1950 mlog_errno(error); 1951 1952 bail_nolock: 1953 1954 return error; 1955 } 1956 1957 /* 1958 * NOTE: this should always be called with parent dir i_rwsem taken. 
1959 */ 1960 int ocfs2_find_files_on_disk(const char *name, 1961 int namelen, 1962 u64 *blkno, 1963 struct inode *inode, 1964 struct ocfs2_dir_lookup_result *lookup) 1965 { 1966 int status = -ENOENT; 1967 1968 trace_ocfs2_find_files_on_disk(namelen, name, blkno, 1969 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1970 1971 status = ocfs2_find_entry(name, namelen, inode, lookup); 1972 if (status) 1973 goto leave; 1974 1975 *blkno = le64_to_cpu(lookup->dl_entry->inode); 1976 1977 status = 0; 1978 leave: 1979 1980 return status; 1981 } 1982 1983 /* 1984 * Convenience function for callers which just want the block number 1985 * mapped to a name and don't require the full dirent info, etc. 1986 */ 1987 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 1988 int namelen, u64 *blkno) 1989 { 1990 int ret; 1991 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1992 1993 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup); 1994 ocfs2_free_dir_lookup_result(&lookup); 1995 1996 return ret; 1997 } 1998 1999 /* Check for a name within a directory. 
2000 * 2001 * Return 0 if the name does not exist 2002 * Return -EEXIST if the directory contains the name 2003 * 2004 * Callers should have i_rwsem + a cluster lock on dir 2005 */ 2006 int ocfs2_check_dir_for_entry(struct inode *dir, 2007 const char *name, 2008 int namelen) 2009 { 2010 int ret = 0; 2011 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2012 2013 trace_ocfs2_check_dir_for_entry( 2014 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2015 2016 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { 2017 ret = -EEXIST; 2018 mlog_errno(ret); 2019 } 2020 2021 ocfs2_free_dir_lookup_result(&lookup); 2022 2023 return ret; 2024 } 2025 2026 struct ocfs2_empty_dir_priv { 2027 struct dir_context ctx; 2028 unsigned seen_dot; 2029 unsigned seen_dot_dot; 2030 unsigned seen_other; 2031 unsigned dx_dir; 2032 }; 2033 static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, 2034 int name_len, loff_t pos, u64 ino, 2035 unsigned type) 2036 { 2037 struct ocfs2_empty_dir_priv *p = 2038 container_of(ctx, struct ocfs2_empty_dir_priv, ctx); 2039 2040 /* 2041 * Check the positions of "." and ".." records to be sure 2042 * they're in the correct place. 2043 * 2044 * Indexed directories don't need to proceed past the first 2045 * two entries, so we end the scan after seeing '..'. Despite 2046 * that, we allow the scan to proceed In the event that we 2047 * have a corrupted indexed directory (no dot or dot dot 2048 * entries). This allows us to double check for existing 2049 * entries which might not have been found in the index. 
2050 */ 2051 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { 2052 p->seen_dot = 1; 2053 return true; 2054 } 2055 2056 if (name_len == 2 && !strncmp("..", name, 2) && 2057 pos == OCFS2_DIR_REC_LEN(1)) { 2058 p->seen_dot_dot = 1; 2059 2060 if (p->dx_dir && p->seen_dot) 2061 return false; 2062 2063 return true; 2064 } 2065 2066 p->seen_other = 1; 2067 return false; 2068 } 2069 2070 static int ocfs2_empty_dir_dx(struct inode *inode, 2071 struct ocfs2_empty_dir_priv *priv) 2072 { 2073 int ret; 2074 struct buffer_head *di_bh = NULL; 2075 struct buffer_head *dx_root_bh = NULL; 2076 struct ocfs2_dinode *di; 2077 struct ocfs2_dx_root_block *dx_root; 2078 2079 priv->dx_dir = 1; 2080 2081 ret = ocfs2_read_inode_block(inode, &di_bh); 2082 if (ret) { 2083 mlog_errno(ret); 2084 goto out; 2085 } 2086 di = (struct ocfs2_dinode *)di_bh->b_data; 2087 2088 ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); 2089 if (ret) { 2090 mlog_errno(ret); 2091 goto out; 2092 } 2093 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2094 2095 if (le32_to_cpu(dx_root->dr_num_entries) != 2) 2096 priv->seen_other = 1; 2097 2098 out: 2099 brelse(di_bh); 2100 brelse(dx_root_bh); 2101 return ret; 2102 } 2103 2104 /* 2105 * routine to check that the specified directory is empty (for rmdir) 2106 * 2107 * Returns 1 if dir is empty, zero otherwise. 2108 * 2109 * XXX: This is a performance problem for unindexed directories. 2110 */ 2111 int ocfs2_empty_dir(struct inode *inode) 2112 { 2113 int ret; 2114 struct ocfs2_empty_dir_priv priv = { 2115 .ctx.actor = ocfs2_empty_dir_filldir, 2116 }; 2117 2118 if (ocfs2_dir_indexed(inode)) { 2119 ret = ocfs2_empty_dir_dx(inode, &priv); 2120 if (ret) 2121 mlog_errno(ret); 2122 /* 2123 * We still run ocfs2_dir_foreach to get the checks 2124 * for "." and "..". 
2125 */ 2126 } 2127 2128 ret = ocfs2_dir_foreach(inode, &priv.ctx); 2129 if (ret) 2130 mlog_errno(ret); 2131 2132 if (!priv.seen_dot || !priv.seen_dot_dot) { 2133 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", 2134 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2135 /* 2136 * XXX: Is it really safe to allow an unlink to continue? 2137 */ 2138 return 1; 2139 } 2140 2141 return !priv.seen_other; 2142 } 2143 2144 /* 2145 * Fills "." and ".." dirents in a new directory block. Returns dirent for 2146 * "..", which might be used during creation of a directory with a trailing 2147 * header. It is otherwise safe to ignore the return code. 2148 */ 2149 static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, 2150 struct inode *parent, 2151 char *start, 2152 unsigned int size) 2153 { 2154 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; 2155 2156 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); 2157 de->name_len = 1; 2158 de->rec_len = 2159 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); 2160 strcpy(de->name, "."); 2161 ocfs2_set_de_type(de, S_IFDIR); 2162 2163 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); 2164 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); 2165 de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1)); 2166 de->name_len = 2; 2167 strcpy(de->name, ".."); 2168 ocfs2_set_de_type(de, S_IFDIR); 2169 2170 return de; 2171 } 2172 2173 /* 2174 * This works together with code in ocfs2_mknod_locked() which sets 2175 * the inline-data flag and initializes the inline-data section. 
2176 */ 2177 static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, 2178 handle_t *handle, 2179 struct inode *parent, 2180 struct inode *inode, 2181 struct buffer_head *di_bh) 2182 { 2183 int ret; 2184 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2185 struct ocfs2_inline_data *data = &di->id2.i_data; 2186 unsigned int size = le16_to_cpu(data->id_count); 2187 2188 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 2189 OCFS2_JOURNAL_ACCESS_WRITE); 2190 if (ret) { 2191 mlog_errno(ret); 2192 goto out; 2193 } 2194 2195 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2196 ocfs2_journal_dirty(handle, di_bh); 2197 2198 i_size_write(inode, size); 2199 set_nlink(inode, 2); 2200 inode->i_blocks = ocfs2_inode_sector_count(inode); 2201 2202 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 2203 if (ret < 0) 2204 mlog_errno(ret); 2205 2206 out: 2207 return ret; 2208 } 2209 2210 static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, 2211 handle_t *handle, 2212 struct inode *parent, 2213 struct inode *inode, 2214 struct buffer_head *fe_bh, 2215 struct ocfs2_alloc_context *data_ac, 2216 struct buffer_head **ret_new_bh) 2217 { 2218 int status; 2219 unsigned int size = osb->sb->s_blocksize; 2220 struct buffer_head *new_bh = NULL; 2221 struct ocfs2_dir_entry *de; 2222 2223 if (ocfs2_new_dir_wants_trailer(inode)) 2224 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2225 2226 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 2227 data_ac, NULL, &new_bh); 2228 if (status < 0) { 2229 mlog_errno(status); 2230 goto bail; 2231 } 2232 2233 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2234 2235 status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, 2236 OCFS2_JOURNAL_ACCESS_CREATE); 2237 if (status < 0) { 2238 mlog_errno(status); 2239 goto bail; 2240 } 2241 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 2242 2243 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); 2244 if 
(ocfs2_new_dir_wants_trailer(inode)) { 2245 int size = le16_to_cpu(de->rec_len); 2246 2247 /* 2248 * Figure out the size of the hole left over after 2249 * insertion of '.' and '..'. The trailer wants this 2250 * information. 2251 */ 2252 size -= OCFS2_DIR_REC_LEN(2); 2253 size -= sizeof(struct ocfs2_dir_block_trailer); 2254 2255 ocfs2_init_dir_trailer(inode, new_bh, size); 2256 } 2257 2258 ocfs2_journal_dirty(handle, new_bh); 2259 2260 i_size_write(inode, inode->i_sb->s_blocksize); 2261 set_nlink(inode, 2); 2262 inode->i_blocks = ocfs2_inode_sector_count(inode); 2263 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 2264 if (status < 0) { 2265 mlog_errno(status); 2266 goto bail; 2267 } 2268 2269 status = 0; 2270 if (ret_new_bh) { 2271 *ret_new_bh = new_bh; 2272 new_bh = NULL; 2273 } 2274 bail: 2275 brelse(new_bh); 2276 2277 return status; 2278 } 2279 2280 static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, 2281 handle_t *handle, struct inode *dir, 2282 struct buffer_head *di_bh, 2283 struct buffer_head *dirdata_bh, 2284 struct ocfs2_alloc_context *meta_ac, 2285 int dx_inline, u32 num_entries, 2286 struct buffer_head **ret_dx_root_bh) 2287 { 2288 int ret; 2289 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2290 u16 dr_suballoc_bit; 2291 u64 suballoc_loc, dr_blkno; 2292 unsigned int num_bits; 2293 struct buffer_head *dx_root_bh = NULL; 2294 struct ocfs2_dx_root_block *dx_root; 2295 struct ocfs2_dir_block_trailer *trailer = 2296 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2297 2298 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, 2299 &dr_suballoc_bit, &num_bits, &dr_blkno); 2300 if (ret) { 2301 mlog_errno(ret); 2302 goto out; 2303 } 2304 2305 trace_ocfs2_dx_dir_attach_index( 2306 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2307 (unsigned long long)dr_blkno); 2308 2309 dx_root_bh = sb_getblk(osb->sb, dr_blkno); 2310 if (dx_root_bh == NULL) { 2311 ret = -ENOMEM; 2312 goto out; 2313 } 2314 
ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); 2315 2316 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, 2317 OCFS2_JOURNAL_ACCESS_CREATE); 2318 if (ret < 0) { 2319 mlog_errno(ret); 2320 goto out; 2321 } 2322 2323 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2324 memset(dx_root, 0, osb->sb->s_blocksize); 2325 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2326 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2327 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); 2328 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2329 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2330 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2331 dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); 2332 dx_root->dr_num_entries = cpu_to_le32(num_entries); 2333 if (le16_to_cpu(trailer->db_free_rec_len)) 2334 dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); 2335 else 2336 dx_root->dr_free_blk = cpu_to_le64(0); 2337 2338 if (dx_inline) { 2339 dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; 2340 dx_root->dr_entries.de_count = 2341 cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); 2342 } else { 2343 dx_root->dr_list.l_count = 2344 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2345 } 2346 ocfs2_journal_dirty(handle, dx_root_bh); 2347 2348 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2349 OCFS2_JOURNAL_ACCESS_CREATE); 2350 if (ret) { 2351 mlog_errno(ret); 2352 goto out; 2353 } 2354 2355 di->i_dx_root = cpu_to_le64(dr_blkno); 2356 2357 spin_lock(&OCFS2_I(dir)->ip_lock); 2358 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2359 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2360 spin_unlock(&OCFS2_I(dir)->ip_lock); 2361 2362 ocfs2_journal_dirty(handle, di_bh); 2363 2364 *ret_dx_root_bh = dx_root_bh; 2365 dx_root_bh = NULL; 2366 2367 out: 2368 brelse(dx_root_bh); 2369 return ret; 2370 } 2371 2372 static int ocfs2_dx_dir_format_cluster(struct 
ocfs2_super *osb, 2373 handle_t *handle, struct inode *dir, 2374 struct buffer_head **dx_leaves, 2375 int num_dx_leaves, u64 start_blk) 2376 { 2377 int ret, i; 2378 struct ocfs2_dx_leaf *dx_leaf; 2379 struct buffer_head *bh; 2380 2381 for (i = 0; i < num_dx_leaves; i++) { 2382 bh = sb_getblk(osb->sb, start_blk + i); 2383 if (bh == NULL) { 2384 ret = -ENOMEM; 2385 goto out; 2386 } 2387 dx_leaves[i] = bh; 2388 2389 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); 2390 2391 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, 2392 OCFS2_JOURNAL_ACCESS_CREATE); 2393 if (ret < 0) { 2394 mlog_errno(ret); 2395 goto out; 2396 } 2397 2398 dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; 2399 2400 memset(dx_leaf, 0, osb->sb->s_blocksize); 2401 strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); 2402 dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); 2403 dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); 2404 dx_leaf->dl_list.de_count = 2405 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); 2406 2407 trace_ocfs2_dx_dir_format_cluster( 2408 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2409 (unsigned long long)bh->b_blocknr, 2410 le16_to_cpu(dx_leaf->dl_list.de_count)); 2411 2412 ocfs2_journal_dirty(handle, bh); 2413 } 2414 2415 ret = 0; 2416 out: 2417 return ret; 2418 } 2419 2420 /* 2421 * Allocates and formats a new cluster for use in an indexed dir 2422 * leaf. This version will not do the extent insert, so that it can be 2423 * used by operations which need careful ordering. 
2424 */ 2425 static int __ocfs2_dx_dir_new_cluster(struct inode *dir, 2426 u32 cpos, handle_t *handle, 2427 struct ocfs2_alloc_context *data_ac, 2428 struct buffer_head **dx_leaves, 2429 int num_dx_leaves, u64 *ret_phys_blkno) 2430 { 2431 int ret; 2432 u32 phys, num; 2433 u64 phys_blkno; 2434 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2435 2436 /* 2437 * XXX: For create, this should claim cluster for the index 2438 * *before* the unindexed insert so that we have a better 2439 * chance of contiguousness as the directory grows in number 2440 * of entries. 2441 */ 2442 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); 2443 if (ret) { 2444 mlog_errno(ret); 2445 goto out; 2446 } 2447 2448 /* 2449 * Format the new cluster first. That way, we're inserting 2450 * valid data. 2451 */ 2452 phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); 2453 ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, 2454 num_dx_leaves, phys_blkno); 2455 if (ret) { 2456 mlog_errno(ret); 2457 goto out; 2458 } 2459 2460 *ret_phys_blkno = phys_blkno; 2461 out: 2462 return ret; 2463 } 2464 2465 static int ocfs2_dx_dir_new_cluster(struct inode *dir, 2466 struct ocfs2_extent_tree *et, 2467 u32 cpos, handle_t *handle, 2468 struct ocfs2_alloc_context *data_ac, 2469 struct ocfs2_alloc_context *meta_ac, 2470 struct buffer_head **dx_leaves, 2471 int num_dx_leaves) 2472 { 2473 int ret; 2474 u64 phys_blkno; 2475 2476 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, 2477 num_dx_leaves, &phys_blkno); 2478 if (ret) { 2479 mlog_errno(ret); 2480 goto out; 2481 } 2482 2483 ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, 2484 meta_ac); 2485 if (ret) 2486 mlog_errno(ret); 2487 out: 2488 return ret; 2489 } 2490 2491 static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, 2492 int *ret_num_leaves) 2493 { 2494 int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); 2495 struct buffer_head **dx_leaves; 2496 2497 dx_leaves = 
kcalloc(num_dx_leaves, sizeof(struct buffer_head *), 2498 GFP_NOFS); 2499 if (dx_leaves && ret_num_leaves) 2500 *ret_num_leaves = num_dx_leaves; 2501 2502 return dx_leaves; 2503 } 2504 2505 static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, 2506 handle_t *handle, 2507 struct inode *parent, 2508 struct inode *inode, 2509 struct buffer_head *di_bh, 2510 struct ocfs2_alloc_context *data_ac, 2511 struct ocfs2_alloc_context *meta_ac) 2512 { 2513 int ret; 2514 struct buffer_head *leaf_bh = NULL; 2515 struct buffer_head *dx_root_bh = NULL; 2516 struct ocfs2_dx_hinfo hinfo; 2517 struct ocfs2_dx_root_block *dx_root; 2518 struct ocfs2_dx_entry_list *entry_list; 2519 2520 /* 2521 * Our strategy is to create the directory as though it were 2522 * unindexed, then add the index block. This works with very 2523 * little complication since the state of a new directory is a 2524 * very well known quantity. 2525 * 2526 * Essentially, we have two dirents ("." and ".."), in the 1st 2527 * block which need indexing. These are easily inserted into 2528 * the index block. 
2529 */ 2530 2531 ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, 2532 data_ac, &leaf_bh); 2533 if (ret) { 2534 mlog_errno(ret); 2535 goto out; 2536 } 2537 2538 ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, 2539 meta_ac, 1, 2, &dx_root_bh); 2540 if (ret) { 2541 mlog_errno(ret); 2542 goto out; 2543 } 2544 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2545 entry_list = &dx_root->dr_entries; 2546 2547 /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ 2548 ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); 2549 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); 2550 2551 ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); 2552 ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); 2553 2554 out: 2555 brelse(dx_root_bh); 2556 brelse(leaf_bh); 2557 return ret; 2558 } 2559 2560 int ocfs2_fill_new_dir(struct ocfs2_super *osb, 2561 handle_t *handle, 2562 struct inode *parent, 2563 struct inode *inode, 2564 struct buffer_head *fe_bh, 2565 struct ocfs2_alloc_context *data_ac, 2566 struct ocfs2_alloc_context *meta_ac) 2567 2568 { 2569 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); 2570 2571 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2572 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); 2573 2574 if (ocfs2_supports_indexed_dirs(osb)) 2575 return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, 2576 data_ac, meta_ac); 2577 2578 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, 2579 data_ac, NULL); 2580 } 2581 2582 static int ocfs2_dx_dir_index_block(struct inode *dir, 2583 handle_t *handle, 2584 struct buffer_head **dx_leaves, 2585 int num_dx_leaves, 2586 u32 *num_dx_entries, 2587 struct buffer_head *dirent_bh) 2588 { 2589 int ret = 0, namelen, i; 2590 char *de_buf, *limit; 2591 struct ocfs2_dir_entry *de; 2592 struct buffer_head *dx_leaf_bh; 2593 struct ocfs2_dx_hinfo hinfo; 2594 u64 dirent_blk = dirent_bh->b_blocknr; 
2595 2596 de_buf = dirent_bh->b_data; 2597 limit = de_buf + dir->i_sb->s_blocksize; 2598 2599 while (de_buf < limit) { 2600 de = (struct ocfs2_dir_entry *)de_buf; 2601 2602 namelen = de->name_len; 2603 if (!namelen || !de->inode) 2604 goto inc; 2605 2606 ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); 2607 2608 i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); 2609 dx_leaf_bh = dx_leaves[i]; 2610 2611 ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, 2612 dirent_blk, dx_leaf_bh); 2613 if (ret) { 2614 mlog_errno(ret); 2615 goto out; 2616 } 2617 2618 *num_dx_entries = *num_dx_entries + 1; 2619 2620 inc: 2621 de_buf += le16_to_cpu(de->rec_len); 2622 } 2623 2624 out: 2625 return ret; 2626 } 2627 2628 /* 2629 * XXX: This expects dx_root_bh to already be part of the transaction. 2630 */ 2631 static void ocfs2_dx_dir_index_root_block(struct inode *dir, 2632 struct buffer_head *dx_root_bh, 2633 struct buffer_head *dirent_bh) 2634 { 2635 char *de_buf, *limit; 2636 struct ocfs2_dx_root_block *dx_root; 2637 struct ocfs2_dir_entry *de; 2638 struct ocfs2_dx_hinfo hinfo; 2639 u64 dirent_blk = dirent_bh->b_blocknr; 2640 2641 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2642 2643 de_buf = dirent_bh->b_data; 2644 limit = de_buf + dir->i_sb->s_blocksize; 2645 2646 while (de_buf < limit) { 2647 de = (struct ocfs2_dir_entry *)de_buf; 2648 2649 if (!de->name_len || !de->inode) 2650 goto inc; 2651 2652 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); 2653 2654 trace_ocfs2_dx_dir_index_root_block( 2655 (unsigned long long)dir->i_ino, 2656 hinfo.major_hash, hinfo.minor_hash, 2657 de->name_len, de->name, 2658 le16_to_cpu(dx_root->dr_entries.de_num_used)); 2659 2660 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, 2661 dirent_blk); 2662 2663 le32_add_cpu(&dx_root->dr_num_entries, 1); 2664 inc: 2665 de_buf += le16_to_cpu(de->rec_len); 2666 } 2667 } 2668 2669 /* 2670 * Count the number of inline directory entries in di_bh and compare 2671 * 
them against the number of entries we can hold in an inline dx root 2672 * block. 2673 */ 2674 static int ocfs2_new_dx_should_be_inline(struct inode *dir, 2675 struct buffer_head *di_bh) 2676 { 2677 int dirent_count = 0; 2678 char *de_buf, *limit; 2679 struct ocfs2_dir_entry *de; 2680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2681 2682 de_buf = di->id2.i_data.id_data; 2683 limit = de_buf + i_size_read(dir); 2684 2685 while (de_buf < limit) { 2686 de = (struct ocfs2_dir_entry *)de_buf; 2687 2688 if (de->name_len && de->inode) 2689 dirent_count++; 2690 2691 de_buf += le16_to_cpu(de->rec_len); 2692 } 2693 2694 /* We are careful to leave room for one extra record. */ 2695 return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); 2696 } 2697 2698 /* 2699 * Expand rec_len of the rightmost dirent in a directory block so that it 2700 * contains the end of our valid space for dirents. We do this during 2701 * expansion from an inline directory to one with extents. The first dir block 2702 * in that case is taken from the inline data portion of the inode block. 2703 * 2704 * This will also return the largest amount of contiguous space for a dirent 2705 * in the block. That value is *not* necessarily the last dirent, even after 2706 * expansion. The directory indexing code wants this value for free space 2707 * accounting. We do this here since we're already walking the entire dir 2708 * block. 2709 * 2710 * We add the dir trailer if this filesystem wants it. 
2711 */ 2712 static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, 2713 struct inode *dir) 2714 { 2715 struct super_block *sb = dir->i_sb; 2716 struct ocfs2_dir_entry *de; 2717 struct ocfs2_dir_entry *prev_de; 2718 char *de_buf, *limit; 2719 unsigned int new_size = sb->s_blocksize; 2720 unsigned int bytes, this_hole; 2721 unsigned int largest_hole = 0; 2722 2723 if (ocfs2_new_dir_wants_trailer(dir)) 2724 new_size = ocfs2_dir_trailer_blk_off(sb); 2725 2726 bytes = new_size - old_size; 2727 2728 limit = start + old_size; 2729 de_buf = start; 2730 de = (struct ocfs2_dir_entry *)de_buf; 2731 do { 2732 this_hole = ocfs2_figure_dirent_hole(de); 2733 if (this_hole > largest_hole) 2734 largest_hole = this_hole; 2735 2736 prev_de = de; 2737 de_buf += le16_to_cpu(de->rec_len); 2738 de = (struct ocfs2_dir_entry *)de_buf; 2739 } while (de_buf < limit); 2740 2741 le16_add_cpu(&prev_de->rec_len, bytes); 2742 2743 /* We need to double check this after modification of the final 2744 * dirent. */ 2745 this_hole = ocfs2_figure_dirent_hole(prev_de); 2746 if (this_hole > largest_hole) 2747 largest_hole = this_hole; 2748 2749 if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) 2750 return largest_hole; 2751 return 0; 2752 } 2753 2754 /* 2755 * We allocate enough clusters to fulfill "blocks_wanted", but set 2756 * i_size to exactly one block. Ocfs2_extend_dir() will handle the 2757 * rest automatically for us. 2758 * 2759 * *first_block_bh is a pointer to the 1st data block allocated to the 2760 * directory. 
2761 */ 2762 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, 2763 unsigned int blocks_wanted, 2764 struct ocfs2_dir_lookup_result *lookup, 2765 struct buffer_head **first_block_bh) 2766 { 2767 u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; 2768 struct super_block *sb = dir->i_sb; 2769 int ret, i, num_dx_leaves = 0, dx_inline = 0, 2770 credits = ocfs2_inline_to_extents_credits(sb); 2771 u64 dx_insert_blkno, blkno, 2772 bytes = blocks_wanted << sb->s_blocksize_bits; 2773 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2774 struct ocfs2_inode_info *oi = OCFS2_I(dir); 2775 struct ocfs2_alloc_context *data_ac = NULL; 2776 struct ocfs2_alloc_context *meta_ac = NULL; 2777 struct buffer_head *dirdata_bh = NULL; 2778 struct buffer_head *dx_root_bh = NULL; 2779 struct buffer_head **dx_leaves = NULL; 2780 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2781 handle_t *handle; 2782 struct ocfs2_extent_tree et; 2783 struct ocfs2_extent_tree dx_et; 2784 int did_quota = 0, bytes_allocated = 0; 2785 2786 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); 2787 2788 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2789 dx_alloc = 0; 2790 2791 down_write(&oi->ip_alloc_sem); 2792 2793 if (ocfs2_supports_indexed_dirs(osb)) { 2794 credits += ocfs2_add_dir_index_credits(sb); 2795 2796 dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); 2797 if (!dx_inline) { 2798 /* Add one more cluster for an index leaf */ 2799 dx_alloc++; 2800 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, 2801 &num_dx_leaves); 2802 if (!dx_leaves) { 2803 ret = -ENOMEM; 2804 mlog_errno(ret); 2805 goto out; 2806 } 2807 } 2808 2809 /* This gets us the dx_root */ 2810 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 2811 if (ret) { 2812 mlog_errno(ret); 2813 goto out; 2814 } 2815 } 2816 2817 /* 2818 * We should never need more than 2 clusters for the unindexed 2819 * tree - maximum dirent size is far less than one block. 
In 2820 * fact, the only time we'd need more than one cluster is if 2821 * blocksize == clustersize and the dirent won't fit in the 2822 * extra space that the expansion to a single block gives. As 2823 * of today, that only happens on 4k/4k file systems. 2824 */ 2825 BUG_ON(alloc > 2); 2826 2827 ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); 2828 if (ret) { 2829 mlog_errno(ret); 2830 goto out; 2831 } 2832 2833 /* 2834 * Prepare for worst case allocation scenario of two separate 2835 * extents in the unindexed tree. 2836 */ 2837 if (alloc == 2) 2838 credits += OCFS2_SUBALLOC_ALLOC; 2839 2840 handle = ocfs2_start_trans(osb, credits); 2841 if (IS_ERR(handle)) { 2842 ret = PTR_ERR(handle); 2843 mlog_errno(ret); 2844 goto out; 2845 } 2846 2847 ret = dquot_alloc_space_nodirty(dir, 2848 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); 2849 if (ret) 2850 goto out_commit; 2851 did_quota = 1; 2852 2853 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2854 /* 2855 * Allocate our index cluster first, to maximize the 2856 * possibility that unindexed leaves grow 2857 * contiguously. 2858 */ 2859 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, 2860 dx_leaves, num_dx_leaves, 2861 &dx_insert_blkno); 2862 if (ret) { 2863 mlog_errno(ret); 2864 goto out_commit; 2865 } 2866 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); 2867 } 2868 2869 /* 2870 * Try to claim as many clusters as the bitmap can give though 2871 * if we only get one now, that's enough to continue. The rest 2872 * will be claimed after the conversion to extents. 2873 */ 2874 if (ocfs2_dir_resv_allowed(osb)) 2875 data_ac->ac_resv = &oi->ip_la_data_resv; 2876 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); 2877 if (ret) { 2878 mlog_errno(ret); 2879 goto out_commit; 2880 } 2881 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); 2882 2883 /* 2884 * Operations are carefully ordered so that we set up the new 2885 * data block first. 
The conversion from inline data to 2886 * extents follows. 2887 */ 2888 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 2889 dirdata_bh = sb_getblk(sb, blkno); 2890 if (!dirdata_bh) { 2891 ret = -ENOMEM; 2892 mlog_errno(ret); 2893 goto out_commit; 2894 } 2895 2896 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); 2897 2898 ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, 2899 OCFS2_JOURNAL_ACCESS_CREATE); 2900 if (ret) { 2901 mlog_errno(ret); 2902 goto out_commit; 2903 } 2904 2905 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 2906 memset(dirdata_bh->b_data + i_size_read(dir), 0, 2907 sb->s_blocksize - i_size_read(dir)); 2908 i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir); 2909 if (ocfs2_new_dir_wants_trailer(dir)) { 2910 /* 2911 * Prepare the dir trailer up front. It will otherwise look 2912 * like a valid dirent. Even if inserting the index fails 2913 * (unlikely), then all we'll have done is given first dir 2914 * block a small amount of fragmentation. 2915 */ 2916 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 2917 } 2918 2919 ocfs2_update_inode_fsync_trans(handle, dir, 1); 2920 ocfs2_journal_dirty(handle, dirdata_bh); 2921 2922 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2923 /* 2924 * Dx dirs with an external cluster need to do this up 2925 * front. Inline dx root's get handled later, after 2926 * we've allocated our root block. We get passed back 2927 * a total number of items so that dr_num_entries can 2928 * be correctly set once the dx_root has been 2929 * allocated. 2930 */ 2931 ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves, 2932 num_dx_leaves, &num_dx_entries, 2933 dirdata_bh); 2934 if (ret) { 2935 mlog_errno(ret); 2936 goto out_commit; 2937 } 2938 } 2939 2940 /* 2941 * Set extent, i_size, etc on the directory. After this, the 2942 * inode should contain the same exact dirents as before and 2943 * be fully accessible from system calls. 
2944 * 2945 * We let the later dirent insert modify c/mtime - to the user 2946 * the data hasn't changed. 2947 */ 2948 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2949 OCFS2_JOURNAL_ACCESS_CREATE); 2950 if (ret) { 2951 mlog_errno(ret); 2952 goto out_commit; 2953 } 2954 2955 spin_lock(&oi->ip_lock); 2956 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; 2957 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2958 spin_unlock(&oi->ip_lock); 2959 2960 ocfs2_dinode_new_extent_list(dir, di); 2961 2962 i_size_write(dir, sb->s_blocksize); 2963 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 2964 2965 di->i_size = cpu_to_le64(sb->s_blocksize); 2966 di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir)); 2967 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir)); 2968 ocfs2_update_inode_fsync_trans(handle, dir, 1); 2969 2970 /* 2971 * This should never fail as our extent list is empty and all 2972 * related blocks have been journaled already. 2973 */ 2974 ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, 2975 0, NULL); 2976 if (ret) { 2977 mlog_errno(ret); 2978 goto out_commit; 2979 } 2980 2981 /* 2982 * Set i_blocks after the extent insert for the most up to 2983 * date ip_clusters value. 
2984 */ 2985 dir->i_blocks = ocfs2_inode_sector_count(dir); 2986 2987 ocfs2_journal_dirty(handle, di_bh); 2988 2989 if (ocfs2_supports_indexed_dirs(osb)) { 2990 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 2991 dirdata_bh, meta_ac, dx_inline, 2992 num_dx_entries, &dx_root_bh); 2993 if (ret) { 2994 mlog_errno(ret); 2995 goto out_commit; 2996 } 2997 2998 if (dx_inline) { 2999 ocfs2_dx_dir_index_root_block(dir, dx_root_bh, 3000 dirdata_bh); 3001 } else { 3002 ocfs2_init_dx_root_extent_tree(&dx_et, 3003 INODE_CACHE(dir), 3004 dx_root_bh); 3005 ret = ocfs2_insert_extent(handle, &dx_et, 0, 3006 dx_insert_blkno, 1, 0, NULL); 3007 if (ret) 3008 mlog_errno(ret); 3009 } 3010 } 3011 3012 /* 3013 * We asked for two clusters, but only got one in the 1st 3014 * pass. Claim the 2nd cluster as a separate extent. 3015 */ 3016 if (alloc > len) { 3017 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, 3018 &len); 3019 if (ret) { 3020 mlog_errno(ret); 3021 goto out_commit; 3022 } 3023 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 3024 3025 ret = ocfs2_insert_extent(handle, &et, 1, 3026 blkno, len, 0, NULL); 3027 if (ret) { 3028 mlog_errno(ret); 3029 goto out_commit; 3030 } 3031 bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); 3032 } 3033 3034 *first_block_bh = dirdata_bh; 3035 dirdata_bh = NULL; 3036 if (ocfs2_supports_indexed_dirs(osb)) { 3037 unsigned int off; 3038 3039 if (!dx_inline) { 3040 /* 3041 * We need to return the correct block within the 3042 * cluster which should hold our entry. 
3043 */ 3044 off = ocfs2_dx_dir_hash_idx(osb, 3045 &lookup->dl_hinfo); 3046 get_bh(dx_leaves[off]); 3047 lookup->dl_dx_leaf_bh = dx_leaves[off]; 3048 } 3049 lookup->dl_dx_root_bh = dx_root_bh; 3050 dx_root_bh = NULL; 3051 } 3052 3053 out_commit: 3054 if (ret < 0 && did_quota) 3055 dquot_free_space_nodirty(dir, bytes_allocated); 3056 3057 ocfs2_commit_trans(osb, handle); 3058 3059 out: 3060 up_write(&oi->ip_alloc_sem); 3061 if (data_ac) 3062 ocfs2_free_alloc_context(data_ac); 3063 if (meta_ac) 3064 ocfs2_free_alloc_context(meta_ac); 3065 3066 if (dx_leaves) { 3067 for (i = 0; i < num_dx_leaves; i++) 3068 brelse(dx_leaves[i]); 3069 kfree(dx_leaves); 3070 } 3071 3072 brelse(dirdata_bh); 3073 brelse(dx_root_bh); 3074 3075 return ret; 3076 } 3077 3078 /* returns a bh of the 1st new block in the allocation. */ 3079 static int ocfs2_do_extend_dir(struct super_block *sb, 3080 handle_t *handle, 3081 struct inode *dir, 3082 struct buffer_head *parent_fe_bh, 3083 struct ocfs2_alloc_context *data_ac, 3084 struct ocfs2_alloc_context *meta_ac, 3085 struct buffer_head **new_bh) 3086 { 3087 int status; 3088 int extend, did_quota = 0; 3089 u64 p_blkno, v_blkno; 3090 3091 spin_lock(&OCFS2_I(dir)->ip_lock); 3092 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); 3093 spin_unlock(&OCFS2_I(dir)->ip_lock); 3094 3095 if (extend) { 3096 u32 offset = OCFS2_I(dir)->ip_clusters; 3097 3098 status = dquot_alloc_space_nodirty(dir, 3099 ocfs2_clusters_to_bytes(sb, 1)); 3100 if (status) 3101 goto bail; 3102 did_quota = 1; 3103 3104 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3105 1, 0, parent_fe_bh, handle, 3106 data_ac, meta_ac, NULL); 3107 BUG_ON(status == -EAGAIN); 3108 if (status < 0) { 3109 mlog_errno(status); 3110 goto bail; 3111 } 3112 } 3113 3114 v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); 3115 status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); 3116 if (status < 0) { 3117 mlog_errno(status); 3118 goto 
bail; 3119 } 3120 3121 *new_bh = sb_getblk(sb, p_blkno); 3122 if (!*new_bh) { 3123 status = -ENOMEM; 3124 mlog_errno(status); 3125 goto bail; 3126 } 3127 status = 0; 3128 bail: 3129 if (did_quota && status < 0) 3130 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3131 return status; 3132 } 3133 3134 /* 3135 * Assumes you already have a cluster lock on the directory. 3136 * 3137 * 'blocks_wanted' is only used if we have an inline directory which 3138 * is to be turned into an extent based one. The size of the dirent to 3139 * insert might be larger than the space gained by growing to just one 3140 * block, so we may have to grow the inode by two blocks in that case. 3141 * 3142 * If the directory is already indexed, dx_root_bh must be provided. 3143 */ 3144 static int ocfs2_extend_dir(struct ocfs2_super *osb, 3145 struct inode *dir, 3146 struct buffer_head *parent_fe_bh, 3147 unsigned int blocks_wanted, 3148 struct ocfs2_dir_lookup_result *lookup, 3149 struct buffer_head **new_de_bh) 3150 { 3151 int status = 0; 3152 int credits, num_free_extents, drop_alloc_sem = 0; 3153 loff_t dir_i_size; 3154 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 3155 struct ocfs2_extent_list *el = &fe->id2.i_list; 3156 struct ocfs2_alloc_context *data_ac = NULL; 3157 struct ocfs2_alloc_context *meta_ac = NULL; 3158 handle_t *handle = NULL; 3159 struct buffer_head *new_bh = NULL; 3160 struct ocfs2_dir_entry * de; 3161 struct super_block *sb = osb->sb; 3162 struct ocfs2_extent_tree et; 3163 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 3164 3165 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3166 /* 3167 * This would be a code error as an inline directory should 3168 * never have an index root. 
3169 */ 3170 BUG_ON(dx_root_bh); 3171 3172 status = ocfs2_expand_inline_dir(dir, parent_fe_bh, 3173 blocks_wanted, lookup, 3174 &new_bh); 3175 if (status) { 3176 mlog_errno(status); 3177 goto bail; 3178 } 3179 3180 /* Expansion from inline to an indexed directory will 3181 * have given us this. */ 3182 dx_root_bh = lookup->dl_dx_root_bh; 3183 3184 if (blocks_wanted == 1) { 3185 /* 3186 * If the new dirent will fit inside the space 3187 * created by pushing out to one block, then 3188 * we can complete the operation 3189 * here. Otherwise we have to expand i_size 3190 * and format the 2nd block below. 3191 */ 3192 BUG_ON(new_bh == NULL); 3193 goto bail_bh; 3194 } 3195 3196 /* 3197 * Get rid of 'new_bh' - we want to format the 2nd 3198 * data block and return that instead. 3199 */ 3200 brelse(new_bh); 3201 new_bh = NULL; 3202 3203 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3204 drop_alloc_sem = 1; 3205 dir_i_size = i_size_read(dir); 3206 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 3207 goto do_extend; 3208 } 3209 3210 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3211 drop_alloc_sem = 1; 3212 dir_i_size = i_size_read(dir); 3213 trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, 3214 dir_i_size); 3215 3216 /* dir->i_size is always block aligned. 
*/ 3217 spin_lock(&OCFS2_I(dir)->ip_lock); 3218 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 3219 spin_unlock(&OCFS2_I(dir)->ip_lock); 3220 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), 3221 parent_fe_bh); 3222 num_free_extents = ocfs2_num_free_extents(&et); 3223 if (num_free_extents < 0) { 3224 status = num_free_extents; 3225 mlog_errno(status); 3226 goto bail; 3227 } 3228 3229 if (!num_free_extents) { 3230 status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); 3231 if (status < 0) { 3232 if (status != -ENOSPC) 3233 mlog_errno(status); 3234 goto bail; 3235 } 3236 } 3237 3238 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 3239 if (status < 0) { 3240 if (status != -ENOSPC) 3241 mlog_errno(status); 3242 goto bail; 3243 } 3244 3245 if (ocfs2_dir_resv_allowed(osb)) 3246 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; 3247 3248 credits = ocfs2_calc_extend_credits(sb, el); 3249 } else { 3250 spin_unlock(&OCFS2_I(dir)->ip_lock); 3251 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 3252 } 3253 3254 do_extend: 3255 if (ocfs2_dir_indexed(dir)) 3256 credits++; /* For attaching the new dirent block to the 3257 * dx_root */ 3258 3259 handle = ocfs2_start_trans(osb, credits); 3260 if (IS_ERR(handle)) { 3261 status = PTR_ERR(handle); 3262 handle = NULL; 3263 mlog_errno(status); 3264 goto bail; 3265 } 3266 3267 status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, 3268 data_ac, meta_ac, &new_bh); 3269 if (status < 0) { 3270 mlog_errno(status); 3271 goto bail; 3272 } 3273 3274 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); 3275 3276 status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, 3277 OCFS2_JOURNAL_ACCESS_CREATE); 3278 if (status < 0) { 3279 mlog_errno(status); 3280 goto bail; 3281 } 3282 memset(new_bh->b_data, 0, sb->s_blocksize); 3283 3284 de = (struct ocfs2_dir_entry *) new_bh->b_data; 3285 de->inode = 0; 3286 if (ocfs2_supports_dir_trailer(dir)) { 3287 de->rec_len = 
cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); 3288 3289 ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); 3290 3291 if (ocfs2_dir_indexed(dir)) { 3292 status = ocfs2_dx_dir_link_trailer(dir, handle, 3293 dx_root_bh, new_bh); 3294 if (status) { 3295 mlog_errno(status); 3296 goto bail; 3297 } 3298 } 3299 } else { 3300 de->rec_len = cpu_to_le16(sb->s_blocksize); 3301 } 3302 ocfs2_update_inode_fsync_trans(handle, dir, 1); 3303 ocfs2_journal_dirty(handle, new_bh); 3304 3305 dir_i_size += dir->i_sb->s_blocksize; 3306 i_size_write(dir, dir_i_size); 3307 dir->i_blocks = ocfs2_inode_sector_count(dir); 3308 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 3309 if (status < 0) { 3310 mlog_errno(status); 3311 goto bail; 3312 } 3313 3314 bail_bh: 3315 *new_de_bh = new_bh; 3316 get_bh(*new_de_bh); 3317 bail: 3318 if (handle) 3319 ocfs2_commit_trans(osb, handle); 3320 if (drop_alloc_sem) 3321 up_write(&OCFS2_I(dir)->ip_alloc_sem); 3322 3323 if (data_ac) 3324 ocfs2_free_alloc_context(data_ac); 3325 if (meta_ac) 3326 ocfs2_free_alloc_context(meta_ac); 3327 3328 brelse(new_bh); 3329 3330 return status; 3331 } 3332 3333 static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, 3334 const char *name, int namelen, 3335 struct buffer_head **ret_de_bh, 3336 unsigned int *blocks_wanted) 3337 { 3338 int ret; 3339 struct super_block *sb = dir->i_sb; 3340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3341 struct ocfs2_dir_entry *de, *last_de = NULL; 3342 char *de_buf, *limit; 3343 unsigned long offset = 0; 3344 unsigned int rec_len, new_rec_len, free_space; 3345 3346 /* 3347 * This calculates how many free bytes we'd have in block zero, should 3348 * this function force expansion to an extent tree. 
3349 */ 3350 if (ocfs2_new_dir_wants_trailer(dir)) 3351 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); 3352 else 3353 free_space = dir->i_sb->s_blocksize - i_size_read(dir); 3354 3355 de_buf = di->id2.i_data.id_data; 3356 limit = de_buf + i_size_read(dir); 3357 rec_len = OCFS2_DIR_REC_LEN(namelen); 3358 3359 while (de_buf < limit) { 3360 de = (struct ocfs2_dir_entry *)de_buf; 3361 3362 if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) { 3363 ret = -ENOENT; 3364 goto out; 3365 } 3366 if (ocfs2_match(namelen, name, de)) { 3367 ret = -EEXIST; 3368 goto out; 3369 } 3370 /* 3371 * No need to check for a trailing dirent record here as 3372 * they're not used for inline dirs. 3373 */ 3374 3375 if (ocfs2_dirent_would_fit(de, rec_len)) { 3376 /* Ok, we found a spot. Return this bh and let 3377 * the caller actually fill it in. */ 3378 *ret_de_bh = di_bh; 3379 get_bh(*ret_de_bh); 3380 ret = 0; 3381 goto out; 3382 } 3383 3384 last_de = de; 3385 de_buf += le16_to_cpu(de->rec_len); 3386 offset += le16_to_cpu(de->rec_len); 3387 } 3388 3389 /* 3390 * We're going to require expansion of the directory - figure 3391 * out how many blocks we'll need so that a place for the 3392 * dirent can be found. 
3393 */ 3394 *blocks_wanted = 1; 3395 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; 3396 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) 3397 *blocks_wanted = 2; 3398 3399 ret = -ENOSPC; 3400 out: 3401 return ret; 3402 } 3403 3404 static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, 3405 int namelen, struct buffer_head **ret_de_bh) 3406 { 3407 unsigned long offset; 3408 struct buffer_head *bh = NULL; 3409 unsigned short rec_len; 3410 struct ocfs2_dir_entry *de; 3411 struct super_block *sb = dir->i_sb; 3412 int status; 3413 int blocksize = dir->i_sb->s_blocksize; 3414 3415 status = ocfs2_read_dir_block(dir, 0, &bh, 0); 3416 if (status) 3417 goto bail; 3418 3419 rec_len = OCFS2_DIR_REC_LEN(namelen); 3420 offset = 0; 3421 de = (struct ocfs2_dir_entry *) bh->b_data; 3422 while (1) { 3423 if ((char *)de >= sb->s_blocksize + bh->b_data) { 3424 brelse(bh); 3425 bh = NULL; 3426 3427 if (i_size_read(dir) <= offset) { 3428 /* 3429 * Caller will have to expand this 3430 * directory. 3431 */ 3432 status = -ENOSPC; 3433 goto bail; 3434 } 3435 status = ocfs2_read_dir_block(dir, 3436 offset >> sb->s_blocksize_bits, 3437 &bh, 0); 3438 if (status) 3439 goto bail; 3440 3441 /* move to next block */ 3442 de = (struct ocfs2_dir_entry *) bh->b_data; 3443 } 3444 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { 3445 status = -ENOENT; 3446 goto bail; 3447 } 3448 if (ocfs2_match(namelen, name, de)) { 3449 status = -EEXIST; 3450 goto bail; 3451 } 3452 3453 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, 3454 blocksize)) 3455 goto next; 3456 3457 if (ocfs2_dirent_would_fit(de, rec_len)) { 3458 /* Ok, we found a spot. Return this bh and let 3459 * the caller actually fill it in. 
*/ 3460 *ret_de_bh = bh; 3461 get_bh(*ret_de_bh); 3462 status = 0; 3463 goto bail; 3464 } 3465 next: 3466 offset += le16_to_cpu(de->rec_len); 3467 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 3468 } 3469 3470 bail: 3471 brelse(bh); 3472 if (status) 3473 mlog_errno(status); 3474 3475 return status; 3476 } 3477 3478 static int dx_leaf_sort_cmp(const void *a, const void *b) 3479 { 3480 const struct ocfs2_dx_entry *entry1 = a; 3481 const struct ocfs2_dx_entry *entry2 = b; 3482 u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash); 3483 u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash); 3484 u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash); 3485 u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash); 3486 3487 if (major_hash1 > major_hash2) 3488 return 1; 3489 if (major_hash1 < major_hash2) 3490 return -1; 3491 3492 /* 3493 * It is not strictly necessary to sort by minor 3494 */ 3495 if (minor_hash1 > minor_hash2) 3496 return 1; 3497 if (minor_hash1 < minor_hash2) 3498 return -1; 3499 return 0; 3500 } 3501 3502 static void dx_leaf_sort_swap(void *a, void *b, int size) 3503 { 3504 struct ocfs2_dx_entry *entry1 = a; 3505 struct ocfs2_dx_entry *entry2 = b; 3506 3507 BUG_ON(size != sizeof(*entry1)); 3508 3509 swap(*entry1, *entry2); 3510 } 3511 3512 static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) 3513 { 3514 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; 3515 int i, num = le16_to_cpu(dl_list->de_num_used); 3516 3517 for (i = 0; i < (num - 1); i++) { 3518 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) != 3519 le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash)) 3520 return 0; 3521 } 3522 3523 return 1; 3524 } 3525 3526 /* 3527 * Find the optimal value to split this leaf on. This expects the leaf 3528 * entries to be in sorted order. 3529 * 3530 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is 3531 * the hash we want to insert. 
3532 * 3533 * This function is only concerned with the major hash - that which 3534 * determines which cluster an item belongs to. 3535 */ 3536 static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf, 3537 u32 leaf_cpos, u32 insert_hash, 3538 u32 *split_hash) 3539 { 3540 struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list; 3541 int i, num_used = le16_to_cpu(dl_list->de_num_used); 3542 int allsame; 3543 3544 /* 3545 * There's a couple rare, but nasty corner cases we have to 3546 * check for here. All of them involve a leaf where all value 3547 * have the same hash, which is what we look for first. 3548 * 3549 * Most of the time, all of the above is false, and we simply 3550 * pick the median value for a split. 3551 */ 3552 allsame = ocfs2_dx_leaf_same_major(dx_leaf); 3553 if (allsame) { 3554 u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash); 3555 3556 if (val == insert_hash) { 3557 /* 3558 * No matter where we would choose to split, 3559 * the new entry would want to occupy the same 3560 * block as these. Since there's no space left 3561 * in their existing block, we know there 3562 * won't be space after the split. 3563 */ 3564 return -ENOSPC; 3565 } 3566 3567 if (val == leaf_cpos) { 3568 /* 3569 * Because val is the same as leaf_cpos (which 3570 * is the smallest value this leaf can have), 3571 * yet is not equal to insert_hash, then we 3572 * know that insert_hash *must* be larger than 3573 * val (and leaf_cpos). At least cpos+1 in value. 3574 * 3575 * We also know then, that there cannot be an 3576 * adjacent extent (otherwise we'd be looking 3577 * at it). Choosing this value gives us a 3578 * chance to get some contiguousness. 3579 */ 3580 *split_hash = leaf_cpos + 1; 3581 return 0; 3582 } 3583 3584 if (val > insert_hash) { 3585 /* 3586 * val can not be the same as insert hash, and 3587 * also must be larger than leaf_cpos. 
Also, 3588 * we know that there can't be a leaf between 3589 * cpos and val, otherwise the entries with 3590 * hash 'val' would be there. 3591 */ 3592 *split_hash = val; 3593 return 0; 3594 } 3595 3596 *split_hash = insert_hash; 3597 return 0; 3598 } 3599 3600 /* 3601 * Since the records are sorted and the checks above 3602 * guaranteed that not all records in this block are the same, 3603 * we simple travel forward, from the median, and pick the 1st 3604 * record whose value is larger than leaf_cpos. 3605 */ 3606 for (i = (num_used / 2); i < num_used; i++) 3607 if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > 3608 leaf_cpos) 3609 break; 3610 3611 BUG_ON(i == num_used); /* Should be impossible */ 3612 *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); 3613 return 0; 3614 } 3615 3616 /* 3617 * Transfer all entries in orig_dx_leaves whose major hash is equal to or 3618 * larger than split_hash into new_dx_leaves. We use a temporary 3619 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. 3620 * 3621 * Since the block offset inside a leaf (cluster) is a constant mask 3622 * of minor_hash, we can optimize - an item at block offset X within 3623 * the original cluster, will be at offset X within the new cluster. 
3624 */ 3625 static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, 3626 handle_t *handle, 3627 struct ocfs2_dx_leaf *tmp_dx_leaf, 3628 struct buffer_head **orig_dx_leaves, 3629 struct buffer_head **new_dx_leaves, 3630 int num_dx_leaves) 3631 { 3632 int i, j, num_used; 3633 u32 major_hash; 3634 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; 3635 struct ocfs2_dx_entry_list *orig_list, *tmp_list; 3636 struct ocfs2_dx_entry *dx_entry; 3637 3638 tmp_list = &tmp_dx_leaf->dl_list; 3639 3640 for (i = 0; i < num_dx_leaves; i++) { 3641 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; 3642 orig_list = &orig_dx_leaf->dl_list; 3643 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; 3644 3645 num_used = le16_to_cpu(orig_list->de_num_used); 3646 3647 memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); 3648 tmp_list->de_num_used = cpu_to_le16(0); 3649 memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); 3650 3651 for (j = 0; j < num_used; j++) { 3652 dx_entry = &orig_list->de_entries[j]; 3653 major_hash = le32_to_cpu(dx_entry->dx_major_hash); 3654 if (major_hash >= split_hash) 3655 ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, 3656 dx_entry); 3657 else 3658 ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, 3659 dx_entry); 3660 } 3661 memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); 3662 3663 ocfs2_journal_dirty(handle, orig_dx_leaves[i]); 3664 ocfs2_journal_dirty(handle, new_dx_leaves[i]); 3665 } 3666 } 3667 3668 static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, 3669 struct ocfs2_dx_root_block *dx_root) 3670 { 3671 int credits = ocfs2_clusters_to_blocks(osb->sb, 3); 3672 3673 credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list); 3674 credits += ocfs2_quota_trans_credits(osb->sb); 3675 return credits; 3676 } 3677 3678 /* 3679 * Find the median value in dx_leaf_bh and allocate a new leaf to move 3680 * half our entries into. 
3681 */ 3682 static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, 3683 struct buffer_head *dx_root_bh, 3684 struct buffer_head *dx_leaf_bh, 3685 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, 3686 u64 leaf_blkno) 3687 { 3688 struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 3689 int credits, ret, i, num_used, did_quota = 0; 3690 u32 cpos, split_hash, insert_hash = hinfo->major_hash; 3691 u64 orig_leaves_start; 3692 int num_dx_leaves; 3693 struct buffer_head **orig_dx_leaves = NULL; 3694 struct buffer_head **new_dx_leaves = NULL; 3695 struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; 3696 struct ocfs2_extent_tree et; 3697 handle_t *handle = NULL; 3698 struct ocfs2_dx_root_block *dx_root; 3699 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; 3700 3701 trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, 3702 (unsigned long long)leaf_blkno, 3703 insert_hash); 3704 3705 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); 3706 3707 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3708 /* 3709 * XXX: This is a rather large limit. We should use a more 3710 * realistic value. 
3711 */ 3712 if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) 3713 return -ENOSPC; 3714 3715 num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); 3716 if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { 3717 mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " 3718 "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, 3719 (unsigned long long)leaf_blkno, num_used); 3720 ret = -EIO; 3721 goto out; 3722 } 3723 3724 orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); 3725 if (!orig_dx_leaves) { 3726 ret = -ENOMEM; 3727 mlog_errno(ret); 3728 goto out; 3729 } 3730 3731 new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); 3732 if (!new_dx_leaves) { 3733 ret = -ENOMEM; 3734 mlog_errno(ret); 3735 goto out; 3736 } 3737 3738 ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); 3739 if (ret) { 3740 if (ret != -ENOSPC) 3741 mlog_errno(ret); 3742 goto out; 3743 } 3744 3745 credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); 3746 handle = ocfs2_start_trans(osb, credits); 3747 if (IS_ERR(handle)) { 3748 ret = PTR_ERR(handle); 3749 handle = NULL; 3750 mlog_errno(ret); 3751 goto out; 3752 } 3753 3754 ret = dquot_alloc_space_nodirty(dir, 3755 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3756 if (ret) 3757 goto out_commit; 3758 did_quota = 1; 3759 3760 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3761 OCFS2_JOURNAL_ACCESS_WRITE); 3762 if (ret) { 3763 mlog_errno(ret); 3764 goto out_commit; 3765 } 3766 3767 /* 3768 * This block is changing anyway, so we can sort it in place. 
3769 */ 3770 sort(dx_leaf->dl_list.de_entries, num_used, 3771 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3772 dx_leaf_sort_swap); 3773 3774 ocfs2_journal_dirty(handle, dx_leaf_bh); 3775 3776 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3777 &split_hash); 3778 if (ret) { 3779 mlog_errno(ret); 3780 goto out_commit; 3781 } 3782 3783 trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); 3784 3785 /* 3786 * We have to carefully order operations here. There are items 3787 * which want to be in the new cluster before insert, but in 3788 * order to put those items in the new cluster, we alter the 3789 * old cluster. A failure to insert gets nasty. 3790 * 3791 * So, start by reserving writes to the old 3792 * cluster. ocfs2_dx_dir_new_cluster will reserve writes on 3793 * the new cluster for us, before inserting it. The insert 3794 * won't happen if there's an error before that. Once the 3795 * insert is done then, we can transfer from one leaf into the 3796 * other without fear of hitting any error. 3797 */ 3798 3799 /* 3800 * The leaf transfer wants some scratch space so that we don't 3801 * wind up doing a bunch of expensive memmove(). 
3802 */ 3803 tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); 3804 if (!tmp_dx_leaf) { 3805 ret = -ENOMEM; 3806 mlog_errno(ret); 3807 goto out_commit; 3808 } 3809 3810 orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); 3811 ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, 3812 orig_dx_leaves); 3813 if (ret) { 3814 mlog_errno(ret); 3815 goto out_commit; 3816 } 3817 3818 cpos = split_hash; 3819 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3820 data_ac, meta_ac, new_dx_leaves, 3821 num_dx_leaves); 3822 if (ret) { 3823 mlog_errno(ret); 3824 goto out_commit; 3825 } 3826 3827 for (i = 0; i < num_dx_leaves; i++) { 3828 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3829 orig_dx_leaves[i], 3830 OCFS2_JOURNAL_ACCESS_WRITE); 3831 if (ret) { 3832 mlog_errno(ret); 3833 goto out_commit; 3834 } 3835 3836 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3837 new_dx_leaves[i], 3838 OCFS2_JOURNAL_ACCESS_WRITE); 3839 if (ret) { 3840 mlog_errno(ret); 3841 goto out_commit; 3842 } 3843 } 3844 3845 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3846 orig_dx_leaves, new_dx_leaves, num_dx_leaves); 3847 3848 out_commit: 3849 if (ret < 0 && did_quota) 3850 dquot_free_space_nodirty(dir, 3851 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3852 3853 ocfs2_update_inode_fsync_trans(handle, dir, 1); 3854 ocfs2_commit_trans(osb, handle); 3855 3856 out: 3857 if (orig_dx_leaves || new_dx_leaves) { 3858 for (i = 0; i < num_dx_leaves; i++) { 3859 if (orig_dx_leaves) 3860 brelse(orig_dx_leaves[i]); 3861 if (new_dx_leaves) 3862 brelse(new_dx_leaves[i]); 3863 } 3864 kfree(orig_dx_leaves); 3865 kfree(new_dx_leaves); 3866 } 3867 3868 if (meta_ac) 3869 ocfs2_free_alloc_context(meta_ac); 3870 if (data_ac) 3871 ocfs2_free_alloc_context(data_ac); 3872 3873 kfree(tmp_dx_leaf); 3874 return ret; 3875 } 3876 3877 static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, 3878 struct buffer_head *di_bh, 3879 struct 
buffer_head *dx_root_bh, 3880 const char *name, int namelen, 3881 struct ocfs2_dir_lookup_result *lookup) 3882 { 3883 int ret, rebalanced = 0; 3884 struct ocfs2_dx_root_block *dx_root; 3885 struct buffer_head *dx_leaf_bh = NULL; 3886 struct ocfs2_dx_leaf *dx_leaf; 3887 u64 blkno; 3888 u32 leaf_cpos; 3889 3890 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3891 3892 restart_search: 3893 ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, 3894 &leaf_cpos, &blkno); 3895 if (ret) { 3896 mlog_errno(ret); 3897 goto out; 3898 } 3899 3900 ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); 3901 if (ret) { 3902 mlog_errno(ret); 3903 goto out; 3904 } 3905 3906 dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; 3907 3908 if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= 3909 le16_to_cpu(dx_leaf->dl_list.de_count)) { 3910 if (rebalanced) { 3911 /* 3912 * Rebalancing should have provided us with 3913 * space in an appropriate leaf. 3914 * 3915 * XXX: Is this an abnormal condition then? 3916 * Should we print a message here? 3917 */ 3918 ret = -ENOSPC; 3919 goto out; 3920 } 3921 3922 ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, 3923 &lookup->dl_hinfo, leaf_cpos, 3924 blkno); 3925 if (ret) { 3926 if (ret != -ENOSPC) 3927 mlog_errno(ret); 3928 goto out; 3929 } 3930 3931 /* 3932 * Restart the lookup. The rebalance might have 3933 * changed which block our item fits into. Mark our 3934 * progress, so we only execute this once. 
3935 */ 3936 brelse(dx_leaf_bh); 3937 dx_leaf_bh = NULL; 3938 rebalanced = 1; 3939 goto restart_search; 3940 } 3941 3942 lookup->dl_dx_leaf_bh = dx_leaf_bh; 3943 dx_leaf_bh = NULL; 3944 3945 out: 3946 brelse(dx_leaf_bh); 3947 return ret; 3948 } 3949 3950 static int ocfs2_search_dx_free_list(struct inode *dir, 3951 struct buffer_head *dx_root_bh, 3952 int namelen, 3953 struct ocfs2_dir_lookup_result *lookup) 3954 { 3955 int ret = -ENOSPC; 3956 struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL; 3957 struct ocfs2_dir_block_trailer *db; 3958 u64 next_block; 3959 int rec_len = OCFS2_DIR_REC_LEN(namelen); 3960 struct ocfs2_dx_root_block *dx_root; 3961 3962 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3963 next_block = le64_to_cpu(dx_root->dr_free_blk); 3964 3965 while (next_block) { 3966 brelse(prev_leaf_bh); 3967 prev_leaf_bh = leaf_bh; 3968 leaf_bh = NULL; 3969 3970 ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh); 3971 if (ret) { 3972 mlog_errno(ret); 3973 goto out; 3974 } 3975 3976 db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); 3977 if (rec_len <= le16_to_cpu(db->db_free_rec_len)) { 3978 lookup->dl_leaf_bh = leaf_bh; 3979 lookup->dl_prev_leaf_bh = prev_leaf_bh; 3980 leaf_bh = NULL; 3981 prev_leaf_bh = NULL; 3982 break; 3983 } 3984 3985 next_block = le64_to_cpu(db->db_free_next); 3986 } 3987 3988 if (!next_block) 3989 ret = -ENOSPC; 3990 3991 out: 3992 3993 brelse(leaf_bh); 3994 brelse(prev_leaf_bh); 3995 return ret; 3996 } 3997 3998 static int ocfs2_expand_inline_dx_root(struct inode *dir, 3999 struct buffer_head *dx_root_bh) 4000 { 4001 int ret, num_dx_leaves, i, j, did_quota = 0; 4002 struct buffer_head **dx_leaves = NULL; 4003 struct ocfs2_extent_tree et; 4004 u64 insert_blkno; 4005 struct ocfs2_alloc_context *data_ac = NULL; 4006 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 4007 handle_t *handle = NULL; 4008 struct ocfs2_dx_root_block *dx_root; 4009 struct ocfs2_dx_entry_list *entry_list; 4010 struct ocfs2_dx_entry *dx_entry; 
4011 struct ocfs2_dx_leaf *target_leaf; 4012 4013 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 4014 if (ret) { 4015 mlog_errno(ret); 4016 goto out; 4017 } 4018 4019 dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); 4020 if (!dx_leaves) { 4021 ret = -ENOMEM; 4022 mlog_errno(ret); 4023 goto out; 4024 } 4025 4026 handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb)); 4027 if (IS_ERR(handle)) { 4028 ret = PTR_ERR(handle); 4029 mlog_errno(ret); 4030 goto out; 4031 } 4032 4033 ret = dquot_alloc_space_nodirty(dir, 4034 ocfs2_clusters_to_bytes(osb->sb, 1)); 4035 if (ret) 4036 goto out_commit; 4037 did_quota = 1; 4038 4039 /* 4040 * We do this up front, before the allocation, so that a 4041 * failure to add the dx_root_bh to the journal won't result 4042 * us losing clusters. 4043 */ 4044 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, 4045 OCFS2_JOURNAL_ACCESS_WRITE); 4046 if (ret) { 4047 mlog_errno(ret); 4048 goto out_commit; 4049 } 4050 4051 ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, 4052 num_dx_leaves, &insert_blkno); 4053 if (ret) { 4054 mlog_errno(ret); 4055 goto out_commit; 4056 } 4057 4058 /* 4059 * Transfer the entries from our dx_root into the appropriate 4060 * block 4061 */ 4062 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 4063 entry_list = &dx_root->dr_entries; 4064 4065 for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { 4066 dx_entry = &entry_list->de_entries[i]; 4067 4068 j = __ocfs2_dx_dir_hash_idx(osb, 4069 le32_to_cpu(dx_entry->dx_minor_hash)); 4070 target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data; 4071 4072 ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry); 4073 4074 /* Each leaf has been passed to the journal already 4075 * via __ocfs2_dx_dir_new_cluster() */ 4076 } 4077 4078 dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE; 4079 memset(&dx_root->dr_list, 0, osb->sb->s_blocksize - 4080 offsetof(struct ocfs2_dx_root_block, dr_list)); 4081 
dx_root->dr_list.l_count = 4082 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 4083 4084 /* This should never fail considering we start with an empty 4085 * dx_root. */ 4086 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); 4087 ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); 4088 if (ret) 4089 mlog_errno(ret); 4090 did_quota = 0; 4091 4092 ocfs2_update_inode_fsync_trans(handle, dir, 1); 4093 ocfs2_journal_dirty(handle, dx_root_bh); 4094 4095 out_commit: 4096 if (ret < 0 && did_quota) 4097 dquot_free_space_nodirty(dir, 4098 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4099 4100 ocfs2_commit_trans(osb, handle); 4101 4102 out: 4103 if (data_ac) 4104 ocfs2_free_alloc_context(data_ac); 4105 4106 if (dx_leaves) { 4107 for (i = 0; i < num_dx_leaves; i++) 4108 brelse(dx_leaves[i]); 4109 kfree(dx_leaves); 4110 } 4111 return ret; 4112 } 4113 4114 static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh) 4115 { 4116 struct ocfs2_dx_root_block *dx_root; 4117 struct ocfs2_dx_entry_list *entry_list; 4118 4119 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 4120 entry_list = &dx_root->dr_entries; 4121 4122 if (le16_to_cpu(entry_list->de_num_used) >= 4123 le16_to_cpu(entry_list->de_count)) 4124 return -ENOSPC; 4125 4126 return 0; 4127 } 4128 4129 static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir, 4130 struct buffer_head *di_bh, 4131 const char *name, 4132 int namelen, 4133 struct ocfs2_dir_lookup_result *lookup) 4134 { 4135 int ret, free_dx_root = 1; 4136 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 4137 struct buffer_head *dx_root_bh = NULL; 4138 struct buffer_head *leaf_bh = NULL; 4139 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 4140 struct ocfs2_dx_root_block *dx_root; 4141 4142 ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); 4143 if (ret) { 4144 mlog_errno(ret); 4145 goto out; 4146 } 4147 4148 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 4149 if 
(le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) { 4150 ret = -ENOSPC; 4151 mlog_errno(ret); 4152 goto out; 4153 } 4154 4155 if (ocfs2_dx_root_inline(dx_root)) { 4156 ret = ocfs2_inline_dx_has_space(dx_root_bh); 4157 4158 if (ret == 0) 4159 goto search_el; 4160 4161 /* 4162 * We ran out of room in the root block. Expand it to 4163 * an extent, then allow ocfs2_find_dir_space_dx to do 4164 * the rest. 4165 */ 4166 ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh); 4167 if (ret) { 4168 mlog_errno(ret); 4169 goto out; 4170 } 4171 } 4172 4173 /* 4174 * Insert preparation for an indexed directory is split into two 4175 * steps. The call to find_dir_space_dx reserves room in the index for 4176 * an additional item. If we run out of space there, it's a real error 4177 * we can't continue on. 4178 */ 4179 ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name, 4180 namelen, lookup); 4181 if (ret) { 4182 mlog_errno(ret); 4183 goto out; 4184 } 4185 4186 search_el: 4187 /* 4188 * Next, we need to find space in the unindexed tree. This call 4189 * searches using the free space linked list. If the unindexed tree 4190 * lacks sufficient space, we'll expand it below. The expansion code 4191 * is smart enough to add any new blocks to the free space list. 4192 */ 4193 ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup); 4194 if (ret && ret != -ENOSPC) { 4195 mlog_errno(ret); 4196 goto out; 4197 } 4198 4199 /* Do this up here - ocfs2_extend_dir might need the dx_root */ 4200 lookup->dl_dx_root_bh = dx_root_bh; 4201 free_dx_root = 0; 4202 4203 if (ret == -ENOSPC) { 4204 ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh); 4205 4206 if (ret) { 4207 mlog_errno(ret); 4208 goto out; 4209 } 4210 4211 /* 4212 * We make the assumption here that new leaf blocks are added 4213 * to the front of our free list. 
4214 */ 4215 lookup->dl_prev_leaf_bh = NULL; 4216 lookup->dl_leaf_bh = leaf_bh; 4217 } 4218 4219 out: 4220 if (free_dx_root) 4221 brelse(dx_root_bh); 4222 return ret; 4223 } 4224 4225 /* 4226 * Get a directory ready for insert. Any directory allocation required 4227 * happens here. Success returns zero, and enough context in the dir 4228 * lookup result that ocfs2_add_entry() will be able complete the task 4229 * with minimal performance impact. 4230 */ 4231 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 4232 struct inode *dir, 4233 struct buffer_head *parent_fe_bh, 4234 const char *name, 4235 int namelen, 4236 struct ocfs2_dir_lookup_result *lookup) 4237 { 4238 int ret; 4239 unsigned int blocks_wanted = 1; 4240 struct buffer_head *bh = NULL; 4241 4242 trace_ocfs2_prepare_dir_for_insert( 4243 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); 4244 4245 /* 4246 * Do this up front to reduce confusion. 4247 * 4248 * The directory might start inline, then be turned into an 4249 * indexed one, in which case we'd need to hash deep inside 4250 * ocfs2_find_dir_space_id(). Since 4251 * ocfs2_prepare_dx_dir_for_insert() also needs this hash 4252 * done, there seems no point in spreading out the calls. We 4253 * can optimize away the case where the file system doesn't 4254 * support indexing. 
4255 */ 4256 if (ocfs2_supports_indexed_dirs(osb)) 4257 ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo); 4258 4259 if (ocfs2_dir_indexed(dir)) { 4260 ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh, 4261 name, namelen, lookup); 4262 if (ret) 4263 mlog_errno(ret); 4264 goto out; 4265 } 4266 4267 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 4268 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name, 4269 namelen, &bh, &blocks_wanted); 4270 } else 4271 ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh); 4272 4273 if (ret && ret != -ENOSPC) { 4274 mlog_errno(ret); 4275 goto out; 4276 } 4277 4278 if (ret == -ENOSPC) { 4279 /* 4280 * We have to expand the directory to add this name. 4281 */ 4282 BUG_ON(bh); 4283 4284 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted, 4285 lookup, &bh); 4286 if (ret) { 4287 if (ret != -ENOSPC) 4288 mlog_errno(ret); 4289 goto out; 4290 } 4291 4292 BUG_ON(!bh); 4293 } 4294 4295 lookup->dl_leaf_bh = bh; 4296 bh = NULL; 4297 out: 4298 brelse(bh); 4299 return ret; 4300 } 4301 4302 static int ocfs2_dx_dir_remove_index(struct inode *dir, 4303 struct buffer_head *di_bh, 4304 struct buffer_head *dx_root_bh) 4305 { 4306 int ret; 4307 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 4308 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 4309 struct ocfs2_dx_root_block *dx_root; 4310 struct inode *dx_alloc_inode = NULL; 4311 struct buffer_head *dx_alloc_bh = NULL; 4312 handle_t *handle; 4313 u64 blk; 4314 u16 bit; 4315 u64 bg_blkno; 4316 4317 dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; 4318 4319 dx_alloc_inode = ocfs2_get_system_file_inode(osb, 4320 EXTENT_ALLOC_SYSTEM_INODE, 4321 le16_to_cpu(dx_root->dr_suballoc_slot)); 4322 if (!dx_alloc_inode) { 4323 ret = -ENOMEM; 4324 mlog_errno(ret); 4325 goto out; 4326 } 4327 inode_lock(dx_alloc_inode); 4328 4329 ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); 4330 if (ret) { 4331 mlog_errno(ret); 4332 goto out_mutex; 
4333 } 4334 4335 handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS); 4336 if (IS_ERR(handle)) { 4337 ret = PTR_ERR(handle); 4338 mlog_errno(ret); 4339 goto out_unlock; 4340 } 4341 4342 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 4343 OCFS2_JOURNAL_ACCESS_WRITE); 4344 if (ret) { 4345 mlog_errno(ret); 4346 goto out_commit; 4347 } 4348 4349 spin_lock(&OCFS2_I(dir)->ip_lock); 4350 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; 4351 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4352 spin_unlock(&OCFS2_I(dir)->ip_lock); 4353 di->i_dx_root = cpu_to_le64(0ULL); 4354 ocfs2_update_inode_fsync_trans(handle, dir, 1); 4355 4356 ocfs2_journal_dirty(handle, di_bh); 4357 4358 blk = le64_to_cpu(dx_root->dr_blkno); 4359 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4360 if (dx_root->dr_suballoc_loc) 4361 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); 4362 else 4363 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4364 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4365 bit, bg_blkno, 1); 4366 if (ret) 4367 mlog_errno(ret); 4368 4369 out_commit: 4370 ocfs2_commit_trans(osb, handle); 4371 4372 out_unlock: 4373 ocfs2_inode_unlock(dx_alloc_inode, 1); 4374 4375 out_mutex: 4376 inode_unlock(dx_alloc_inode); 4377 brelse(dx_alloc_bh); 4378 out: 4379 iput(dx_alloc_inode); 4380 return ret; 4381 } 4382 4383 int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) 4384 { 4385 int ret; 4386 unsigned int clen; 4387 u32 major_hash = UINT_MAX, p_cpos, cpos; 4388 u64 blkno; 4389 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 4390 struct buffer_head *dx_root_bh = NULL; 4391 struct ocfs2_dx_root_block *dx_root; 4392 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 4393 struct ocfs2_cached_dealloc_ctxt dealloc; 4394 struct ocfs2_extent_tree et; 4395 4396 ocfs2_init_dealloc_ctxt(&dealloc); 4397 4398 if (!ocfs2_dir_indexed(dir)) 4399 return 0; 4400 4401 ret = ocfs2_read_dx_root(dir, di, 
&dx_root_bh); 4402 if (ret) { 4403 mlog_errno(ret); 4404 goto out; 4405 } 4406 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 4407 4408 if (ocfs2_dx_root_inline(dx_root)) 4409 goto remove_index; 4410 4411 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); 4412 4413 /* XXX: What if dr_clusters is too large? */ 4414 while (le32_to_cpu(dx_root->dr_clusters)) { 4415 ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list, 4416 major_hash, &cpos, &blkno, &clen); 4417 if (ret) { 4418 mlog_errno(ret); 4419 goto out; 4420 } 4421 4422 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4423 4424 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, 4425 &dealloc, 0, false); 4426 if (ret) { 4427 mlog_errno(ret); 4428 goto out; 4429 } 4430 4431 if (cpos == 0) 4432 break; 4433 4434 major_hash = cpos - 1; 4435 } 4436 4437 remove_index: 4438 ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh); 4439 if (ret) { 4440 mlog_errno(ret); 4441 goto out; 4442 } 4443 4444 ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); 4445 out: 4446 ocfs2_schedule_truncate_log_flush(osb, 1); 4447 ocfs2_run_deallocs(osb, &dealloc); 4448 4449 brelse(dx_root_bh); 4450 return ret; 4451 } 4452