#include <linux/ceph/ceph_debug.h>

#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/xattr.h>

#include "super.h"
#include "mds_client.h"

/*
 * Directory operations: readdir, lookup, create, link, unlink,
 * rename, etc.
 */

/*
 * Ceph MDS operations are specified in terms of a base ino and
 * relative path.  Thus, the client can specify an operation on a
 * specific inode (e.g., a getattr due to fstat(2)), or as a path
 * relative to, say, the root directory.
 *
 * Normally, we limit ourselves to strict inode ops (no path component)
 * or dentry operations (a single path component relative to an ino).  The
 * exception to this is open_root_dentry(), which will open the mount
 * point by name.
 */

const struct dentry_operations ceph_dentry_ops;

/*
 * Initialize ceph dentry state.
 */
int ceph_init_dentry(struct dentry *dentry)
{
        struct ceph_dentry_info *di;

        if (dentry->d_fsdata)
                return 0;

        di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
        if (!di)
                return -ENOMEM;          /* oh well */

        spin_lock(&dentry->d_lock);
        if (dentry->d_fsdata) {
                /* lost a race */
                kmem_cache_free(ceph_dentry_cachep, di);
                goto out_unlock;
        }

        if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP)
                d_set_d_op(dentry, &ceph_dentry_ops);
        else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
                d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
        else
                d_set_d_op(dentry, &ceph_snap_dentry_ops);

        di->dentry = dentry;
        di->lease_session = NULL;
        dentry->d_time = jiffies;
        /* avoid reordering d_fsdata setup so that the check above is safe */
        smp_mb();
        dentry->d_fsdata = di;
        ceph_dentry_lru_add(dentry);
out_unlock:
        spin_unlock(&dentry->d_lock);
        return 0;
}

/*
 * for f_pos for readdir:
 * - hash order:
 *	(0xff << 52) | ((24 bits hash) << 28) |
 *	(the nth entry has hash collision);
 * - frag+name order:
 *	((frag value) << 28) | (the nth entry in frag);
 */
#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
        loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
        if (hash_order)
                fpos |= HASH_ORDER;
        return fpos;
}

static bool is_hash_order(loff_t p)
{
        return (p & HASH_ORDER) == HASH_ORDER;
}

static unsigned fpos_frag(loff_t p)
{
        return p >> OFFSET_BITS;
}

static unsigned fpos_hash(loff_t p)
{
        return ceph_frag_value(fpos_frag(p));
}

static unsigned fpos_off(loff_t p)
{
        return p & OFFSET_MASK;
}

static int fpos_cmp(loff_t l, loff_t r)
{
        int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
        if (v)
                return v;
        return (int)(fpos_off(l) - fpos_off(r));
}
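/*
 * Worked encoding example for the layout above (illustrative only):
 * for a frag whose 32-bit frag_t value is F, the 5th entry in
 * frag+name order is
 *	fpos = ((loff_t)F << 28) | 5
 * and in hash order, for 24-bit hash 0xabcdef, the 3rd entry with
 * that hash is
 *	fpos = HASH_ORDER | (0xabcdefULL << 28) | 3
 *	     = (0xffULL << 52) | (0xabcdefULL << 28) | 3
 *	     = 0xffabcdef0000003
 */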
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
                            int len, unsigned next_offset)
{
        char *buf = kmalloc(len+1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        kfree(fi->last_name);
        fi->last_name = buf;
        memcpy(fi->last_name, name, len);
        fi->last_name[len] = 0;
        fi->next_offset = next_offset;
        dout("note_last_dentry '%s'\n", fi->last_name);
        return 0;
}


static struct dentry *
__dcache_find_get_entry(struct dentry *parent, u64 idx,
                        struct ceph_readdir_cache_control *cache_ctl)
{
        struct inode *dir = d_inode(parent);
        struct dentry *dentry;
        unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
        loff_t ptr_pos = idx * sizeof(struct dentry *);
        pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;

        if (ptr_pos >= i_size_read(dir))
                return NULL;

        if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
                ceph_readdir_cache_release(cache_ctl);
                cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
                if (!cache_ctl->page) {
                        dout(" page %lu not found\n", ptr_pgoff);
                        return ERR_PTR(-EAGAIN);
                }
                /* reading/filling the cache are serialized by
                   i_mutex, no need to use page lock */
                unlock_page(cache_ctl->page);
                cache_ctl->dentries = kmap(cache_ctl->page);
        }

        cache_ctl->index = idx & idx_mask;

        rcu_read_lock();
        spin_lock(&parent->d_lock);
        /* check i_size again here, because empty directory can be
         * marked as complete while not holding the i_mutex. */
        if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
                dentry = cache_ctl->dentries[cache_ctl->index];
        else
                dentry = NULL;
        spin_unlock(&parent->d_lock);
        if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
                dentry = NULL;
        rcu_read_unlock();
        return dentry ? : ERR_PTR(-EAGAIN);
}
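/*
 * Illustrative index math for the helper above, assuming 4K pages and
 * 8-byte dentry pointers (so 512 slots per page):
 *	idx 600 -> ptr_pos   = 600 * 8 = 4800
 *	        -> ptr_pgoff = 4800 >> 12 = 1   (second cache page)
 *	        -> slot      = 600 & 511 = 88   within that page
 */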
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * Complete dir indicates that we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 */
static int __dcache_readdir(struct file *file, struct dir_context *ctx,
                            u32 shared_gen)
{
        struct ceph_file_info *fi = file->private_data;
        struct dentry *parent = file->f_path.dentry;
        struct inode *dir = d_inode(parent);
        struct dentry *dentry, *last = NULL;
        struct ceph_dentry_info *di;
        struct ceph_readdir_cache_control cache_ctl = {};
        u64 idx = 0;
        int err = 0;

        dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);

        /* search start position */
        if (ctx->pos > 2) {
                u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
                while (count > 0) {
                        u64 step = count >> 1;
                        dentry = __dcache_find_get_entry(parent, idx + step,
                                                         &cache_ctl);
                        if (!dentry) {
                                /* use linear search */
                                idx = 0;
                                break;
                        }
                        if (IS_ERR(dentry)) {
                                err = PTR_ERR(dentry);
                                goto out;
                        }
                        di = ceph_dentry(dentry);
                        spin_lock(&dentry->d_lock);
                        if (fpos_cmp(di->offset, ctx->pos) < 0) {
                                idx += step + 1;
                                count -= step + 1;
                        } else {
                                count = step;
                        }
                        spin_unlock(&dentry->d_lock);
                        dput(dentry);
                }

                dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
        }

        for (;;) {
                bool emit_dentry = false;
                dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
                if (!dentry) {
                        fi->flags |= CEPH_F_ATEND;
                        err = 0;
                        break;
                }
                if (IS_ERR(dentry)) {
                        err = PTR_ERR(dentry);
                        goto out;
                }

                di = ceph_dentry(dentry);
                spin_lock(&dentry->d_lock);
                if (di->lease_shared_gen == shared_gen &&
                    d_really_is_positive(dentry) &&
                    fpos_cmp(ctx->pos, di->offset) <= 0) {
                        emit_dentry = true;
                }
                spin_unlock(&dentry->d_lock);

                if (emit_dentry) {
                        dout(" %llx dentry %p %pd %p\n", di->offset,
                             dentry, dentry, d_inode(dentry));
                        ctx->pos = di->offset;
                        if (!dir_emit(ctx, dentry->d_name.name,
                                      dentry->d_name.len,
                                      ceph_translate_ino(dentry->d_sb,
                                                d_inode(dentry)->i_ino),
                                      d_inode(dentry)->i_mode >> 12)) {
                                dput(dentry);
                                err = 0;
                                break;
                        }
                        ctx->pos++;

                        if (last)
                                dput(last);
                        last = dentry;
                } else {
                        dput(dentry);
                }
        }
out:
        ceph_readdir_cache_release(&cache_ctl);
        if (last) {
                int ret;
                di = ceph_dentry(last);
                ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
                                       fpos_off(di->offset) + 1);
                if (ret < 0)
                        err = ret;
                dput(last);
        }
        return err;
}

static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
{
        if (!fi->last_readdir)
                return true;
        if (is_hash_order(pos))
                return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
        else
                return fi->frag != fpos_frag(pos);
}
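/*
 * Offset convention (illustrative summary): positions 0 and 1 are the
 * locally synthesized "." and ".." entries emitted below, so real
 * directory entries always start at offset 2; this is also why
 * next_offset is reset to 2 between frags ("compensate for . and ..").
 */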
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        int i;
        int err;
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;

        dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
        if (fi->flags & CEPH_F_ATEND)
                return 0;

        /* always start with . and .. */
        if (ctx->pos == 0) {
                dout("readdir off 0 -> '.'\n");
                if (!dir_emit(ctx, ".", 1,
                              ceph_translate_ino(inode->i_sb, inode->i_ino),
                              inode->i_mode >> 12))
                        return 0;
                ctx->pos = 1;
        }
        if (ctx->pos == 1) {
                ino_t ino = parent_ino(file->f_path.dentry);
                dout("readdir off 1 -> '..'\n");
                if (!dir_emit(ctx, "..", 2,
                              ceph_translate_ino(inode->i_sb, ino),
                              inode->i_mode >> 12))
                        return 0;
                ctx->pos = 2;
        }

        /* can we use the dcache? */
        spin_lock(&ci->i_ceph_lock);
        if (ceph_test_mount_opt(fsc, DCACHE) &&
            !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
            ceph_snap(inode) != CEPH_SNAPDIR &&
            __ceph_dir_is_complete_ordered(ci) &&
            __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
                u32 shared_gen = ci->i_shared_gen;
                spin_unlock(&ci->i_ceph_lock);
                err = __dcache_readdir(file, ctx, shared_gen);
                if (err != -EAGAIN)
                        return err;
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }

        /* proceed with a normal readdir */
more:
        /* do we have the correct frag content buffered? */
        if (need_send_readdir(fi, ctx->pos)) {
                struct ceph_mds_request *req;
                unsigned frag;
                int op = ceph_snap(inode) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

                /* discard old result, if any */
                if (fi->last_readdir) {
                        ceph_mdsc_put_request(fi->last_readdir);
                        fi->last_readdir = NULL;
                }

                if (is_hash_order(ctx->pos)) {
                        frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
                                                NULL, NULL);
                } else {
                        frag = fpos_frag(ctx->pos);
                }

                dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
                     ceph_vinop(inode), frag, fi->last_name);
                req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
                if (IS_ERR(req))
                        return PTR_ERR(req);
                err = ceph_alloc_readdir_reply_buffer(req, inode);
                if (err) {
                        ceph_mdsc_put_request(req);
                        return err;
                }
                /* hints to request -> mds selection code */
                req->r_direct_mode = USE_AUTH_MDS;
                req->r_direct_hash = ceph_frag_value(frag);
                req->r_direct_is_hash = true;
                if (fi->last_name) {
                        req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
                                ceph_mdsc_put_request(req);
                                return -ENOMEM;
                        }
                }
                req->r_dir_release_cnt = fi->dir_release_count;
                req->r_dir_ordered_cnt = fi->dir_ordered_count;
                req->r_readdir_cache_idx = fi->readdir_cache_idx;
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
                req->r_args.readdir.flags =
                                cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);

                req->r_inode = inode;
                ihold(inode);
                req->r_dentry = dget(file->f_path.dentry);
                err = ceph_mdsc_do_request(mdsc, NULL, req);
                if (err < 0) {
                        ceph_mdsc_put_request(req);
                        return err;
                }
                dout("readdir got and parsed readdir result=%d on "
                     "frag %x, end=%d, complete=%d, hash_order=%d\n",
                     err, frag,
                     (int)req->r_reply_info.dir_end,
                     (int)req->r_reply_info.dir_complete,
                     (int)req->r_reply_info.hash_order);

                rinfo = &req->r_reply_info;
                if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                        frag = le32_to_cpu(rinfo->dir_dir->frag);
                        if (!rinfo->hash_order) {
                                fi->next_offset = req->r_readdir_offset;
                                /* adjust ctx->pos to beginning of frag */
                                ctx->pos = ceph_make_fpos(frag,
                                                          fi->next_offset,
                                                          false);
                        }
                }

                fi->frag = frag;
                fi->last_readdir = req;
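                /*
                 * Illustrative case for the frag check above: we requested
                 * frag X, but the directory split in the meantime and the
                 * MDS replied with a different frag; in frag+name order we
                 * rewind ctx->pos to the first entry of the replied frag so
                 * that iteration restarts there cleanly.
                 */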
                if (req->r_did_prepopulate) {
                        fi->readdir_cache_idx = req->r_readdir_cache_idx;
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
                                fi->dir_ordered_count = 0;
                        } else if (ceph_frag_is_leftmost(frag) &&
                                   fi->next_offset == 2) {
                                /* note dir version at start of readdir so
                                 * we can tell if any dentries get dropped */
                                fi->dir_release_count = req->r_dir_release_cnt;
                                fi->dir_ordered_count = req->r_dir_ordered_cnt;
                        }
                } else {
                        dout("readdir !did_prepopulate");
                        /* disable readdir cache */
                        fi->readdir_cache_idx = -1;
                        /* preclude from marking dir complete */
                        fi->dir_release_count = 0;
                }

                /* note next offset and last dentry name */
                if (rinfo->dir_nr > 0) {
                        struct ceph_mds_reply_dir_entry *rde =
                                rinfo->dir_entries + (rinfo->dir_nr-1);
                        unsigned next_offset = req->r_reply_info.dir_end ?
                                2 : (fpos_off(rde->offset) + 1);
                        err = note_last_dentry(fi, rde->name, rde->name_len,
                                               next_offset);
                        if (err)
                                return err;
                } else if (req->r_reply_info.dir_end) {
                        fi->next_offset = 2;
                        /* keep last name */
                }
        }

        rinfo = &fi->last_readdir->r_reply_info;
        dout("readdir frag %x num %d pos %llx chunk first %llx\n",
             fi->frag, rinfo->dir_nr, ctx->pos,
             rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);

        i = 0;
        /* search start position */
        if (rinfo->dir_nr > 0) {
                int step, nr = rinfo->dir_nr;
                while (nr > 0) {
                        step = nr >> 1;
                        if (rinfo->dir_entries[i + step].offset < ctx->pos) {
                                i += step + 1;
                                nr -= step + 1;
                        } else {
                                nr = step;
                        }
                }
        }
        for (; i < rinfo->dir_nr; i++) {
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                ino_t ino;

                BUG_ON(rde->offset < ctx->pos);

                ctx->pos = rde->offset;
                dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
                     i, rinfo->dir_nr, ctx->pos,
                     rde->name_len, rde->name, &rde->inode.in);

                BUG_ON(!rde->inode.in);
                ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
                vino.ino = le64_to_cpu(rde->inode.in->ino);
                vino.snap = le64_to_cpu(rde->inode.in->snapid);
                ino = ceph_vino_to_ino(vino);

                if (!dir_emit(ctx, rde->name, rde->name_len,
                              ceph_translate_ino(inode->i_sb, ino), ftype)) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
                ctx->pos++;
        }

        if (fi->next_offset > 2) {
                ceph_mdsc_put_request(fi->last_readdir);
                fi->last_readdir = NULL;
                goto more;
        }

        /* more frags? */
        if (!ceph_frag_is_rightmost(fi->frag)) {
                unsigned frag = ceph_frag_next(fi->frag);
                if (is_hash_order(ctx->pos)) {
                        loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
                                                        fi->next_offset, true);
                        if (new_pos > ctx->pos)
                                ctx->pos = new_pos;
                        /* keep last_name */
                } else {
                        ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
                        kfree(fi->last_name);
                        fi->last_name = NULL;
                }
                dout("readdir next frag is %x\n", frag);
                goto more;
        }
        fi->flags |= CEPH_F_ATEND;
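        /*
         * Frag traversal example for the code above (illustrative): in
         * frag+name order, finishing frag A sets ctx->pos to
         * ceph_make_fpos(ceph_frag_next(A), 2, false), i.e. the first
         * real entry of the next frag, and drops last_name since the
         * name-based continuation is only meaningful within one frag.
         */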
        /*
         * if dir_release_count still matches the dir, no dentries
         * were released during the whole readdir, and we should have
         * the complete dir contents in our cache.
         */
        if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
                spin_lock(&ci->i_ceph_lock);
                if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
                        dout(" marking %p complete and ordered\n", inode);
                        /* use i_size to track number of entries in
                         * readdir cache */
                        BUG_ON(fi->readdir_cache_idx < 0);
                        i_size_write(inode, fi->readdir_cache_idx *
                                     sizeof(struct dentry *));
                } else {
                        dout(" marking %p complete\n", inode);
                }
                __ceph_dir_set_complete(ci, fi->dir_release_count,
                                        fi->dir_ordered_count);
                spin_unlock(&ci->i_ceph_lock);
        }

        dout("readdir %p file %p done.\n", inode, file);
        return 0;
}

static void reset_readdir(struct ceph_file_info *fi)
{
        if (fi->last_readdir) {
                ceph_mdsc_put_request(fi->last_readdir);
                fi->last_readdir = NULL;
        }
        kfree(fi->last_name);
        fi->last_name = NULL;
        fi->dir_release_count = 0;
        fi->readdir_cache_idx = -1;
        fi->next_offset = 2;  /* compensate for . and .. */
        fi->flags &= ~CEPH_F_ATEND;
}

/*
 * discard buffered readdir content on seekdir(0), or seek to new frag,
 * or seek prior to current chunk
 */
static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
{
        struct ceph_mds_reply_info_parsed *rinfo;
        loff_t chunk_offset;
        if (new_pos == 0)
                return true;
        if (is_hash_order(new_pos)) {
                /* no need to reset last_name for a forward seek when
                 * dentries are sorted in hash order */
        } else if (fi->frag != fpos_frag(new_pos)) {
                return true;
        }
        rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
        if (!rinfo || !rinfo->dir_nr)
                return true;
        chunk_offset = rinfo->dir_entries[0].offset;
        return new_pos < chunk_offset ||
               is_hash_order(new_pos) != is_hash_order(chunk_offset);
}

static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_mapping->host;
        loff_t retval;

        inode_lock(inode);
        retval = -EINVAL;
        switch (whence) {
        case SEEK_CUR:
                offset += file->f_pos;
                /* fall through */
        case SEEK_SET:
                break;
        case SEEK_END:
                retval = -EOPNOTSUPP;
                /* fall through */
        default:
                goto out;
        }

        if (offset >= 0) {
                if (need_reset_readdir(fi, offset)) {
                        dout("dir_llseek dropping %p content\n", file);
                        reset_readdir(fi);
                } else if (is_hash_order(offset) && offset > file->f_pos) {
                        /* for hash offset, we don't know if a forward seek
                         * is within same frag */
                        fi->dir_release_count = 0;
                        fi->readdir_cache_idx = -1;
                }

                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                        fi->flags &= ~CEPH_F_ATEND;
                }
                retval = offset;
        }
out:
        inode_unlock(inode);
        return retval;
}
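/*
 * Seek semantics for the llseek path above (illustrative): seekdir to
 * offset 0 always drops buffered readdir state, as does a seek into a
 * different frag or to before the currently buffered chunk.  A forward
 * seek to a hash-order offset conservatively disables the readdir
 * cache, since we cannot tell whether it stays within the same frag.
 */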
/*
 * Handle lookups for the hidden .snap directory.
 */
int ceph_handle_snapdir(struct ceph_mds_request *req,
                        struct dentry *dentry, int err)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */

        /* .snap dir? */
        if (err == -ENOENT &&
            ceph_snap(parent) == CEPH_NOSNAP &&
            strcmp(dentry->d_name.name,
                   fsc->mount_options->snapdir_name) == 0) {
                struct inode *inode = ceph_get_snapdir(parent);
                dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
                     dentry, dentry, inode);
                BUG_ON(!d_unhashed(dentry));
                d_add(dentry, inode);
                err = 0;
        }
        return err;
}

/*
 * Figure out final result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                  struct dentry *dentry, int err)
{
        if (err == -ENOENT) {
                /* no trace? */
                err = 0;
                if (!req->r_reply_info.head->is_dentry) {
                        dout("ENOENT and no trace, dentry %p inode %p\n",
                             dentry, d_inode(dentry));
                        if (d_really_is_positive(dentry)) {
                                d_drop(dentry);
                                err = -ENOENT;
                        } else {
                                d_add(dentry, NULL);
                        }
                }
        }
        if (err)
                dentry = ERR_PTR(err);
        else if (dentry != req->r_dentry)
                dentry = dget(req->r_dentry);   /* we got spliced */
        else
                dentry = NULL;
        return dentry;
}

static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
        return ceph_ino(inode) == CEPH_INO_ROOT &&
                strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}

/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int op;
        int mask;
        int err;

        dout("lookup %p dentry %p '%pd'\n",
             dir, dentry, dentry);

        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);

        err = ceph_init_dentry(dentry);
        if (err < 0)
                return ERR_PTR(err);

        /* can we conclude ENOENT locally? */
        if (d_really_is_negative(dentry)) {
                struct ceph_inode_info *ci = ceph_inode(dir);
                struct ceph_dentry_info *di = ceph_dentry(dentry);

                spin_lock(&ci->i_ceph_lock);
                dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
                if (strncmp(dentry->d_name.name,
                            fsc->mount_options->snapdir_name,
                            dentry->d_name.len) &&
                    !is_root_ceph_dentry(dir, dentry) &&
                    ceph_test_mount_opt(fsc, DCACHE) &&
                    __ceph_dir_is_complete(ci) &&
                    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
                        spin_unlock(&ci->i_ceph_lock);
                        dout(" dir %p complete, -ENOENT\n", dir);
                        d_add(dentry, NULL);
                        di->lease_shared_gen = ci->i_shared_gen;
                        return NULL;
                }
                spin_unlock(&ci->i_ceph_lock);
        }
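        /*
         * Fast-path example for the block above (illustrative): with
         * "-o dcache" and a directory known to be complete while we hold
         * CEPH_CAP_FILE_SHARED, a lookup of a name that is not in the
         * dcache can return a negative dentry right away, with no MDS
         * round trip.
         */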
        op = ceph_snap(dir) == CEPH_SNAPDIR ?
                CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
        req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
        if (IS_ERR(req))
                return ERR_CAST(req);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;

        mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
        if (ceph_security_xattr_wanted(dir))
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.getattr.mask = cpu_to_le32(mask);

        req->r_locked_dir = dir;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        err = ceph_handle_snapdir(req, dentry, err);
        dentry = ceph_finish_lookup(req, dentry, err);
        ceph_mdsc_put_request(req);  /* will dput(dentry) */
        dout("lookup result=%p\n", dentry);
        return dentry;
}

/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
        struct dentry *result = ceph_lookup(dir, dentry, 0);

        if (result && !IS_ERR(result)) {
                /*
                 * We created the item, then did a lookup, and found
                 * it was already linked to another inode we already
                 * had in our cache (and thus got spliced).  To not
                 * confuse VFS (especially when inode is a directory),
                 * we don't link our dentry to that inode, return an
                 * error instead.
                 *
                 * This event should be rare and it happens only when
                 * we talk to old MDS.  Recent MDS does not send traceless
                 * reply for request that creates new inode.
                 */
                d_drop(result);
                return -ESTALE;
        }
        return PTR_ERR(result);
}

static int ceph_mknod(struct inode *dir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_acls_info acls = {};
        int err;

        if (ceph_snap(dir) != CEPH_NOSNAP)
                return -EROFS;

        err = ceph_pre_init_acls(dir, &mode, &acls);
        if (err < 0)
                return err;

        dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
             dir, dentry, mode, rdev);
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_locked_dir = dir;
        req->r_args.mknod.mode = cpu_to_le32(mode);
        req->r_args.mknod.rdev = cpu_to_le32(rdev);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        if (acls.pagelist) {
                req->r_pagelist = acls.pagelist;
                acls.pagelist = NULL;
        }
        err = ceph_mdsc_do_request(mdsc, dir, req);
        if (!err && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
        ceph_mdsc_put_request(req);
out:
        if (!err)
                ceph_init_inode_acls(d_inode(dentry), &acls);
        else
                d_drop(dentry);
        ceph_release_acls_info(&acls);
        return err;
}

static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                       bool excl)
{
        return ceph_mknod(dir, dentry, mode, 0);
}

static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                        const char *dest)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;

        if (ceph_snap(dir) != CEPH_NOSNAP)
                return -EROFS;

        dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
        req->r_path2 = kstrdup(dest, GFP_KERNEL);
        if (!req->r_path2) {
                err = -ENOMEM;
                ceph_mdsc_put_request(req);
                goto out;
        }
        req->r_locked_dir = dir;
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        err = ceph_mdsc_do_request(mdsc, dir, req);
        if (!err && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
        ceph_mdsc_put_request(req);
out:
        if (err)
                d_drop(dentry);
        return err;
}

static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_acls_info acls = {};
        int err = -EROFS;
        int op;

        if (ceph_snap(dir) == CEPH_SNAPDIR) {
                /* mkdir .snap/foo is a MKSNAP */
                op = CEPH_MDS_OP_MKSNAP;
                dout("mksnap dir %p snap '%pd' dn %p\n", dir,
                     dentry, dentry);
        } else if (ceph_snap(dir) == CEPH_NOSNAP) {
                dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
                op = CEPH_MDS_OP_MKDIR;
        } else {
                goto out;
        }

        mode |= S_IFDIR;
        err = ceph_pre_init_acls(dir, &mode, &acls);
        if (err < 0)
                goto out;

        req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }

        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_locked_dir = dir;
        req->r_args.mkdir.mode = cpu_to_le32(mode);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        if (acls.pagelist) {
                req->r_pagelist = acls.pagelist;
                acls.pagelist = NULL;
        }
        err = ceph_mdsc_do_request(mdsc, dir, req);
        if (!err &&
            !req->r_reply_info.head->is_target &&
            !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
        ceph_mdsc_put_request(req);
out:
        if (!err)
                ceph_init_inode_acls(d_inode(dentry), &acls);
        else
                d_drop(dentry);
        ceph_release_acls_info(&acls);
        return err;
}

static int ceph_link(struct dentry *old_dentry, struct inode *dir,
                     struct dentry *dentry)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int err;

        if (ceph_snap(dir) != CEPH_NOSNAP)
                return -EROFS;

        dout("link in dir %p old_dentry %p dentry %p\n", dir,
             old_dentry, dentry);
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                d_drop(dentry);
                return PTR_ERR(req);
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
        req->r_locked_dir = dir;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        /* release LINK_SHARED on source inode (mds will lock it) */
        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
        err = ceph_mdsc_do_request(mdsc, dir, req);
        if (err) {
                d_drop(dentry);
        } else if (!req->r_reply_info.head->is_dentry) {
                ihold(d_inode(old_dentry));
                d_instantiate(dentry, d_inode(old_dentry));
        }
        ceph_mdsc_put_request(req);
        return err;
}
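/*
 * Cap-mask arithmetic for the helper below (illustrative): the base
 * mask always drops the LINK caps; when the last link is going away,
 * it widens to everything we do not currently want except PIN:
 *
 *	drop  = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
 *	drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
 */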
/*
 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
 * looks like the link count will hit 0, drop any other caps (other
 * than PIN) we don't specifically want (due to the file still being
 * open).
 */
static int drop_caps_for_unlink(struct inode *inode)
{
        struct ceph_inode_info *ci = ceph_inode(inode);
        int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

        spin_lock(&ci->i_ceph_lock);
        if (inode->i_nlink == 1) {
                drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
                ci->i_ceph_flags |= CEPH_I_NODELAY;
        }
        spin_unlock(&ci->i_ceph_lock);
        return drop;
}

/*
 * rmdir and unlink differ only by the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = d_inode(dentry);
        struct ceph_mds_request *req;
        int err = -EROFS;
        int op;

        if (ceph_snap(dir) == CEPH_SNAPDIR) {
                /* rmdir .snap/foo is RMSNAP */
                dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
                op = CEPH_MDS_OP_RMSNAP;
        } else if (ceph_snap(dir) == CEPH_NOSNAP) {
                dout("unlink/rmdir dir %p dn %p inode %p\n",
                     dir, dentry, inode);
                op = d_is_dir(dentry) ?
                        CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
        } else
                goto out;
        req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_locked_dir = dir;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_inode_drop = drop_caps_for_unlink(inode);
        err = ceph_mdsc_do_request(mdsc, dir, req);
        if (!err && !req->r_reply_info.head->is_dentry)
                d_delete(dentry);
        ceph_mdsc_put_request(req);
out:
        return err;
}
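/*
 * Rename note (illustrative): renaming within the snapdir, e.g.
 * "mv dir/.snap/a dir/.snap/b", renames a snapshot and is sent as
 * CEPH_MDS_OP_RENAMESNAP below; a rename that crosses snap contexts
 * fails with -EXDEV, and any other snapped rename with -EROFS.
 */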
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                       struct inode *new_dir, struct dentry *new_dentry)
{
        struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        int op = CEPH_MDS_OP_RENAME;
        int err;

        if (ceph_snap(old_dir) != ceph_snap(new_dir))
                return -EXDEV;
        if (ceph_snap(old_dir) != CEPH_NOSNAP) {
                if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
                        op = CEPH_MDS_OP_RENAMESNAP;
                else
                        return -EROFS;
        }
        dout("rename dir %p dentry %p to dir %p dentry %p\n",
             old_dir, old_dentry, new_dir, new_dentry);
        req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req))
                return PTR_ERR(req);
        ihold(old_dir);
        req->r_dentry = dget(new_dentry);
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
        req->r_old_dentry_dir = old_dir;
        req->r_locked_dir = new_dir;
        req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        /* release LINK_RDCACHE on source inode (mds will lock it) */
        req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
        if (d_really_is_positive(new_dentry))
                req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
        err = ceph_mdsc_do_request(mdsc, old_dir, req);
        if (!err && !req->r_reply_info.head->is_dentry) {
                /*
                 * Normally d_move() is done by fill_trace (called by
                 * do_request, above).  If there is no trace, we need
                 * to do it here.
                 */

                /* d_move screws up sibling dentries' offsets */
                ceph_dir_clear_complete(old_dir);
                ceph_dir_clear_complete(new_dir);

                d_move(old_dentry, new_dentry);

                /* ensure target dentry is invalidated, despite
                   rehashing bug in vfs_rename_dir */
                ceph_invalidate_dentry_lease(new_dentry);
        }
        ceph_mdsc_put_request(req);
        return err;
}

/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_time = jiffies;
        ceph_dentry(dentry)->lease_shared_gen = 0;
        spin_unlock(&dentry->d_lock);
}

/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 */
static int dentry_lease_is_valid(struct dentry *dentry)
{
        struct ceph_dentry_info *di;
        struct ceph_mds_session *s;
        int valid = 0;
        u32 gen;
        unsigned long ttl;
        struct ceph_mds_session *session = NULL;
        struct inode *dir = NULL;
        u32 seq = 0;

        spin_lock(&dentry->d_lock);
        di = ceph_dentry(dentry);
        if (di->lease_session) {
                s = di->lease_session;
                spin_lock(&s->s_gen_ttl_lock);
                gen = s->s_cap_gen;
                ttl = s->s_cap_ttl;
                spin_unlock(&s->s_gen_ttl_lock);

                if (di->lease_gen == gen &&
                    time_before(jiffies, dentry->d_time) &&
                    time_before(jiffies, ttl)) {
                        valid = 1;
                        if (di->lease_renew_after &&
                            time_after(jiffies, di->lease_renew_after)) {
                                /* we should renew */
                                dir = d_inode(dentry->d_parent);
                                session = ceph_get_mds_session(s);
                                seq = di->lease_seq;
                                di->lease_renew_after = 0;
                                di->lease_renew_from = jiffies;
                        }
                }
        }
        spin_unlock(&dentry->d_lock);

        if (session) {
                ceph_mdsc_lease_send_msg(session, dir, dentry,
                                         CEPH_MDS_LEASE_RENEW, seq);
                ceph_put_mds_session(session);
        }
        dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
        return valid;
}

/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
        struct ceph_inode_info *ci = ceph_inode(dir);
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        int valid = 0;

        spin_lock(&ci->i_ceph_lock);
        if (ci->i_shared_gen == di->lease_shared_gen)
                valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
        spin_unlock(&ci->i_ceph_lock);
        dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
             dir, (unsigned)ci->i_shared_gen, dentry,
             (unsigned)di->lease_shared_gen, valid);
        return valid;
}
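/*
 * Lease timing sketch (illustrative): a dentry lease checked above is
 * valid only while jiffies is before both dentry->d_time and the
 * session cap TTL, with a matching session generation; once jiffies
 * passes lease_renew_after (set around the lease half-life, per the
 * comment above), a successful validity check also fires an async
 * CEPH_MDS_LEASE_RENEW message to the MDS.
 */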
/*
 * Check if cached dentry can be trusted.
 */
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        int valid = 0;
        struct dentry *parent;
        struct inode *dir;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
             dentry, d_inode(dentry), ceph_dentry(dentry)->offset);

        parent = dget_parent(dentry);
        dir = d_inode(parent);

        /* always trust cached snapped dentries, snapdir dentry */
        if (ceph_snap(dir) != CEPH_NOSNAP) {
                dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
                     dentry, d_inode(dentry));
                valid = 1;
        } else if (d_really_is_positive(dentry) &&
                   ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
                valid = 1;
        } else if (dentry_lease_is_valid(dentry) ||
                   dir_lease_is_valid(dir, dentry)) {
                if (d_really_is_positive(dentry))
                        valid = ceph_is_any_caps(d_inode(dentry));
                else
                        valid = 1;
        }

        if (!valid) {
                struct ceph_mds_client *mdsc =
                        ceph_sb_to_client(dir->i_sb)->mdsc;
                struct ceph_mds_request *req;
                int op, mask, err;

                op = ceph_snap(dir) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
                req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
                if (!IS_ERR(req)) {
                        req->r_dentry = dget(dentry);
                        req->r_num_caps = 2;

                        mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
                        if (ceph_security_xattr_wanted(dir))
                                mask |= CEPH_CAP_XATTR_SHARED;
                        req->r_args.getattr.mask = cpu_to_le32(mask);

                        req->r_locked_dir = dir;
                        err = ceph_mdsc_do_request(mdsc, NULL, req);
                        if (err == 0 || err == -ENOENT) {
                                if (dentry == req->r_dentry) {
                                        valid = !d_unhashed(dentry);
                                } else {
                                        d_invalidate(req->r_dentry);
                                        err = -EAGAIN;
                                }
                        }
                        ceph_mdsc_put_request(req);
                        dout("d_revalidate %p lookup result=%d\n",
                             dentry, err);
                }
        }

        dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
        if (valid) {
                ceph_dentry_lru_touch(dentry);
        } else {
                ceph_dir_clear_complete(dir);
        }

        dput(parent);
        return valid;
}

/*
 * Release our ceph_dentry_info.
 */
static void ceph_d_release(struct dentry *dentry)
{
        struct ceph_dentry_info *di = ceph_dentry(dentry);

        dout("d_release %p\n", dentry);
        ceph_dentry_lru_del(dentry);
        if (di->lease_session)
                ceph_put_mds_session(di->lease_session);
        kmem_cache_free(ceph_dentry_cachep, di);
        dentry->d_fsdata = NULL;
}

static int ceph_snapdir_d_revalidate(struct dentry *dentry,
                                     unsigned int flags)
{
        /*
         * Eventually, we'll want to revalidate snapped metadata
         * too... probably...
         */
        return 1;
}

/*
 * When the VFS prunes a dentry from the cache, we need to clear the
 * complete flag on the parent directory.
 *
 * Called under dentry->d_lock.
 */
static void ceph_d_prune(struct dentry *dentry)
{
        dout("ceph_d_prune %p\n", dentry);

        /* do we have a valid parent? */
        if (IS_ROOT(dentry))
                return;

        /* if we are not hashed, we don't affect dir's completeness */
        if (d_unhashed(dentry))
                return;

        /*
         * we hold d_lock, so d_parent is stable, and d_fsdata is never
         * cleared until d_release
         */
        ceph_dir_clear_complete(d_inode(dentry->d_parent));
}
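/*
 * Sample output of the dirstat hack below (values invented), with the
 * fs mounted with -o dirstat:
 *
 *	$ cat /mnt/ceph/mydir
 *	entries:                     42
 *	  files:                     40
 *	 subdirs:                     2
 *	...
 */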
/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
                             loff_t *ppos)
{
        struct ceph_file_info *cf = file->private_data;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        int left;
        const int bufsize = 1024;

        if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                return -EISDIR;

        if (!cf->dir_info) {
                cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
                if (!cf->dir_info)
                        return -ENOMEM;
                cf->dir_info_len =
                        snprintf(cf->dir_info, bufsize,
                                 "entries:   %20lld\n"
                                 " files:    %20lld\n"
                                 " subdirs:  %20lld\n"
                                 "rentries:  %20lld\n"
                                 " rfiles:   %20lld\n"
                                 " rsubdirs: %20lld\n"
                                 "rbytes:    %20lld\n"
                                 "rctime:    %10ld.%09ld\n",
                                 ci->i_files + ci->i_subdirs,
                                 ci->i_files,
                                 ci->i_subdirs,
                                 ci->i_rfiles + ci->i_rsubdirs,
                                 ci->i_rfiles,
                                 ci->i_rsubdirs,
                                 ci->i_rbytes,
                                 (long)ci->i_rctime.tv_sec,
                                 (long)ci->i_rctime.tv_nsec);
        }

        if (*ppos >= cf->dir_info_len)
                return 0;
        size = min_t(unsigned, size, cf->dir_info_len-*ppos);
        left = copy_to_user(buf, cf->dir_info + *ppos, size);
        if (left == size)
                return -EFAULT;
        *ppos += (size - left);
        return size - left;
}

/*
 * We maintain a private dentry LRU.
 *
 * FIXME: this needs to be changed to a per-mds lru to be useful.
 */
void ceph_dentry_lru_add(struct dentry *dn)
{
        struct ceph_dentry_info *di = ceph_dentry(dn);
        struct ceph_mds_client *mdsc;

        dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn);
        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
        spin_lock(&mdsc->dentry_lru_lock);
        list_add_tail(&di->lru, &mdsc->dentry_lru);
        mdsc->num_dentry++;
        spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_touch(struct dentry *dn)
{
        struct ceph_dentry_info *di = ceph_dentry(dn);
        struct ceph_mds_client *mdsc;

        dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn,
             di->offset);
        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
        spin_lock(&mdsc->dentry_lru_lock);
        list_move_tail(&di->lru, &mdsc->dentry_lru);
        spin_unlock(&mdsc->dentry_lru_lock);
}

void ceph_dentry_lru_del(struct dentry *dn)
{
        struct ceph_dentry_info *di = ceph_dentry(dn);
        struct ceph_mds_client *mdsc;

        dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn);
        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
        spin_lock(&mdsc->dentry_lru_lock);
        list_del_init(&di->lru);
        mdsc->num_dentry--;
        spin_unlock(&mdsc->dentry_lru_lock);
}
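/*
 * Hash selection for the helper below (sketch): dl_dir_hash 0 (legacy)
 * or CEPH_STR_HASH_LINUX reuses the hash the VFS already computed for
 * d_name; anything else, e.g. an rjenkins-hashed directory layout, is
 * recomputed with ceph_str_hash() over the raw name bytes.
 */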
/*
 * Return name hash for a given dentry.  This is dependent on
 * the parent directory's hash function.
 */
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
        struct ceph_inode_info *dci = ceph_inode(dir);

        switch (dci->i_dir_layout.dl_dir_hash) {
        case 0: /* for backward compat */
        case CEPH_STR_HASH_LINUX:
                return dn->d_name.hash;

        default:
                return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
                                     dn->d_name.name, dn->d_name.len);
        }
}

const struct file_operations ceph_dir_fops = {
        .read = ceph_read_dir,
        .iterate = ceph_readdir,
        .llseek = ceph_dir_llseek,
        .open = ceph_open,
        .release = ceph_release,
        .unlocked_ioctl = ceph_ioctl,
        .fsync = ceph_fsync,
};

const struct file_operations ceph_snapdir_fops = {
        .iterate = ceph_readdir,
        .llseek = ceph_dir_llseek,
        .open = ceph_open,
        .release = ceph_release,
};

const struct inode_operations ceph_dir_iops = {
        .lookup = ceph_lookup,
        .permission = ceph_permission,
        .getattr = ceph_getattr,
        .setattr = ceph_setattr,
        .setxattr = generic_setxattr,
        .getxattr = generic_getxattr,
        .listxattr = ceph_listxattr,
        .removexattr = generic_removexattr,
        .get_acl = ceph_get_acl,
        .set_acl = ceph_set_acl,
        .mknod = ceph_mknod,
        .symlink = ceph_symlink,
        .mkdir = ceph_mkdir,
        .link = ceph_link,
        .unlink = ceph_unlink,
        .rmdir = ceph_unlink,
        .rename = ceph_rename,
        .create = ceph_create,
        .atomic_open = ceph_atomic_open,
};

const struct inode_operations ceph_snapdir_iops = {
        .lookup = ceph_lookup,
        .permission = ceph_permission,
        .getattr = ceph_getattr,
        .mkdir = ceph_mkdir,
        .rmdir = ceph_unlink,
        .rename = ceph_rename,
};

const struct dentry_operations ceph_dentry_ops = {
        .d_revalidate = ceph_d_revalidate,
        .d_release = ceph_d_release,
        .d_prune = ceph_d_prune,
};

const struct dentry_operations ceph_snapdir_dentry_ops = {
        .d_revalidate = ceph_snapdir_d_revalidate,
        .d_release = ceph_d_release,
};

const struct dentry_operations ceph_snap_dentry_ops = {
        .d_release = ceph_d_release,
        .d_prune = ceph_d_prune,
};