#include "ceph_debug.h"

#include <linux/spinlock.h>
#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include "super.h"

/*
 * Directory operations: readdir, lookup, create, link, unlink,
 * rename, etc.
 */

/*
 * Ceph MDS operations are specified in terms of a base ino and
 * relative path.  Thus, the client can specify an operation on a
 * specific inode (e.g., a getattr due to fstat(2)), or as a path
 * relative to, say, the root directory.
 *
 * Normally, we limit ourselves to strict inode ops (no path component)
 * or dentry operations (a single path component relative to an ino).
 * The exception to this is open_root_dentry(), which will open the
 * mount point by name.
 */

const struct inode_operations ceph_dir_iops;
const struct file_operations ceph_dir_fops;
struct dentry_operations ceph_dentry_ops;

/*
 * Initialize ceph dentry state.
 */
int ceph_init_dentry(struct dentry *dentry)
{
	struct ceph_dentry_info *di;

	if (dentry->d_fsdata)
		return 0;

	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
		dentry->d_op = &ceph_dentry_ops;
	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
		dentry->d_op = &ceph_snapdir_dentry_ops;
	else
		dentry->d_op = &ceph_snap_dentry_ops;

	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
	if (!di)
		return -ENOMEM;          /* oh well */

	spin_lock(&dentry->d_lock);
	if (dentry->d_fsdata) {
		/* lost a race */
		kmem_cache_free(ceph_dentry_cachep, di);
		goto out_unlock;
	}
	di->dentry = dentry;
	di->lease_session = NULL;
	dentry->d_fsdata = di;
	dentry->d_time = jiffies;
	ceph_dentry_lru_add(dentry);
out_unlock:
	spin_unlock(&dentry->d_lock);
	return 0;
}



/*
 * for readdir, we encode the directory frag and offset within that
 * frag into f_pos.
 */
static unsigned fpos_frag(loff_t p)
{
	return p >> 32;
}
static unsigned fpos_off(loff_t p)
{
	return p & 0xffffffff;
}
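
/*
 * For reference, a sketch of the packing these helpers invert:
 * ceph_make_fpos() (from super.h) is assumed to combine the pair as
 *
 *	f_pos = ((loff_t)frag << 32) | off;
 *
 * so, e.g., frag 0x1000 at offset 2 yields f_pos 0x0000100000000002,
 * and fpos_frag()/fpos_off() recover the two halves.
 */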

/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_u.d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * I_COMPLETE indicates we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 */
static int __dcache_readdir(struct file *filp,
			    void *dirent, filldir_t filldir)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_file_info *fi = filp->private_data;
	struct dentry *parent = filp->f_dentry;
	struct inode *dir = parent->d_inode;
	struct list_head *p;
	struct dentry *dentry, *last;
	struct ceph_dentry_info *di;
	int err = 0;

	/* claim ref on last dentry we returned */
	last = fi->dentry;
	fi->dentry = NULL;

	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
	     last);

	spin_lock(&dcache_lock);

	/* start at beginning? */
	if (filp->f_pos == 2 || (last &&
				 filp->f_pos < ceph_dentry(last)->offset)) {
		if (list_empty(&parent->d_subdirs))
			goto out_unlock;
		p = parent->d_subdirs.prev;
		dout(" initial p %p/%p\n", p->prev, p->next);
	} else {
		p = last->d_u.d_child.prev;
	}

more:
	dentry = list_entry(p, struct dentry, d_u.d_child);
	di = ceph_dentry(dentry);
	while (1) {
		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
		     d_unhashed(dentry) ? "!hashed" : "hashed",
		     parent->d_subdirs.prev, parent->d_subdirs.next);
		if (p == &parent->d_subdirs) {
			fi->at_end = 1;
			goto out_unlock;
		}
		if (!d_unhashed(dentry) && dentry->d_inode &&
		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
		    filp->f_pos <= di->offset)
			break;
		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, di->offset,
		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
		     !dentry->d_inode ? " null" : "");
		p = p->prev;
		dentry = list_entry(p, struct dentry, d_u.d_child);
		di = ceph_dentry(dentry);
	}

	atomic_inc(&dentry->d_count);
	spin_unlock(&dcache_lock);
	spin_unlock(&inode->i_lock);

	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
	filp->f_pos = di->offset;
	err = filldir(dirent, dentry->d_name.name,
		      dentry->d_name.len, di->offset,
		      dentry->d_inode->i_ino,
		      dentry->d_inode->i_mode >> 12);

	if (last) {
		if (err < 0) {
			/* remember our position */
			fi->dentry = last;
			fi->next_offset = di->offset;
		} else {
			dput(last);
		}
		last = NULL;
	}

	spin_lock(&inode->i_lock);
	spin_lock(&dcache_lock);

	last = dentry;

	if (err < 0)
		goto out_unlock;

	p = p->prev;
	filp->f_pos++;

	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
		goto more;
	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
	err = -EAGAIN;

out_unlock:
	spin_unlock(&dcache_lock);

	if (last) {
		spin_unlock(&inode->i_lock);
		dput(last);
		spin_lock(&inode->i_lock);
	}

	return err;
}
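
/*
 * Locking note on the above: the caller (ceph_readdir) enters with
 * inode->i_lock held; __dcache_readdir drops both i_lock and
 * dcache_lock around the filldir() callback, which may fault in user
 * pages and sleep.  That is why I_COMPLETE is rechecked after every
 * entry: a dentry may have been dropped while the locks were released.
 */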

/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
			    int len)
{
	kfree(fi->last_name);
	fi->last_name = kmalloc(len+1, GFP_NOFS);
	if (!fi->last_name)
		return -ENOMEM;
	memcpy(fi->last_name, name, len);
	fi->last_name[len] = 0;
	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}
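
/*
 * The main readdir entry point.  In outline (see the code below for
 * the authoritative details):
 *
 *  1. synthesize "." and ".." at f_pos 0 and 1;
 *  2. if the dir is I_COMPLETE and we hold CEPH_CAP_FILE_SHARED,
 *     answer from the dcache via __dcache_readdir();
 *  3. otherwise, page through the directory one frag at a time with
 *     MDS READDIR (or LSSNAP) requests, chunked by the max_readdir
 *     and max_readdir_bytes mount options, resuming after the last
 *     dentry name noted by note_last_dentry();
 *  4. if no dentries were released in the meantime, mark the dir
 *     I_COMPLETE so later readdirs and lookups can stay local.
 */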
static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct ceph_file_info *fi = filp->private_data;
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *client = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = &client->mdsc;
	unsigned frag = fpos_frag(filp->f_pos);
	int off = fpos_off(filp->f_pos);
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;
	const int max_entries = client->mount_args->max_readdir;
	const int max_bytes = client->mount_args->max_readdir_bytes;

	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
	if (fi->at_end)
		return 0;

	/* always start with . and .. */
	if (filp->f_pos == 0) {
		/* note dir version at start of readdir so we can tell
		 * if any dentries get dropped */
		fi->dir_release_count = ci->i_release_count;

		dout("readdir off 0 -> '.'\n");
		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
			    inode->i_ino, inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 1;
		off = 1;
	}
	if (filp->f_pos == 1) {
		dout("readdir off 1 -> '..'\n");
		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
			    filp->f_dentry->d_parent->d_inode->i_ino,
			    inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 2;
		off = 2;
	}

	/* can we use the dcache? */
	spin_lock(&inode->i_lock);
	if ((filp->f_pos == 2 || fi->dentry) &&
	    !ceph_test_opt(client, NOASYNCREADDIR) &&
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
		err = __dcache_readdir(filp, dirent, filldir);
		if (err != -EAGAIN) {
			spin_unlock(&inode->i_lock);
			return err;
		}
	}
	spin_unlock(&inode->i_lock);
	if (fi->dentry) {
		err = note_last_dentry(fi, fi->dentry->d_name.name,
				       fi->dentry->d_name.len);
		if (err)
			return err;
		dput(fi->dentry);
		fi->dentry = NULL;
	}

	/* proceed with a normal readdir */

more:
	/* do we have the correct frag content buffered? */
	if (fi->frag != frag || fi->last_readdir == NULL) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (fi->last_readdir) {
			ceph_mdsc_put_request(fi->last_readdir);
			fi->last_readdir = NULL;
		}

		/* requery frag tree, as the frag topology may have changed */
		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
		req->r_inode = igrab(inode);
		req->r_dentry = dget(filp->f_dentry);
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		req->r_direct_hash = ceph_frag_value(frag);
		req->r_direct_is_hash = true;
		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
		req->r_num_caps = max_entries + 1;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d"
		     " on frag %x, end=%d, complete=%d\n", err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete);

		if (!req->r_did_prepopulate) {
			dout("readdir !did_prepopulate\n");
			fi->dir_release_count--;    /* preclude I_COMPLETE */
		}

		/* note next offset and last dentry name */
		fi->offset = fi->next_offset;
		fi->last_readdir = req;

		if (req->r_reply_info.dir_end) {
			kfree(fi->last_name);
			fi->last_name = NULL;
			fi->next_offset = 2;
		} else {
			rinfo = &req->r_reply_info;
			err = note_last_dentry(fi,
				       rinfo->dir_dname[rinfo->dir_nr-1],
				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
			if (err)
				return err;
			fi->next_offset += rinfo->dir_nr;
		}
	}

	rinfo = &fi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
	     rinfo->dir_nr, off, fi->offset);
	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
		u64 pos = ceph_make_fpos(frag, off);
		struct ceph_mds_reply_inode *in =
			rinfo->dir_in[off - fi->offset].in;
		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
		     off, off - fi->offset, rinfo->dir_nr, pos,
		     rinfo->dir_dname_len[off - fi->offset],
		     rinfo->dir_dname[off - fi->offset], in);
		BUG_ON(!in);
		ftype = le32_to_cpu(in->mode) >> 12;
		if (filldir(dirent,
			    rinfo->dir_dname[off - fi->offset],
			    rinfo->dir_dname_len[off - fi->offset],
			    pos,
			    le64_to_cpu(in->ino),
			    ftype) < 0) {
			dout("filldir stopping us...\n");
			return 0;
		}
		off++;
		filp->f_pos = pos + 1;
	}

	if (fi->last_name) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(frag)) {
		frag = ceph_frag_next(frag);
		off = 0;
		filp->f_pos = ceph_make_fpos(frag, off);
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	fi->at_end = 1;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	spin_lock(&inode->i_lock);
	if (ci->i_release_count == fi->dir_release_count) {
		dout(" marking %p complete\n", inode);
		ci->i_ceph_flags |= CEPH_I_COMPLETE;
		ci->i_max_offset = filp->f_pos;
	}
	spin_unlock(&inode->i_lock);

	dout("readdir %p filp %p done.\n", inode, filp);
	return 0;
}
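
/*
 * Discard any buffered readdir state so the next readdir starts over
 * from the beginning of the directory (offset 2, just past the
 * synthesized "." and "..").
 */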
static void reset_readdir(struct ceph_file_info *fi)
{
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
	}
	kfree(fi->last_name);
	fi->next_offset = 2;  /* compensate for . and .. */
	if (fi->dentry) {
		dput(fi->dentry);
		fi->dentry = NULL;
	}
	fi->at_end = 0;
}
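
/*
 * llseek on a directory.  Since f_pos encodes (frag, off), a seek to
 * a different frag, a seek before the currently buffered chunk, or a
 * rewind to 0 must discard the buffered readdir content; SEEK_END is
 * only approximated (see the FIXME below).
 */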
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t old_offset = offset;
	loff_t retval;

	mutex_lock(&inode->i_mutex);
	switch (origin) {
	case SEEK_END:
		offset += inode->i_size + 2;  /* FIXME */
		break;
	case SEEK_CUR:
		offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			fi->at_end = 0;
		}
		retval = offset;

		/*
		 * discard buffered readdir content on seekdir(0), or
		 * seek to new frag, or seek prior to current chunk.
		 */
		if (offset == 0 ||
		    fpos_frag(offset) != fpos_frag(old_offset) ||
		    fpos_off(offset) < fi->offset) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(fi);
		}

		/* bump dir_release_count if we did a forward seek */
		if (offset > old_offset)
			fi->dir_release_count--;
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}

/*
 * Process result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = dentry->d_parent->d_inode;

	/* .snap dir? */
	if (err == -ENOENT &&
	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
	    strcmp(dentry->d_name.name,
		   client->mount_args->snapdir_name) == 0) {
		struct inode *inode = ceph_get_snapdir(parent);
		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
		BUG_ON(!d_unhashed(dentry));
		d_add(dentry, inode);
		err = 0;
	}

	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, dentry->d_inode);
			if (dentry->d_inode) {
				d_drop(dentry);
				err = -ENOENT;
			} else {
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry);  /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}

static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
	return ceph_ino(inode) == CEPH_INO_ROOT &&
		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
}

/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int op;
	int err;

	dout("lookup %p dentry %p '%.*s'\n",
	     dir, dentry, dentry->d_name.len, dentry->d_name.name);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	err = ceph_init_dentry(dentry);
	if (err < 0)
		return ERR_PTR(err);

	/* open (but not create!) intent? */
	if (nd &&
	    (nd->flags & LOOKUP_OPEN) &&
	    (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
	    !(nd->intent.open.flags & O_CREAT)) {
		int mode = nd->intent.open.create_mode & ~current->fs->umask;
		return ceph_lookup_open(dir, dentry, nd, mode, 1);
	}

	/* can we conclude ENOENT locally? */
	if (dentry->d_inode == NULL) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&dir->i_lock);
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		if (strncmp(dentry->d_name.name,
			    client->mount_args->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			spin_unlock(&dir->i_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);
			di->lease_shared_gen = ci->i_shared_gen;
			return NULL;
		}
		spin_unlock(&dir->i_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_CAST(req);
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	/* we only need inode linkage */
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	req->r_locked_dir = dir;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}

/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
	struct dentry *result = ceph_lookup(dir, dentry, NULL);

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
		 * had in our cache (and thus got spliced).  Link our
		 * dentry to that inode, but don't hash it, just in
		 * case the VFS wants to dereference it.
		 */
		BUG_ON(!result->d_inode);
		d_instantiate(dentry, result->d_inode);
		return 0;
	}
	return PTR_ERR(result);
}
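
/*
 * The namespace operations below (mknod, create, symlink, mkdir) all
 * share a pattern: build an MDS request against the dentry and let
 * the reply trace populate the dcache, falling back to
 * ceph_handle_notrace_create() when no trace comes back.  The
 * r_dentry_drop/r_dentry_unless pair is (roughly) an offer to the MDS
 * to release our FILE_SHARED cap on the directory, unless we hold
 * FILE_EXCL, saving it a separate revoke.
 */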
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
		      int mode, dev_t rdev)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
	if (err)
		d_drop(dentry);
	return err;
}

static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
		       struct nameidata *nd)
{
	dout("create in dir %p dentry %p name '%.*s'\n",
	     dir, dentry, dentry->d_name.len, dentry->d_name.name);

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	if (nd) {
		BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
		dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
		/* hrm, what should i do here if we get aliased? */
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);
		return 0;
	}

	/* fall back to mknod */
	return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
}

static int ceph_symlink(struct inode *dir, struct dentry *dentry,
			const char *dest)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_path2 = kstrdup(dest, GFP_NOFS);
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
	if (err)
		d_drop(dentry);
	return err;
}

static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
		     dentry->d_name.len, dentry->d_name.name, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		goto out;
	}
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (err < 0)
		d_drop(dentry);
	return err;
}
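
/*
 * Create a hard link.  If the reply carries no trace, instantiate the
 * new dentry from the source inode ourselves.
 */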
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err)
		d_drop(dentry);
	else if (!req->r_reply_info.head->is_dentry)
		d_instantiate(dentry, igrab(old_dentry->d_inode));
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * For a soon-to-be unlinked file, drop the LINK caps.  If it
 * looks like the link count will hit 0, drop any other caps (other
 * than PIN) we don't specifically want (due to the file still being
 * open).
 */
static int drop_caps_for_unlink(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

	spin_lock(&inode->i_lock);
	if (inode->i_nlink == 1) {
		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
		ci->i_ceph_flags |= CEPH_I_NODELAY;
	}
	spin_unlock(&inode->i_lock);
	return drop;
}

/*
 * rmdir and unlink differ only in the metadata op code
 */
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct inode *inode = dentry->d_inode;
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* rmdir .snap/foo is RMSNAP */
		dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
		     dentry->d_name.name, dentry);
		op = CEPH_MDS_OP_RMSNAP;
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("unlink/rmdir dir %p dn %p inode %p\n",
		     dir, dentry, inode);
		op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
	} else
		goto out;
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_inode_drop = drop_caps_for_unlink(inode);
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		d_delete(dentry);
	ceph_mdsc_put_request(req);
out:
	return err;
}
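
/*
 * Rename a dentry.  Both directories must belong to the same snap
 * context: cross-snapshot renames get -EXDEV, and anything inside a
 * snapshot is read-only (-EROFS).
 */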
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;
	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
	    ceph_snap(new_dir) != CEPH_NOSNAP)
		return -EROFS;
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_locked_dir = new_dir;
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	if (new_dentry->d_inode)
		req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */

		/* d_move screws up d_subdirs order */
		ceph_i_clear(new_dir, CEPH_I_COMPLETE);

		d_move(old_dentry, new_dentry);

		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}

/*
 * Ensure a dentry lease will no longer revalidate.
 */
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_time = jiffies;
	ceph_dentry(dentry)->lease_shared_gen = 0;
	spin_unlock(&dentry->d_lock);
}

/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 */
static int dentry_lease_is_valid(struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	struct inode *dir = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di && di->lease_session) {
		s = di->lease_session;
		spin_lock(&s->s_cap_lock);
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
		spin_unlock(&s->s_cap_lock);

		if (di->lease_gen == gen &&
		    time_before(jiffies, dentry->d_time) &&
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
				/* we should renew */
				dir = dentry->d_parent->d_inode;
				session = ceph_get_mds_session(s);
				seq = di->lease_seq;
				di->lease_renew_after = 0;
				di->lease_renew_from = jiffies;
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}

/*
 * Check if directory-wide content lease/cap is valid.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

	spin_lock(&dir->i_lock);
	if (ci->i_shared_gen == di->lease_shared_gen)
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&dir->i_lock);
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
	     dir, (unsigned)ci->i_shared_gen, dentry,
	     (unsigned)di->lease_shared_gen, valid);
	return valid;
}

/*
 * Check if cached dentry can be trusted.
 */
static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	struct inode *dir = dentry->d_parent->d_inode;

	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
	     ceph_dentry(dentry)->offset);

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
		goto out_touch;
	}
	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
		goto out_touch;

	if (dentry_lease_is_valid(dentry) ||
	    dir_lease_is_valid(dir, dentry))
		goto out_touch;

	dout("d_revalidate %p invalid\n", dentry);
	d_drop(dentry);
	return 0;
out_touch:
	ceph_dentry_lru_touch(dentry);
	return 1;
}

/*
 * When a dentry is released, clear the dir I_COMPLETE if it was part
 * of the current dir gen or if this is in the snapshot namespace.
 */
static void ceph_dentry_release(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	struct inode *parent_inode = dentry->d_parent->d_inode;
	u64 snapid = ceph_snap(parent_inode);

	dout("dentry_release %p parent %p\n", dentry, parent_inode);

	if (parent_inode && snapid != CEPH_SNAPDIR) {
		struct ceph_inode_info *ci = ceph_inode(parent_inode);

		spin_lock(&parent_inode->i_lock);
		if (ci->i_shared_gen == di->lease_shared_gen ||
		    snapid <= CEPH_MAXSNAP) {
			dout(" clearing %p complete (d_release)\n",
			     parent_inode);
			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
			ci->i_release_count++;
		}
		spin_unlock(&parent_inode->i_lock);
	}
	if (di) {
		ceph_dentry_lru_del(dentry);
		if (di->lease_session)
			ceph_put_mds_session(di->lease_session);
		kmem_cache_free(ceph_dentry_cachep, di);
		dentry->d_fsdata = NULL;
	}
}

static int ceph_snapdir_d_revalidate(struct dentry *dentry,
				     struct nameidata *nd)
{
	/*
	 * Eventually, we'll want to revalidate snapped metadata
	 * too... probably...
	 */
	return 1;
}



/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.
 */
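/*
 * In practice that means something like 'cat mydir' on a dirstat
 * mount returns the recursive stats text built below, while without
 * the option read(2) fails with the usual EISDIR.
 */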
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *cf = file->private_data;
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;

	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	if (!cf->dir_info) {
		cf->dir_info = kmalloc(1024, GFP_NOFS);
		if (!cf->dir_info)
			return -ENOMEM;
		cf->dir_info_len =
			sprintf(cf->dir_info,
				"entries: %20lld\n"
				" files: %20lld\n"
				" subdirs: %20lld\n"
				"rentries: %20lld\n"
				" rfiles: %20lld\n"
				" rsubdirs: %20lld\n"
				"rbytes: %20lld\n"
				"rctime: %10ld.%09ld\n",
				ci->i_files + ci->i_subdirs,
				ci->i_files,
				ci->i_subdirs,
				ci->i_rfiles + ci->i_rsubdirs,
				ci->i_rfiles,
				ci->i_rsubdirs,
				ci->i_rbytes,
				(long)ci->i_rctime.tv_sec,
				(long)ci->i_rctime.tv_nsec);
	}

	if (*ppos >= cf->dir_info_len)
		return 0;
	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
	left = copy_to_user(buf, cf->dir_info + *ppos, size);
	if (left == size)
		return -EFAULT;
	*ppos += (size - left);
	return size - left;
}

/*
 * an fsync() on a dir will wait for any uncommitted directory
 * operations to commit.
 */
static int ceph_dir_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_dirops;
	struct ceph_mds_request *req;
	u64 last_tid;
	int ret = 0;

	dout("dir_fsync %p\n", inode);
	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	req = list_entry(head->prev,
			 struct ceph_mds_request, r_unsafe_dir_item);
	last_tid = req->r_tid;

	do {
		ceph_mdsc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
		     inode, req->r_tid, last_tid);
		if (req->r_timeout) {
			ret = wait_for_completion_timeout(
				&req->r_safe_completion, req->r_timeout);
			if (ret > 0)
				ret = 0;
			else if (ret == 0)
				ret = -EIO;  /* timed out */
		} else {
			wait_for_completion(&req->r_safe_completion);
		}
		spin_lock(&ci->i_unsafe_lock);
		ceph_mdsc_put_request(req);

		if (ret || list_empty(head))
			break;
		req = list_entry(head->next,
				 struct ceph_mds_request, r_unsafe_dir_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
	return ret;
}
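
/*
 * Note on the wait above: r_safe_completion is assumed to fire only
 * when the MDS sends its final "safe" reply, i.e. once the operation
 * has been committed to the MDS journal, which is what gives fsync
 * its durability guarantee here.
 */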

/*
 * We maintain a private dentry LRU.
 *
 * FIXME: this needs to be changed to a per-mds lru to be useful.
 */
void ceph_dentry_lru_add(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
	     dn->d_name.len, dn->d_name.name);
	if (di) {
		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
		spin_lock(&mdsc->dentry_lru_lock);
		list_add_tail(&di->lru, &mdsc->dentry_lru);
		mdsc->num_dentry++;
		spin_unlock(&mdsc->dentry_lru_lock);
	}
}

void ceph_dentry_lru_touch(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
	     dn->d_name.len, dn->d_name.name, di->offset);
	if (di) {
		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
		spin_lock(&mdsc->dentry_lru_lock);
		list_move_tail(&di->lru, &mdsc->dentry_lru);
		spin_unlock(&mdsc->dentry_lru_lock);
	}
}

void ceph_dentry_lru_del(struct dentry *dn)
{
	struct ceph_dentry_info *di = ceph_dentry(dn);
	struct ceph_mds_client *mdsc;

	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
	     dn->d_name.len, dn->d_name.name);
	if (di) {
		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
		spin_lock(&mdsc->dentry_lru_lock);
		list_del_init(&di->lru);
		mdsc->num_dentry--;
		spin_unlock(&mdsc->dentry_lru_lock);
	}
}

const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,
	.readdir = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.fsync = ceph_dir_fsync,
};

const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.setxattr = ceph_setxattr,
	.getxattr = ceph_getxattr,
	.listxattr = ceph_listxattr,
	.removexattr = ceph_removexattr,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,
	.rename = ceph_rename,
	.create = ceph_create,
};

struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_release = ceph_dentry_release,
};

struct dentry_operations ceph_snapdir_dentry_ops = {
	.d_revalidate = ceph_snapdir_d_revalidate,
	.d_release = ceph_dentry_release,
};

struct dentry_operations ceph_snap_dentry_ops = {
	.d_release = ceph_dentry_release,
};