1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/ceph/ceph_debug.h> 3 4 #include <linux/module.h> 5 #include <linux/fs.h> 6 #include <linux/slab.h> 7 #include <linux/string.h> 8 #include <linux/uaccess.h> 9 #include <linux/kernel.h> 10 #include <linux/writeback.h> 11 #include <linux/vmalloc.h> 12 #include <linux/xattr.h> 13 #include <linux/posix_acl.h> 14 #include <linux/random.h> 15 #include <linux/sort.h> 16 #include <linux/iversion.h> 17 #include <linux/fscrypt.h> 18 19 #include "super.h" 20 #include "mds_client.h" 21 #include "cache.h" 22 #include "crypto.h" 23 #include <linux/ceph/decode.h> 24 25 /* 26 * Ceph inode operations 27 * 28 * Implement basic inode helpers (get, alloc) and inode ops (getattr, 29 * setattr, etc.), xattr helpers, and helpers for assimilating 30 * metadata returned by the MDS into our cache. 31 * 32 * Also define helpers for doing asynchronous writeback, invalidation, 33 * and truncation for the benefit of those who can't afford to block 34 * (typically because they are in the message handler path). 35 */ 36 37 static const struct inode_operations ceph_symlink_iops; 38 static const struct inode_operations ceph_encrypted_symlink_iops; 39 40 static void ceph_inode_work(struct work_struct *work); 41 42 /* 43 * find or create an inode, given the ceph ino number 44 */ 45 static int ceph_set_ino_cb(struct inode *inode, void *data) 46 { 47 struct ceph_inode_info *ci = ceph_inode(inode); 48 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 49 50 ci->i_vino = *(struct ceph_vino *)data; 51 inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); 52 inode_set_iversion_raw(inode, 0); 53 percpu_counter_inc(&mdsc->metric.total_inodes); 54 55 return 0; 56 } 57 58 /* 59 * Check if the parent inode matches the vino from directory reply info 60 */ 61 static inline bool ceph_vino_matches_parent(struct inode *parent, 62 struct ceph_vino vino) 63 { 64 return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; 65 } 66 67 /* 68 * Validate that the directory inode referenced by @req->r_parent matches the 69 * inode number and snapshot id contained in the reply's directory record. If 70 * they do not match – which can theoretically happen if the parent dentry was 71 * moved between the time the request was issued and the reply arrived – fall 72 * back to looking up the correct inode in the inode cache. 73 * 74 * A reference is *always* returned. Callers that receive a different inode 75 * than the original @parent are responsible for dropping the extra reference 76 * once the reply has been processed. 77 */ 78 static struct inode *ceph_get_reply_dir(struct super_block *sb, 79 struct inode *parent, 80 struct ceph_mds_reply_info_parsed *rinfo) 81 { 82 struct ceph_vino vino; 83 84 if (unlikely(!rinfo->diri.in)) 85 return parent; /* nothing to compare against */ 86 87 /* If we didn't have a cached parent inode to begin with, just bail out. */ 88 if (!parent) 89 return NULL; 90 91 vino.ino = le64_to_cpu(rinfo->diri.in->ino); 92 vino.snap = le64_to_cpu(rinfo->diri.in->snapid); 93 94 if (likely(ceph_vino_matches_parent(parent, vino))) 95 return parent; /* matches – use the original reference */ 96 97 /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */ 98 WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", 99 ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); 100 101 return ceph_get_inode(sb, vino, NULL); 102 } 103 104 /** 105 * ceph_new_inode - allocate a new inode in advance of an expected create 106 * @dir: parent directory for new inode 107 * @dentry: dentry that may eventually point to new inode 108 * @mode: mode of new inode 109 * @as_ctx: pointer to inherited security context 110 * 111 * Allocate a new inode in advance of an operation to create a new inode. 112 * This allocates the inode and sets up the acl_sec_ctx with appropriate 113 * info for the new inode. 114 * 115 * Returns a pointer to the new inode or an ERR_PTR. 116 */ 117 struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, 118 umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) 119 { 120 int err; 121 struct inode *inode; 122 123 inode = new_inode(dir->i_sb); 124 if (!inode) 125 return ERR_PTR(-ENOMEM); 126 127 inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; 128 129 if (!S_ISLNK(*mode)) { 130 err = ceph_pre_init_acls(dir, mode, as_ctx); 131 if (err < 0) 132 goto out_err; 133 } 134 135 inode_state_assign_raw(inode, 0); 136 inode->i_mode = *mode; 137 138 err = ceph_security_init_secctx(dentry, *mode, as_ctx); 139 if (err < 0) 140 goto out_err; 141 142 /* 143 * We'll skip setting fscrypt context for snapshots, leaving that for 144 * the handle_reply(). 145 */ 146 if (ceph_snap(dir) != CEPH_SNAPDIR) { 147 err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); 148 if (err) 149 goto out_err; 150 } 151 152 return inode; 153 out_err: 154 iput(inode); 155 return ERR_PTR(err); 156 } 157 158 void ceph_as_ctx_to_req(struct ceph_mds_request *req, 159 struct ceph_acl_sec_ctx *as_ctx) 160 { 161 if (as_ctx->pagelist) { 162 req->r_pagelist = as_ctx->pagelist; 163 as_ctx->pagelist = NULL; 164 } 165 ceph_fscrypt_as_ctx_to_req(req, as_ctx); 166 } 167 168 /** 169 * ceph_get_inode - find or create/hash a new inode 170 * @sb: superblock to search and allocate in 171 * @vino: vino to search for 172 * @newino: optional new inode to insert if one isn't found (may be NULL) 173 * 174 * Search for or insert a new inode into the hash for the given vino, and 175 * return a reference to it. If new is non-NULL, its reference is consumed. 176 */ 177 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, 178 struct inode *newino) 179 { 180 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); 181 struct ceph_client *cl = mdsc->fsc->client; 182 struct inode *inode; 183 184 if (ceph_vino_is_reserved(vino)) 185 return ERR_PTR(-EREMOTEIO); 186 187 if (newino) { 188 inode = inode_insert5(newino, (unsigned long)vino.ino, 189 ceph_ino_compare, ceph_set_ino_cb, &vino); 190 if (inode != newino) 191 iput(newino); 192 } else { 193 inode = iget5_locked(sb, (unsigned long)vino.ino, 194 ceph_ino_compare, ceph_set_ino_cb, &vino); 195 } 196 197 if (!inode) { 198 doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap); 199 return ERR_PTR(-ENOMEM); 200 } 201 202 doutc(cl, "on %llx=%llx.%llx got %p new %d\n", 203 ceph_present_inode(inode), ceph_vinop(inode), inode, 204 !!(inode_state_read_once(inode) & I_NEW)); 205 return inode; 206 } 207 208 /* 209 * get/construct snapdir inode for a given directory 210 */ 211 struct inode *ceph_get_snapdir(struct inode *parent) 212 { 213 struct ceph_client *cl = ceph_inode_to_client(parent); 214 struct ceph_vino vino = { 215 .ino = ceph_ino(parent), 216 .snap = CEPH_SNAPDIR, 217 }; 218 struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); 219 struct ceph_inode_info *ci = ceph_inode(inode); 220 int ret = -ENOTDIR; 221 222 if (IS_ERR(inode)) 223 return inode; 224 225 if (!S_ISDIR(parent->i_mode)) { 226 pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n", 227 parent->i_mode); 228 goto err; 229 } 230 231 if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { 232 pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", 233 inode->i_mode); 234 goto err; 235 } 236 237 inode->i_mode = parent->i_mode; 238 inode->i_uid = parent->i_uid; 239 inode->i_gid = parent->i_gid; 240 inode_set_mtime_to_ts(inode, inode_get_mtime(parent)); 241 inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); 242 inode_set_atime_to_ts(inode, inode_get_atime(parent)); 243 ci->i_rbytes = 0; 244 ci->i_btime = ceph_inode(parent)->i_btime; 245 246 #ifdef CONFIG_FS_ENCRYPTION 247 /* if encrypted, just borrow fscrypt_auth from parent */ 248 if (IS_ENCRYPTED(parent)) { 249 struct ceph_inode_info *pci = ceph_inode(parent); 250 251 ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, 252 pci->fscrypt_auth_len, 253 GFP_KERNEL); 254 if (ci->fscrypt_auth) { 255 inode->i_flags |= S_ENCRYPTED; 256 ci->fscrypt_auth_len = pci->fscrypt_auth_len; 257 } else { 258 doutc(cl, "Failed to alloc snapdir fscrypt_auth\n"); 259 ret = -ENOMEM; 260 goto err; 261 } 262 } 263 #endif 264 if (inode_state_read_once(inode) & I_NEW) { 265 inode->i_op = &ceph_snapdir_iops; 266 inode->i_fop = &ceph_snapdir_fops; 267 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 268 unlock_new_inode(inode); 269 } 270 271 return inode; 272 err: 273 if ((inode_state_read_once(inode) & I_NEW)) 274 discard_new_inode(inode); 275 else 276 iput(inode); 277 return ERR_PTR(ret); 278 } 279 280 const struct inode_operations ceph_file_iops = { 281 .permission = ceph_permission, 282 .setattr = ceph_setattr, 283 .getattr = ceph_getattr, 284 .listxattr = ceph_listxattr, 285 .get_inode_acl = ceph_get_acl, 286 .set_acl = ceph_set_acl, 287 }; 288 289 290 /* 291 * We use a 'frag tree' to keep track of the MDS's directory fragments 292 * for a given inode (usually there is just a single fragment). We 293 * need to know when a child frag is delegated to a new MDS, or when 294 * it is flagged as replicated, so we can direct our requests 295 * accordingly. 296 */ 297 298 /* 299 * find/create a frag in the tree 300 */ 301 static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, 302 u32 f) 303 { 304 struct inode *inode = &ci->netfs.inode; 305 struct ceph_client *cl = ceph_inode_to_client(inode); 306 struct rb_node **p; 307 struct rb_node *parent = NULL; 308 struct ceph_inode_frag *frag; 309 int c; 310 311 p = &ci->i_fragtree.rb_node; 312 while (*p) { 313 parent = *p; 314 frag = rb_entry(parent, struct ceph_inode_frag, node); 315 c = ceph_frag_compare(f, frag->frag); 316 if (c < 0) 317 p = &(*p)->rb_left; 318 else if (c > 0) 319 p = &(*p)->rb_right; 320 else 321 return frag; 322 } 323 324 frag = kmalloc_obj(*frag, GFP_NOFS); 325 if (!frag) 326 return ERR_PTR(-ENOMEM); 327 328 frag->frag = f; 329 frag->split_by = 0; 330 frag->mds = -1; 331 frag->ndist = 0; 332 333 rb_link_node(&frag->node, parent, p); 334 rb_insert_color(&frag->node, &ci->i_fragtree); 335 336 doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f); 337 return frag; 338 } 339 340 /* 341 * find a specific frag @f 342 */ 343 struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) 344 { 345 struct rb_node *n = ci->i_fragtree.rb_node; 346 347 while (n) { 348 struct ceph_inode_frag *frag = 349 rb_entry(n, struct ceph_inode_frag, node); 350 int c = ceph_frag_compare(f, frag->frag); 351 if (c < 0) 352 n = n->rb_left; 353 else if (c > 0) 354 n = n->rb_right; 355 else 356 return frag; 357 } 358 return NULL; 359 } 360 361 /* 362 * Choose frag containing the given value @v. If @pfrag is 363 * specified, copy the frag delegation info to the caller if 364 * it is present. 365 */ 366 static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 367 struct ceph_inode_frag *pfrag, int *found) 368 { 369 struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); 370 u32 t = ceph_frag_make(0, 0); 371 struct ceph_inode_frag *frag; 372 unsigned nway, i; 373 u32 n; 374 375 if (found) 376 *found = 0; 377 378 while (1) { 379 WARN_ON(!ceph_frag_contains_value(t, v)); 380 frag = __ceph_find_frag(ci, t); 381 if (!frag) 382 break; /* t is a leaf */ 383 if (frag->split_by == 0) { 384 if (pfrag) 385 memcpy(pfrag, frag, sizeof(*pfrag)); 386 if (found) 387 *found = 1; 388 break; 389 } 390 391 /* choose child */ 392 nway = 1 << frag->split_by; 393 doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t, 394 frag->split_by, nway); 395 for (i = 0; i < nway; i++) { 396 n = ceph_frag_make_child(t, frag->split_by, i); 397 if (ceph_frag_contains_value(n, v)) { 398 t = n; 399 break; 400 } 401 } 402 BUG_ON(i == nway); 403 } 404 doutc(cl, "frag(%x) = %x\n", v, t); 405 406 return t; 407 } 408 409 u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 410 struct ceph_inode_frag *pfrag, int *found) 411 { 412 u32 ret; 413 mutex_lock(&ci->i_fragtree_mutex); 414 ret = __ceph_choose_frag(ci, v, pfrag, found); 415 mutex_unlock(&ci->i_fragtree_mutex); 416 return ret; 417 } 418 419 /* 420 * Process dirfrag (delegation) info from the mds. Include leaf 421 * fragment in tree ONLY if ndist > 0. Otherwise, only 422 * branches/splits are included in i_fragtree) 423 */ 424 static int ceph_fill_dirfrag(struct inode *inode, 425 struct ceph_mds_reply_dirfrag *dirinfo) 426 { 427 struct ceph_inode_info *ci = ceph_inode(inode); 428 struct ceph_client *cl = ceph_inode_to_client(inode); 429 struct ceph_inode_frag *frag; 430 u32 id = le32_to_cpu(dirinfo->frag); 431 int mds = le32_to_cpu(dirinfo->auth); 432 int ndist = le32_to_cpu(dirinfo->ndist); 433 int diri_auth = -1; 434 int i; 435 int err = 0; 436 437 spin_lock(&ci->i_ceph_lock); 438 if (ci->i_auth_cap) 439 diri_auth = ci->i_auth_cap->mds; 440 spin_unlock(&ci->i_ceph_lock); 441 442 if (mds == -1) /* CDIR_AUTH_PARENT */ 443 mds = diri_auth; 444 445 mutex_lock(&ci->i_fragtree_mutex); 446 if (ndist == 0 && mds == diri_auth) { 447 /* no delegation info needed. */ 448 frag = __ceph_find_frag(ci, id); 449 if (!frag) 450 goto out; 451 if (frag->split_by == 0) { 452 /* tree leaf, remove */ 453 doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n", 454 inode, ceph_vinop(inode), id); 455 rb_erase(&frag->node, &ci->i_fragtree); 456 kfree(frag); 457 } else { 458 /* tree branch, keep and clear */ 459 doutc(cl, "cleared %p %llx.%llx frag %x referral\n", 460 inode, ceph_vinop(inode), id); 461 frag->mds = -1; 462 frag->ndist = 0; 463 } 464 goto out; 465 } 466 467 468 /* find/add this frag to store mds delegation info */ 469 frag = __get_or_create_frag(ci, id); 470 if (IS_ERR(frag)) { 471 /* this is not the end of the world; we can continue 472 with bad/inaccurate delegation info */ 473 pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n", 474 inode, ceph_vinop(inode), 475 le32_to_cpu(dirinfo->frag)); 476 err = -ENOMEM; 477 goto out; 478 } 479 480 frag->mds = mds; 481 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); 482 for (i = 0; i < frag->ndist; i++) 483 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); 484 doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode, 485 ceph_vinop(inode), frag->frag, frag->ndist); 486 487 out: 488 mutex_unlock(&ci->i_fragtree_mutex); 489 return err; 490 } 491 492 static int frag_tree_split_cmp(const void *l, const void *r) 493 { 494 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; 495 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; 496 return ceph_frag_compare(le32_to_cpu(ls->frag), 497 le32_to_cpu(rs->frag)); 498 } 499 500 static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) 501 { 502 if (!frag) 503 return f == ceph_frag_make(0, 0); 504 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) 505 return false; 506 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); 507 } 508 509 static int ceph_fill_fragtree(struct inode *inode, 510 struct ceph_frag_tree_head *fragtree, 511 struct ceph_mds_reply_dirfrag *dirinfo) 512 { 513 struct ceph_client *cl = ceph_inode_to_client(inode); 514 struct ceph_inode_info *ci = ceph_inode(inode); 515 struct ceph_inode_frag *frag, *prev_frag = NULL; 516 struct rb_node *rb_node; 517 unsigned i, split_by, nsplits; 518 u32 id; 519 bool update = false; 520 521 mutex_lock(&ci->i_fragtree_mutex); 522 nsplits = le32_to_cpu(fragtree->nsplits); 523 if (nsplits != ci->i_fragtree_nsplits) { 524 update = true; 525 } else if (nsplits) { 526 i = get_random_u32_below(nsplits); 527 id = le32_to_cpu(fragtree->splits[i].frag); 528 if (!__ceph_find_frag(ci, id)) 529 update = true; 530 } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { 531 rb_node = rb_first(&ci->i_fragtree); 532 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 533 if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) 534 update = true; 535 } 536 if (!update && dirinfo) { 537 id = le32_to_cpu(dirinfo->frag); 538 if (id != __ceph_choose_frag(ci, id, NULL, NULL)) 539 update = true; 540 } 541 if (!update) 542 goto out_unlock; 543 544 if (nsplits > 1) { 545 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), 546 frag_tree_split_cmp, NULL); 547 } 548 549 doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); 550 rb_node = rb_first(&ci->i_fragtree); 551 for (i = 0; i < nsplits; i++) { 552 id = le32_to_cpu(fragtree->splits[i].frag); 553 split_by = le32_to_cpu(fragtree->splits[i].by); 554 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { 555 pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, " 556 "frag %x split by %d\n", inode, 557 ceph_vinop(inode), i, nsplits, id, split_by); 558 continue; 559 } 560 frag = NULL; 561 while (rb_node) { 562 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 563 if (ceph_frag_compare(frag->frag, id) >= 0) { 564 if (frag->frag != id) 565 frag = NULL; 566 else 567 rb_node = rb_next(rb_node); 568 break; 569 } 570 rb_node = rb_next(rb_node); 571 /* delete stale split/leaf node */ 572 if (frag->split_by > 0 || 573 !is_frag_child(frag->frag, prev_frag)) { 574 rb_erase(&frag->node, &ci->i_fragtree); 575 if (frag->split_by > 0) 576 ci->i_fragtree_nsplits--; 577 kfree(frag); 578 } 579 frag = NULL; 580 } 581 if (!frag) { 582 frag = __get_or_create_frag(ci, id); 583 if (IS_ERR(frag)) 584 continue; 585 } 586 if (frag->split_by == 0) 587 ci->i_fragtree_nsplits++; 588 frag->split_by = split_by; 589 doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by); 590 prev_frag = frag; 591 } 592 while (rb_node) { 593 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 594 rb_node = rb_next(rb_node); 595 /* delete stale split/leaf node */ 596 if (frag->split_by > 0 || 597 !is_frag_child(frag->frag, prev_frag)) { 598 rb_erase(&frag->node, &ci->i_fragtree); 599 if (frag->split_by > 0) 600 ci->i_fragtree_nsplits--; 601 kfree(frag); 602 } 603 } 604 out_unlock: 605 mutex_unlock(&ci->i_fragtree_mutex); 606 return 0; 607 } 608 609 /* 610 * initialize a newly allocated inode. 611 */ 612 struct inode *ceph_alloc_inode(struct super_block *sb) 613 { 614 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); 615 struct ceph_inode_info *ci; 616 int i; 617 618 ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); 619 if (!ci) 620 return NULL; 621 622 doutc(fsc->client, "%p\n", &ci->netfs.inode); 623 624 /* Set parameters for the netfs library */ 625 netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); 626 627 spin_lock_init(&ci->i_ceph_lock); 628 629 ci->i_version = 0; 630 ci->i_inline_version = 0; 631 ci->i_time_warp_seq = 0; 632 ci->i_ceph_flags = 0; 633 atomic64_set(&ci->i_ordered_count, 1); 634 atomic64_set(&ci->i_release_count, 1); 635 atomic64_set(&ci->i_complete_seq[0], 0); 636 atomic64_set(&ci->i_complete_seq[1], 0); 637 ci->i_symlink = NULL; 638 639 ci->i_max_bytes = 0; 640 ci->i_max_files = 0; 641 ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; 642 643 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 644 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); 645 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); 646 647 ci->i_fragtree = RB_ROOT; 648 mutex_init(&ci->i_fragtree_mutex); 649 650 ci->i_xattrs.blob = NULL; 651 ci->i_xattrs.prealloc_blob = NULL; 652 ci->i_xattrs.dirty = false; 653 ci->i_xattrs.index = RB_ROOT; 654 ci->i_xattrs.count = 0; 655 ci->i_xattrs.names_size = 0; 656 ci->i_xattrs.vals_size = 0; 657 ci->i_xattrs.version = 0; 658 ci->i_xattrs.index_version = 0; 659 660 ci->i_caps = RB_ROOT; 661 ci->i_auth_cap = NULL; 662 ci->i_dirty_caps = 0; 663 ci->i_flushing_caps = 0; 664 INIT_LIST_HEAD(&ci->i_dirty_item); 665 INIT_LIST_HEAD(&ci->i_flushing_item); 666 ci->i_prealloc_cap_flush = NULL; 667 INIT_LIST_HEAD(&ci->i_cap_flush_list); 668 init_waitqueue_head(&ci->i_cap_wq); 669 ci->i_hold_caps_max = 0; 670 INIT_LIST_HEAD(&ci->i_cap_delay_list); 671 INIT_LIST_HEAD(&ci->i_cap_snaps); 672 ci->i_head_snapc = NULL; 673 ci->i_snap_caps = 0; 674 675 ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; 676 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) 677 ci->i_nr_by_mode[i] = 0; 678 679 mutex_init(&ci->i_truncate_mutex); 680 ci->i_truncate_seq = 0; 681 ci->i_truncate_size = 0; 682 ci->i_truncate_pending = 0; 683 ci->i_truncate_pagecache_size = 0; 684 685 ci->i_max_size = 0; 686 ci->i_reported_size = 0; 687 ci->i_wanted_max_size = 0; 688 ci->i_requested_max_size = 0; 689 690 ci->i_pin_ref = 0; 691 ci->i_rd_ref = 0; 692 ci->i_rdcache_ref = 0; 693 ci->i_wr_ref = 0; 694 ci->i_wb_ref = 0; 695 ci->i_fx_ref = 0; 696 ci->i_wrbuffer_ref = 0; 697 ci->i_wrbuffer_ref_head = 0; 698 atomic_set(&ci->i_filelock_ref, 0); 699 atomic_set(&ci->i_shared_gen, 1); 700 ci->i_rdcache_gen = 0; 701 ci->i_rdcache_revoking = 0; 702 703 INIT_LIST_HEAD(&ci->i_unsafe_dirops); 704 INIT_LIST_HEAD(&ci->i_unsafe_iops); 705 spin_lock_init(&ci->i_unsafe_lock); 706 707 ci->i_snap_realm = NULL; 708 INIT_LIST_HEAD(&ci->i_snap_realm_item); 709 INIT_LIST_HEAD(&ci->i_snap_flush_item); 710 711 INIT_WORK(&ci->i_work, ceph_inode_work); 712 ci->i_work_mask = 0; 713 memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); 714 #ifdef CONFIG_FS_ENCRYPTION 715 ci->i_crypt_info = NULL; 716 ci->fscrypt_auth = NULL; 717 ci->fscrypt_auth_len = 0; 718 #endif 719 return &ci->netfs.inode; 720 } 721 722 void ceph_free_inode(struct inode *inode) 723 { 724 struct ceph_inode_info *ci = ceph_inode(inode); 725 726 kfree(ci->i_symlink); 727 #ifdef CONFIG_FS_ENCRYPTION 728 kfree(ci->fscrypt_auth); 729 #endif 730 fscrypt_free_inode(inode); 731 kmem_cache_free(ceph_inode_cachep, ci); 732 } 733 734 void ceph_evict_inode(struct inode *inode) 735 { 736 struct ceph_inode_info *ci = ceph_inode(inode); 737 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 738 struct ceph_client *cl = ceph_inode_to_client(inode); 739 struct ceph_inode_frag *frag; 740 struct rb_node *n; 741 742 doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode)); 743 744 percpu_counter_dec(&mdsc->metric.total_inodes); 745 746 ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; 747 748 netfs_wait_for_outstanding_io(inode); 749 truncate_inode_pages_final(&inode->i_data); 750 if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) 751 ceph_fscache_unuse_cookie(inode, true); 752 clear_inode(inode); 753 754 ceph_fscache_unregister_inode_cookie(ci); 755 fscrypt_put_encryption_info(inode); 756 757 __ceph_remove_caps(ci); 758 759 if (__ceph_has_quota(ci, QUOTA_GET_ANY)) 760 ceph_adjust_quota_realms_count(inode, false); 761 762 /* 763 * we may still have a snap_realm reference if there are stray 764 * caps in i_snap_caps. 765 */ 766 if (ci->i_snap_realm) { 767 if (ceph_snap(inode) == CEPH_NOSNAP) { 768 doutc(cl, " dropping residual ref to snap realm %p\n", 769 ci->i_snap_realm); 770 ceph_change_snap_realm(inode, NULL); 771 } else { 772 ceph_put_snapid_map(mdsc, ci->i_snapid_map); 773 ci->i_snap_realm = NULL; 774 } 775 } 776 777 while ((n = rb_first(&ci->i_fragtree)) != NULL) { 778 frag = rb_entry(n, struct ceph_inode_frag, node); 779 rb_erase(n, &ci->i_fragtree); 780 kfree(frag); 781 } 782 ci->i_fragtree_nsplits = 0; 783 784 __ceph_destroy_xattrs(ci); 785 if (ci->i_xattrs.blob) 786 ceph_buffer_put(ci->i_xattrs.blob); 787 if (ci->i_xattrs.prealloc_blob) 788 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 789 790 ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); 791 ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); 792 } 793 794 static inline blkcnt_t calc_inode_blocks(u64 size) 795 { 796 return (size + (1<<9) - 1) >> 9; 797 } 798 799 /* 800 * Helpers to fill in size, ctime, mtime, and atime. We have to be 801 * careful because either the client or MDS may have more up to date 802 * info, depending on which capabilities are held, and whether 803 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime 804 * and size are monotonically increasing, except when utimes() or 805 * truncate() increments the corresponding _seq values.) 806 */ 807 int ceph_fill_file_size(struct inode *inode, int issued, 808 u32 truncate_seq, u64 truncate_size, u64 size) 809 { 810 struct ceph_client *cl = ceph_inode_to_client(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode); 812 int queue_trunc = 0; 813 loff_t isize = i_size_read(inode); 814 815 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 816 (truncate_seq == ci->i_truncate_seq && size > isize)) { 817 doutc(cl, "size %lld -> %llu\n", isize, size); 818 if (size > 0 && S_ISDIR(inode->i_mode)) { 819 pr_err_client(cl, "non-zero size for directory\n"); 820 size = 0; 821 } 822 i_size_write(inode, size); 823 inode->i_blocks = calc_inode_blocks(size); 824 /* 825 * If we're expanding, then we should be able to just update 826 * the existing cookie. 827 */ 828 if (size > isize) 829 ceph_fscache_update(inode); 830 ci->i_reported_size = size; 831 if (truncate_seq != ci->i_truncate_seq) { 832 doutc(cl, "truncate_seq %u -> %u\n", 833 ci->i_truncate_seq, truncate_seq); 834 ci->i_truncate_seq = truncate_seq; 835 836 /* the MDS should have revoked these caps */ 837 WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | 838 CEPH_CAP_FILE_LAZYIO)); 839 /* 840 * If we hold relevant caps, or in the case where we're 841 * not the only client referencing this file and we 842 * don't hold those caps, then we need to check whether 843 * the file is either opened or mmaped 844 */ 845 if ((issued & (CEPH_CAP_FILE_CACHE| 846 CEPH_CAP_FILE_BUFFER)) || 847 mapping_mapped(inode->i_mapping) || 848 __ceph_is_file_opened(ci)) { 849 ci->i_truncate_pending++; 850 queue_trunc = 1; 851 } 852 } 853 } 854 855 /* 856 * It's possible that the new sizes of the two consecutive 857 * size truncations will be in the same fscrypt last block, 858 * and we need to truncate the corresponding page caches 859 * anyway. 860 */ 861 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { 862 doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n", 863 ci->i_truncate_size, truncate_size, 864 !!IS_ENCRYPTED(inode)); 865 866 ci->i_truncate_size = truncate_size; 867 868 if (IS_ENCRYPTED(inode)) { 869 doutc(cl, "truncate_pagecache_size %lld -> %llu\n", 870 ci->i_truncate_pagecache_size, size); 871 ci->i_truncate_pagecache_size = size; 872 } else { 873 ci->i_truncate_pagecache_size = truncate_size; 874 } 875 } 876 return queue_trunc; 877 } 878 879 /* 880 * Set the subvolume ID for an inode. 881 * 882 * The subvolume_id identifies which CephFS subvolume this inode belongs to. 883 * CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset - the MDS only sends 884 * non-zero IDs for inodes within subvolumes. 885 * 886 * An inode's subvolume membership is immutable - once an inode is created 887 * in a subvolume, it stays there. Therefore, if we already have a valid 888 * (non-zero) subvolume_id and receive a different one, that indicates a bug. 889 */ 890 void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id) 891 { 892 struct ceph_inode_info *ci; 893 u64 old; 894 895 if (!inode || subvolume_id == CEPH_SUBVOLUME_ID_NONE) 896 return; 897 898 ci = ceph_inode(inode); 899 old = READ_ONCE(ci->i_subvolume_id); 900 901 if (old == subvolume_id) 902 return; 903 904 if (old != CEPH_SUBVOLUME_ID_NONE) { 905 /* subvolume_id should not change once set */ 906 WARN_ON_ONCE(1); 907 return; 908 } 909 910 WRITE_ONCE(ci->i_subvolume_id, subvolume_id); 911 } 912 913 void ceph_fill_file_time(struct inode *inode, int issued, 914 u64 time_warp_seq, struct timespec64 *ctime, 915 struct timespec64 *mtime, struct timespec64 *atime) 916 { 917 struct ceph_client *cl = ceph_inode_to_client(inode); 918 struct ceph_inode_info *ci = ceph_inode(inode); 919 struct timespec64 iatime = inode_get_atime(inode); 920 struct timespec64 ictime = inode_get_ctime(inode); 921 struct timespec64 imtime = inode_get_mtime(inode); 922 int warn = 0; 923 924 if (issued & (CEPH_CAP_FILE_EXCL| 925 CEPH_CAP_FILE_WR| 926 CEPH_CAP_FILE_BUFFER| 927 CEPH_CAP_AUTH_EXCL| 928 CEPH_CAP_XATTR_EXCL)) { 929 if (ci->i_version == 0 || 930 timespec64_compare(ctime, &ictime) > 0) { 931 doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime); 932 inode_set_ctime_to_ts(inode, *ctime); 933 } 934 if (ci->i_version == 0 || 935 ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { 936 /* the MDS did a utimes() */ 937 doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime, 938 ci->i_time_warp_seq, (int)time_warp_seq); 939 940 inode_set_mtime_to_ts(inode, *mtime); 941 inode_set_atime_to_ts(inode, *atime); 942 ci->i_time_warp_seq = time_warp_seq; 943 } else if (time_warp_seq == ci->i_time_warp_seq) { 944 /* nobody did utimes(); take the max */ 945 if (timespec64_compare(mtime, &imtime) > 0) { 946 doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime); 947 inode_set_mtime_to_ts(inode, *mtime); 948 } 949 if (timespec64_compare(atime, &iatime) > 0) { 950 doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime); 951 inode_set_atime_to_ts(inode, *atime); 952 } 953 } else if (issued & CEPH_CAP_FILE_EXCL) { 954 /* we did a utimes(); ignore mds values */ 955 } else { 956 warn = 1; 957 } 958 } else { 959 /* we have no write|excl caps; whatever the MDS says is true */ 960 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 961 inode_set_ctime_to_ts(inode, *ctime); 962 inode_set_mtime_to_ts(inode, *mtime); 963 inode_set_atime_to_ts(inode, *atime); 964 ci->i_time_warp_seq = time_warp_seq; 965 } else { 966 warn = 1; 967 } 968 } 969 if (warn) /* time_warp_seq shouldn't go backwards */ 970 doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode, 971 time_warp_seq, ci->i_time_warp_seq); 972 } 973 974 #if IS_ENABLED(CONFIG_FS_ENCRYPTION) 975 static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, 976 const char *encsym, 977 int enclen, u8 **decsym) 978 { 979 struct ceph_client *cl = mdsc->fsc->client; 980 int declen; 981 u8 *sym; 982 983 sym = kmalloc(enclen + 1, GFP_NOFS); 984 if (!sym) 985 return -ENOMEM; 986 987 declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP); 988 if (declen < 0) { 989 pr_err_client(cl, 990 "can't decode symlink (%d). Content: %.*s\n", 991 declen, enclen, encsym); 992 kfree(sym); 993 return -EIO; 994 } 995 sym[declen + 1] = '\0'; 996 *decsym = sym; 997 return declen; 998 } 999 #else 1000 static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, 1001 const char *encsym, 1002 int symlen, u8 **decsym) 1003 { 1004 return -EOPNOTSUPP; 1005 } 1006 #endif 1007 1008 /* 1009 * Populate an inode based on info from mds. May be called on new or 1010 * existing inodes. 1011 */ 1012 int ceph_fill_inode(struct inode *inode, struct page *locked_page, 1013 struct ceph_mds_reply_info_in *iinfo, 1014 struct ceph_mds_reply_dirfrag *dirinfo, 1015 struct ceph_mds_session *session, int cap_fmode, 1016 struct ceph_cap_reservation *caps_reservation) 1017 { 1018 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 1019 struct ceph_client *cl = mdsc->fsc->client; 1020 struct ceph_mds_reply_inode *info = iinfo->in; 1021 struct ceph_inode_info *ci = ceph_inode(inode); 1022 int issued, new_issued, info_caps; 1023 struct timespec64 mtime, atime, ctime; 1024 struct ceph_buffer *xattr_blob = NULL; 1025 struct ceph_buffer *old_blob = NULL; 1026 struct ceph_string *pool_ns = NULL; 1027 struct ceph_cap *new_cap = NULL; 1028 int err = 0; 1029 bool wake = false; 1030 bool queue_trunc = false; 1031 bool new_version = false; 1032 bool fill_inline = false; 1033 umode_t mode = le32_to_cpu(info->mode); 1034 dev_t rdev = le32_to_cpu(info->rdev); 1035 1036 lockdep_assert_held(&mdsc->snap_rwsem); 1037 1038 doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), 1039 le64_to_cpu(info->version), ci->i_version); 1040 1041 /* Once I_NEW is cleared, we can't change type or dev numbers */ 1042 if (inode_state_read_once(inode) & I_NEW) { 1043 inode->i_mode = mode; 1044 } else { 1045 if (inode_wrong_type(inode, mode)) { 1046 pr_warn_once_client(cl, 1047 "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", 1048 ceph_vinop(inode), inode->i_mode, mode); 1049 return -ESTALE; 1050 } 1051 1052 if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { 1053 pr_warn_once_client(cl, 1054 "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", 1055 ceph_vinop(inode), MAJOR(inode->i_rdev), 1056 MINOR(inode->i_rdev), MAJOR(rdev), 1057 MINOR(rdev)); 1058 return -ESTALE; 1059 } 1060 } 1061 1062 info_caps = le32_to_cpu(info->cap.caps); 1063 1064 /* prealloc new cap struct */ 1065 if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) { 1066 new_cap = ceph_get_cap(mdsc, caps_reservation); 1067 if (!new_cap) 1068 return -ENOMEM; 1069 } 1070 1071 /* 1072 * prealloc xattr data, if it looks like we'll need it. only 1073 * if len > 4 (meaning there are actually xattrs; the first 4 1074 * bytes are the xattr count). 1075 */ 1076 if (iinfo->xattr_len > 4) { 1077 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); 1078 if (!xattr_blob) 1079 pr_err_client(cl, "ENOMEM xattr blob %d bytes\n", 1080 iinfo->xattr_len); 1081 } 1082 1083 if (iinfo->pool_ns_len > 0) 1084 pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, 1085 iinfo->pool_ns_len); 1086 1087 if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) 1088 ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); 1089 1090 spin_lock(&ci->i_ceph_lock); 1091 1092 /* 1093 * provided version will be odd if inode value is projected, 1094 * even if stable. skip the update if we have newer stable 1095 * info (ours>=theirs, e.g. due to racing mds replies), unless 1096 * we are getting projected (unstable) info (in which case the 1097 * version is odd, and we want ours>theirs). 1098 * us them 1099 * 2 2 skip 1100 * 3 2 skip 1101 * 3 3 update 1102 */ 1103 if (ci->i_version == 0 || 1104 ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 1105 le64_to_cpu(info->version) > (ci->i_version & ~1))) 1106 new_version = true; 1107 1108 /* Update change_attribute */ 1109 inode_set_max_iversion_raw(inode, iinfo->change_attr); 1110 1111 __ceph_caps_issued(ci, &issued); 1112 issued |= __ceph_caps_dirty(ci); 1113 new_issued = ~issued & info_caps; 1114 1115 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 1116 ceph_inode_set_subvolume(inode, iinfo->subvolume_id); 1117 1118 #ifdef CONFIG_FS_ENCRYPTION 1119 if (iinfo->fscrypt_auth_len && 1120 ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { 1121 kfree(ci->fscrypt_auth); 1122 ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; 1123 ci->fscrypt_auth = iinfo->fscrypt_auth; 1124 iinfo->fscrypt_auth = NULL; 1125 iinfo->fscrypt_auth_len = 0; 1126 inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); 1127 } 1128 #endif 1129 1130 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && 1131 (issued & CEPH_CAP_AUTH_EXCL) == 0) { 1132 inode->i_mode = mode; 1133 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); 1134 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); 1135 doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, 1136 ceph_vinop(inode), inode->i_mode, 1137 from_kuid(&init_user_ns, inode->i_uid), 1138 from_kgid(&init_user_ns, inode->i_gid)); 1139 ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); 1140 ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); 1141 } 1142 1143 /* directories have fl_stripe_unit set to zero */ 1144 if (IS_ENCRYPTED(inode)) 1145 inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; 1146 else if (le32_to_cpu(info->layout.fl_stripe_unit)) 1147 inode->i_blkbits = 1148 fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 1149 else 1150 inode->i_blkbits = CEPH_BLOCK_SHIFT; 1151 1152 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && 1153 (issued & CEPH_CAP_LINK_EXCL) == 0) 1154 set_nlink(inode, le32_to_cpu(info->nlink)); 1155 1156 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { 1157 /* be careful with mtime, atime, size */ 1158 ceph_decode_timespec64(&atime, &info->atime); 1159 ceph_decode_timespec64(&mtime, &info->mtime); 1160 ceph_decode_timespec64(&ctime, &info->ctime); 1161 ceph_fill_file_time(inode, issued, 1162 le32_to_cpu(info->time_warp_seq), 1163 &ctime, &mtime, &atime); 1164 } 1165 1166 if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) { 1167 ci->i_files = le64_to_cpu(info->files); 1168 ci->i_subdirs = le64_to_cpu(info->subdirs); 1169 } 1170 1171 if (new_version || 1172 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { 1173 u64 size = le64_to_cpu(info->size); 1174 s64 old_pool = ci->i_layout.pool_id; 1175 struct ceph_string *old_ns; 1176 1177 ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); 1178 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, 1179 lockdep_is_held(&ci->i_ceph_lock)); 1180 rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns); 1181 1182 if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns) 1183 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; 1184 1185 pool_ns = old_ns; 1186 1187 if (IS_ENCRYPTED(inode) && size && 1188 iinfo->fscrypt_file_len == sizeof(__le64)) { 1189 u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); 1190 1191 if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { 1192 size = fsize; 1193 } else { 1194 pr_warn_client(cl, 1195 "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", 1196 info->size, size); 1197 } 1198 } 1199 1200 queue_trunc = ceph_fill_file_size(inode, issued, 1201 le32_to_cpu(info->truncate_seq), 1202 le64_to_cpu(info->truncate_size), 1203 size); 1204 /* only update max_size on auth cap */ 1205 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 1206 ci->i_max_size != le64_to_cpu(info->max_size)) { 1207 doutc(cl, "max_size %lld -> %llu\n", 1208 ci->i_max_size, le64_to_cpu(info->max_size)); 1209 ci->i_max_size = le64_to_cpu(info->max_size); 1210 } 1211 } 1212 1213 /* layout and rstat are not tracked by capability, update them if 1214 * the inode info is from auth mds */ 1215 if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) { 1216 if (S_ISDIR(inode->i_mode)) { 1217 ci->i_dir_layout = iinfo->dir_layout; 1218 ci->i_rbytes = le64_to_cpu(info->rbytes); 1219 ci->i_rfiles = le64_to_cpu(info->rfiles); 1220 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); 1221 ci->i_dir_pin = iinfo->dir_pin; 1222 ci->i_rsnaps = iinfo->rsnaps; 1223 ceph_decode_timespec64(&ci->i_rctime, &info->rctime); 1224 } 1225 } 1226 1227 /* xattrs */ 1228 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ 1229 if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && 1230 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { 1231 if (ci->i_xattrs.blob) 1232 old_blob = ci->i_xattrs.blob; 1233 ci->i_xattrs.blob = xattr_blob; 1234 if (xattr_blob) 1235 memcpy(ci->i_xattrs.blob->vec.iov_base, 1236 iinfo->xattr_data, iinfo->xattr_len); 1237 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 1238 ceph_forget_all_cached_acls(inode); 1239 ceph_security_invalidate_secctx(inode); 1240 xattr_blob = NULL; 1241 } 1242 1243 /* finally update i_version */ 1244 if (le64_to_cpu(info->version) > ci->i_version) 1245 ci->i_version = le64_to_cpu(info->version); 1246 1247 inode->i_mapping->a_ops = &ceph_aops; 1248 1249 switch (inode->i_mode & S_IFMT) { 1250 case S_IFIFO: 1251 case S_IFBLK: 1252 case S_IFCHR: 1253 case S_IFSOCK: 1254 inode->i_blkbits = PAGE_SHIFT; 1255 init_special_inode(inode, inode->i_mode, rdev); 1256 inode->i_op = &ceph_file_iops; 1257 break; 1258 case S_IFREG: 1259 inode->i_op = &ceph_file_iops; 1260 inode->i_fop = &ceph_file_fops; 1261 break; 1262 case S_IFLNK: 1263 if (!ci->i_symlink) { 1264 u32 symlen = iinfo->symlink_len; 1265 char *sym; 1266 1267 spin_unlock(&ci->i_ceph_lock); 1268 1269 if (IS_ENCRYPTED(inode)) { 1270 if (symlen != i_size_read(inode)) 1271 pr_err_client(cl, 1272 "%p %llx.%llx BAD symlink size %lld\n", 1273 inode, ceph_vinop(inode), 1274 i_size_read(inode)); 1275 1276 err = decode_encrypted_symlink(mdsc, iinfo->symlink, 1277 symlen, (u8 **)&sym); 1278 if (err < 0) { 1279 pr_err_client(cl, 1280 "decoding encrypted symlink failed: %d\n", 1281 err); 1282 goto out; 1283 } 1284 symlen = err; 1285 i_size_write(inode, symlen); 1286 inode->i_blocks = calc_inode_blocks(symlen); 1287 } else { 1288 if (symlen != i_size_read(inode)) { 1289 pr_err_client(cl, 1290 "%p %llx.%llx BAD symlink size %lld\n", 1291 inode, ceph_vinop(inode), 1292 i_size_read(inode)); 1293 i_size_write(inode, symlen); 1294 inode->i_blocks = calc_inode_blocks(symlen); 1295 } 1296 1297 err = -ENOMEM; 1298 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 1299 if (!sym) 1300 goto out; 1301 } 1302 1303 spin_lock(&ci->i_ceph_lock); 1304 if (!ci->i_symlink) 1305 ci->i_symlink = sym; 1306 else 1307 kfree(sym); /* lost a race */ 1308 } 1309 1310 if (IS_ENCRYPTED(inode)) { 1311 /* 1312 * Encrypted symlinks need to be decrypted before we can 1313 * cache their targets in i_link. Don't touch it here. 1314 */ 1315 inode->i_op = &ceph_encrypted_symlink_iops; 1316 } else { 1317 inode->i_link = ci->i_symlink; 1318 inode->i_op = &ceph_symlink_iops; 1319 } 1320 break; 1321 case S_IFDIR: 1322 inode->i_op = &ceph_dir_iops; 1323 inode->i_fop = &ceph_dir_fops; 1324 break; 1325 default: 1326 pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode, 1327 ceph_vinop(inode), inode->i_mode); 1328 } 1329 1330 /* were we issued a capability? */ 1331 if (info_caps) { 1332 if (ceph_snap(inode) == CEPH_NOSNAP) { 1333 ceph_add_cap(inode, session, 1334 le64_to_cpu(info->cap.cap_id), 1335 info_caps, 1336 le32_to_cpu(info->cap.wanted), 1337 le32_to_cpu(info->cap.seq), 1338 le32_to_cpu(info->cap.mseq), 1339 le64_to_cpu(info->cap.realm), 1340 info->cap.flags, &new_cap); 1341 1342 /* set dir completion flag? */ 1343 if (S_ISDIR(inode->i_mode) && 1344 ci->i_files == 0 && ci->i_subdirs == 0 && 1345 (info_caps & CEPH_CAP_FILE_SHARED) && 1346 (issued & CEPH_CAP_FILE_EXCL) == 0 && 1347 !__ceph_dir_is_complete(ci)) { 1348 doutc(cl, " marking %p complete (empty)\n", 1349 inode); 1350 i_size_write(inode, 0); 1351 __ceph_dir_set_complete(ci, 1352 atomic64_read(&ci->i_release_count), 1353 atomic64_read(&ci->i_ordered_count)); 1354 } 1355 1356 wake = true; 1357 } else { 1358 doutc(cl, " %p got snap_caps %s\n", inode, 1359 ceph_cap_string(info_caps)); 1360 ci->i_snap_caps |= info_caps; 1361 } 1362 } 1363 1364 if (iinfo->inline_version > 0 && 1365 iinfo->inline_version >= ci->i_inline_version) { 1366 int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1367 ci->i_inline_version = iinfo->inline_version; 1368 if (ceph_has_inline_data(ci) && 1369 (locked_page || (info_caps & cache_caps))) 1370 fill_inline = true; 1371 } 1372 1373 if (cap_fmode >= 0) { 1374 if (!info_caps) 1375 pr_warn_client(cl, "mds issued no caps on %llx.%llx\n", 1376 ceph_vinop(inode)); 1377 __ceph_touch_fmode(ci, mdsc, cap_fmode); 1378 } 1379 1380 spin_unlock(&ci->i_ceph_lock); 1381 1382 ceph_fscache_register_inode_cookie(inode); 1383 1384 if (fill_inline) 1385 ceph_fill_inline_data(inode, locked_page, 1386 iinfo->inline_data, iinfo->inline_len); 1387 1388 if (wake) 1389 wake_up_all(&ci->i_cap_wq); 1390 1391 /* queue truncate if we saw i_size decrease */ 1392 if (queue_trunc) 1393 ceph_queue_vmtruncate(inode); 1394 1395 /* populate frag tree */ 1396 if (S_ISDIR(inode->i_mode)) 1397 ceph_fill_fragtree(inode, &info->fragtree, dirinfo); 1398 1399 /* update delegation info? */ 1400 if (dirinfo) 1401 ceph_fill_dirfrag(inode, dirinfo); 1402 1403 err = 0; 1404 out: 1405 if (new_cap) 1406 ceph_put_cap(mdsc, new_cap); 1407 ceph_buffer_put(old_blob); 1408 ceph_buffer_put(xattr_blob); 1409 ceph_put_string(pool_ns); 1410 return err; 1411 } 1412 1413 /* 1414 * caller should hold session s_mutex and dentry->d_lock. 1415 */ 1416 static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, 1417 struct ceph_mds_reply_lease *lease, 1418 struct ceph_mds_session *session, 1419 unsigned long from_time, 1420 struct ceph_mds_session **old_lease_session) 1421 { 1422 struct ceph_client *cl = ceph_inode_to_client(dir); 1423 struct ceph_dentry_info *di = ceph_dentry(dentry); 1424 unsigned mask = le16_to_cpu(lease->mask); 1425 long unsigned duration = le32_to_cpu(lease->duration_ms); 1426 long unsigned ttl = from_time + (duration * HZ) / 1000; 1427 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; 1428 1429 doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl); 1430 1431 /* only track leases on regular dentries */ 1432 if (ceph_snap(dir) != CEPH_NOSNAP) 1433 return; 1434 1435 if (mask & CEPH_LEASE_PRIMARY_LINK) 1436 di->flags |= CEPH_DENTRY_PRIMARY_LINK; 1437 else 1438 di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; 1439 1440 di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); 1441 if (!(mask & CEPH_LEASE_VALID)) { 1442 __ceph_dentry_dir_lease_touch(di); 1443 return; 1444 } 1445 1446 if (di->lease_gen == atomic_read(&session->s_cap_gen) && 1447 time_before(ttl, di->time)) 1448 return; /* we already have a newer lease. */ 1449 1450 if (di->lease_session && di->lease_session != session) { 1451 *old_lease_session = di->lease_session; 1452 di->lease_session = NULL; 1453 } 1454 1455 if (!di->lease_session) 1456 di->lease_session = ceph_get_mds_session(session); 1457 di->lease_gen = atomic_read(&session->s_cap_gen); 1458 di->lease_seq = le32_to_cpu(lease->seq); 1459 di->lease_renew_after = half_ttl; 1460 di->lease_renew_from = 0; 1461 di->time = ttl; 1462 1463 __ceph_dentry_lease_touch(di); 1464 } 1465 1466 static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, 1467 struct ceph_mds_reply_lease *lease, 1468 struct ceph_mds_session *session, 1469 unsigned long from_time) 1470 { 1471 struct ceph_mds_session *old_lease_session = NULL; 1472 spin_lock(&dentry->d_lock); 1473 __update_dentry_lease(dir, dentry, lease, session, from_time, 1474 &old_lease_session); 1475 spin_unlock(&dentry->d_lock); 1476 ceph_put_mds_session(old_lease_session); 1477 } 1478 1479 /* 1480 * update dentry lease without having parent inode locked 1481 */ 1482 static void update_dentry_lease_careful(struct dentry *dentry, 1483 struct ceph_mds_reply_lease *lease, 1484 struct ceph_mds_session *session, 1485 unsigned long from_time, 1486 char *dname, u32 dname_len, 1487 struct ceph_vino *pdvino, 1488 struct ceph_vino *ptvino) 1489 1490 { 1491 struct inode *dir; 1492 struct ceph_mds_session *old_lease_session = NULL; 1493 1494 spin_lock(&dentry->d_lock); 1495 /* make sure dentry's name matches target */ 1496 if (dentry->d_name.len != dname_len || 1497 memcmp(dentry->d_name.name, dname, dname_len)) 1498 goto out_unlock; 1499 1500 dir = d_inode(dentry->d_parent); 1501 /* make sure parent matches dvino */ 1502 if (!ceph_ino_compare(dir, pdvino)) 1503 goto out_unlock; 1504 1505 /* make sure dentry's inode matches target. NULL ptvino means that 1506 * we expect a negative dentry */ 1507 if (ptvino) { 1508 if (d_really_is_negative(dentry)) 1509 goto out_unlock; 1510 if (!ceph_ino_compare(d_inode(dentry), ptvino)) 1511 goto out_unlock; 1512 } else { 1513 if (d_really_is_positive(dentry)) 1514 goto out_unlock; 1515 } 1516 1517 __update_dentry_lease(dir, dentry, lease, session, 1518 from_time, &old_lease_session); 1519 out_unlock: 1520 spin_unlock(&dentry->d_lock); 1521 ceph_put_mds_session(old_lease_session); 1522 } 1523 1524 /* 1525 * splice a dentry to an inode. 1526 * caller must hold directory i_rwsem for this to be safe. 1527 */ 1528 static int splice_dentry(struct dentry **pdn, struct inode *in) 1529 { 1530 struct ceph_client *cl = ceph_inode_to_client(in); 1531 struct dentry *dn = *pdn; 1532 struct dentry *realdn; 1533 1534 BUG_ON(d_inode(dn)); 1535 1536 if (S_ISDIR(in->i_mode)) { 1537 /* If inode is directory, d_splice_alias() below will remove 1538 * 'realdn' from its origin parent. We need to ensure that 1539 * origin parent's readdir cache will not reference 'realdn' 1540 */ 1541 realdn = d_find_any_alias(in); 1542 if (realdn) { 1543 struct ceph_dentry_info *di = ceph_dentry(realdn); 1544 spin_lock(&realdn->d_lock); 1545 1546 realdn->d_op->d_prune(realdn); 1547 1548 di->time = jiffies; 1549 di->lease_shared_gen = 0; 1550 di->offset = 0; 1551 1552 spin_unlock(&realdn->d_lock); 1553 dput(realdn); 1554 } 1555 } 1556 1557 /* dn must be unhashed */ 1558 if (!d_unhashed(dn)) 1559 d_drop(dn); 1560 realdn = d_splice_alias(in, dn); 1561 if (IS_ERR(realdn)) { 1562 pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n", 1563 PTR_ERR(realdn), dn, in, ceph_vinop(in)); 1564 return PTR_ERR(realdn); 1565 } 1566 1567 if (realdn) { 1568 doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n", 1569 dn, d_count(dn), realdn, d_count(realdn), 1570 d_inode(realdn), ceph_vinop(d_inode(realdn))); 1571 dput(dn); 1572 *pdn = realdn; 1573 } else { 1574 BUG_ON(!ceph_dentry(dn)); 1575 doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn, 1576 d_inode(dn), ceph_vinop(d_inode(dn))); 1577 } 1578 return 0; 1579 } 1580 1581 /* 1582 * Incorporate results into the local cache. This is either just 1583 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 1584 * after a lookup). 1585 * 1586 * A reply may contain 1587 * a directory inode along with a dentry. 1588 * and/or a target inode 1589 * 1590 * Called with snap_rwsem (read). 1591 */ 1592 int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) 1593 { 1594 struct ceph_mds_session *session = req->r_session; 1595 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1596 struct inode *in = NULL; 1597 struct ceph_vino tvino, dvino; 1598 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); 1599 struct ceph_client *cl = fsc->client; 1600 struct inode *parent_dir = NULL; 1601 int err = 0; 1602 1603 doutc(cl, "%p is_dentry %d is_target %d\n", req, 1604 rinfo->head->is_dentry, rinfo->head->is_target); 1605 1606 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 1607 doutc(cl, "reply is empty!\n"); 1608 if (rinfo->head->result == 0 && req->r_parent) 1609 ceph_invalidate_dir_request(req); 1610 return 0; 1611 } 1612 1613 if (rinfo->head->is_dentry) { 1614 /* 1615 * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, 1616 * so we need to get the correct inode 1617 */ 1618 parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); 1619 if (unlikely(IS_ERR(parent_dir))) { 1620 err = PTR_ERR(parent_dir); 1621 goto done; 1622 } 1623 if (parent_dir) { 1624 ceph_inode_set_subvolume(parent_dir, 1625 rinfo->diri.subvolume_id); 1626 err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, 1627 rinfo->dirfrag, session, -1, 1628 &req->r_caps_reservation); 1629 if (err < 0) 1630 goto done; 1631 } else { 1632 WARN_ON_ONCE(1); 1633 } 1634 1635 if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && 1636 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1637 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { 1638 bool is_nokey = false; 1639 struct qstr dname; 1640 struct dentry *dn, *parent; 1641 struct fscrypt_str oname = FSTR_INIT(NULL, 0); 1642 struct ceph_fname fname = { .dir = parent_dir, 1643 .name = rinfo->dname, 1644 .ctext = rinfo->altname, 1645 .name_len = rinfo->dname_len, 1646 .ctext_len = rinfo->altname_len }; 1647 1648 BUG_ON(!rinfo->head->is_target); 1649 BUG_ON(req->r_dentry); 1650 1651 parent = d_find_any_alias(parent_dir); 1652 BUG_ON(!parent); 1653 1654 err = ceph_fname_alloc_buffer(parent_dir, &oname); 1655 if (err < 0) { 1656 dput(parent); 1657 goto done; 1658 } 1659 1660 err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); 1661 if (err < 0) { 1662 dput(parent); 1663 ceph_fname_free_buffer(parent_dir, &oname); 1664 goto done; 1665 } 1666 dname.name = oname.name; 1667 dname.len = oname.len; 1668 dname.hash = full_name_hash(parent, dname.name, dname.len); 1669 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); 1670 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1671 retry_lookup: 1672 dn = d_lookup(parent, &dname); 1673 doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", 1674 parent, dname.len, dname.name, dn); 1675 1676 if (!dn) { 1677 dn = d_alloc(parent, &dname); 1678 doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, 1679 dname.len, dname.name, dn); 1680 if (!dn) { 1681 dput(parent); 1682 ceph_fname_free_buffer(parent_dir, &oname); 1683 err = -ENOMEM; 1684 goto done; 1685 } 1686 if (is_nokey) { 1687 spin_lock(&dn->d_lock); 1688 dn->d_flags |= DCACHE_NOKEY_NAME; 1689 spin_unlock(&dn->d_lock); 1690 } 1691 err = 0; 1692 } else if (d_really_is_positive(dn) && 1693 (ceph_ino(d_inode(dn)) != tvino.ino || 1694 ceph_snap(d_inode(dn)) != tvino.snap)) { 1695 doutc(cl, " dn %p points to wrong inode %p\n", 1696 dn, d_inode(dn)); 1697 ceph_dir_clear_ordered(parent_dir); 1698 d_delete(dn); 1699 dput(dn); 1700 goto retry_lookup; 1701 } 1702 ceph_fname_free_buffer(parent_dir, &oname); 1703 1704 req->r_dentry = dn; 1705 dput(parent); 1706 } 1707 } 1708 1709 if (rinfo->head->is_target) { 1710 /* Should be filled in by handle_reply */ 1711 BUG_ON(!req->r_target_inode); 1712 1713 in = req->r_target_inode; 1714 ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id); 1715 err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, 1716 NULL, session, 1717 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && 1718 !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && 1719 rinfo->head->result == 0) ? req->r_fmode : -1, 1720 &req->r_caps_reservation); 1721 if (err < 0) { 1722 pr_err_client(cl, "badness %p %llx.%llx\n", in, 1723 ceph_vinop(in)); 1724 req->r_target_inode = NULL; 1725 if (inode_state_read_once(in) & I_NEW) 1726 discard_new_inode(in); 1727 else 1728 iput(in); 1729 goto done; 1730 } 1731 if (inode_state_read_once(in) & I_NEW) 1732 unlock_new_inode(in); 1733 } 1734 1735 /* 1736 * ignore null lease/binding on snapdir ENOENT, or else we 1737 * will have trouble splicing in the virtual snapdir later 1738 */ 1739 if (rinfo->head->is_dentry && 1740 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && 1741 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1742 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 1743 fsc->mount_options->snapdir_name, 1744 req->r_dentry->d_name.len))) { 1745 /* 1746 * lookup link rename : null -> possibly existing inode 1747 * mknod symlink mkdir : null -> new inode 1748 * unlink : linked -> null 1749 */ 1750 struct inode *dir = req->r_parent; 1751 struct dentry *dn = req->r_dentry; 1752 bool have_dir_cap, have_lease; 1753 1754 BUG_ON(!dn); 1755 BUG_ON(!dir); 1756 BUG_ON(d_inode(dn->d_parent) != dir); 1757 1758 dvino.ino = le64_to_cpu(rinfo->diri.in->ino); 1759 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); 1760 1761 BUG_ON(ceph_ino(dir) != dvino.ino); 1762 BUG_ON(ceph_snap(dir) != dvino.snap); 1763 1764 /* do we have a lease on the whole dir? */ 1765 have_dir_cap = 1766 (le32_to_cpu(rinfo->diri.in->cap.caps) & 1767 CEPH_CAP_FILE_SHARED); 1768 1769 /* do we have a dn lease? */ 1770 have_lease = have_dir_cap || 1771 le32_to_cpu(rinfo->dlease->duration_ms); 1772 if (!have_lease) 1773 doutc(cl, "no dentry lease or dir cap\n"); 1774 1775 /* rename? */ 1776 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { 1777 struct inode *olddir = req->r_old_dentry_dir; 1778 BUG_ON(!olddir); 1779 1780 doutc(cl, " src %p '%pd' dst %p '%pd'\n", 1781 req->r_old_dentry, req->r_old_dentry, dn, dn); 1782 doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn); 1783 1784 /* d_move screws up sibling dentries' offsets */ 1785 ceph_dir_clear_ordered(dir); 1786 ceph_dir_clear_ordered(olddir); 1787 1788 d_move(req->r_old_dentry, dn); 1789 doutc(cl, " src %p '%pd' dst %p '%pd'\n", 1790 req->r_old_dentry, req->r_old_dentry, dn, dn); 1791 1792 /* ensure target dentry is invalidated, despite 1793 rehashing bug in vfs_rename_dir */ 1794 ceph_invalidate_dentry_lease(dn); 1795 1796 doutc(cl, "dn %p gets new offset %lld\n", 1797 req->r_old_dentry, 1798 ceph_dentry(req->r_old_dentry)->offset); 1799 1800 /* swap r_dentry and r_old_dentry in case that 1801 * splice_dentry() gets called later. This is safe 1802 * because no other place will use them */ 1803 req->r_dentry = req->r_old_dentry; 1804 req->r_old_dentry = dn; 1805 dn = req->r_dentry; 1806 } 1807 1808 /* null dentry? */ 1809 if (!rinfo->head->is_target) { 1810 doutc(cl, "null dentry\n"); 1811 if (d_really_is_positive(dn)) { 1812 doutc(cl, "d_delete %p\n", dn); 1813 ceph_dir_clear_ordered(dir); 1814 d_delete(dn); 1815 } else if (have_lease) { 1816 if (d_unhashed(dn)) 1817 d_add(dn, NULL); 1818 } 1819 1820 if (!d_unhashed(dn) && have_lease) 1821 update_dentry_lease(dir, dn, 1822 rinfo->dlease, session, 1823 req->r_request_started); 1824 goto done; 1825 } 1826 1827 if (unlikely(!in)) { 1828 err = -EINVAL; 1829 goto done; 1830 } 1831 1832 /* attach proper inode */ 1833 if (d_really_is_negative(dn)) { 1834 ceph_dir_clear_ordered(dir); 1835 ihold(in); 1836 err = splice_dentry(&req->r_dentry, in); 1837 if (err < 0) 1838 goto done; 1839 dn = req->r_dentry; /* may have spliced */ 1840 } else if (d_really_is_positive(dn) && d_inode(dn) != in) { 1841 doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n", 1842 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1843 ceph_vinop(in)); 1844 d_invalidate(dn); 1845 have_lease = false; 1846 } 1847 1848 if (have_lease) { 1849 update_dentry_lease(dir, dn, 1850 rinfo->dlease, session, 1851 req->r_request_started); 1852 } 1853 doutc(cl, " final dn %p\n", dn); 1854 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || 1855 req->r_op == CEPH_MDS_OP_MKSNAP) && 1856 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1857 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { 1858 struct inode *dir = req->r_parent; 1859 1860 /* fill out a snapdir LOOKUPSNAP dentry */ 1861 BUG_ON(!dir); 1862 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); 1863 BUG_ON(!req->r_dentry); 1864 doutc(cl, " linking snapped dir %p to dn %p\n", in, 1865 req->r_dentry); 1866 ceph_dir_clear_ordered(dir); 1867 1868 if (unlikely(!in)) { 1869 err = -EINVAL; 1870 goto done; 1871 } 1872 1873 ihold(in); 1874 err = splice_dentry(&req->r_dentry, in); 1875 if (err < 0) 1876 goto done; 1877 } else if (rinfo->head->is_dentry && req->r_dentry) { 1878 /* parent inode is not locked, be careful */ 1879 struct ceph_vino *ptvino = NULL; 1880 dvino.ino = le64_to_cpu(rinfo->diri.in->ino); 1881 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); 1882 if (rinfo->head->is_target) { 1883 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); 1884 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1885 ptvino = &tvino; 1886 } 1887 update_dentry_lease_careful(req->r_dentry, rinfo->dlease, 1888 session, req->r_request_started, 1889 rinfo->dname, rinfo->dname_len, 1890 &dvino, ptvino); 1891 } 1892 done: 1893 /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ 1894 if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) 1895 iput(parent_dir); 1896 doutc(cl, "done err=%d\n", err); 1897 return err; 1898 } 1899 1900 /* 1901 * Prepopulate our cache with readdir results, leases, etc. 1902 */ 1903 static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, 1904 struct ceph_mds_session *session) 1905 { 1906 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1907 struct ceph_client *cl = session->s_mdsc->fsc->client; 1908 int i, err = 0; 1909 1910 for (i = 0; i < rinfo->dir_nr; i++) { 1911 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; 1912 struct ceph_vino vino; 1913 struct inode *in; 1914 int rc; 1915 1916 vino.ino = le64_to_cpu(rde->inode.in->ino); 1917 vino.snap = le64_to_cpu(rde->inode.in->snapid); 1918 1919 in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); 1920 if (IS_ERR(in)) { 1921 err = PTR_ERR(in); 1922 doutc(cl, "badness got %d\n", err); 1923 continue; 1924 } 1925 rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, 1926 -1, &req->r_caps_reservation); 1927 if (rc < 0) { 1928 pr_err_client(cl, "inode badness on %p got %d\n", in, 1929 rc); 1930 err = rc; 1931 if (inode_state_read_once(in) & I_NEW) { 1932 ihold(in); 1933 discard_new_inode(in); 1934 } 1935 } else if (inode_state_read_once(in) & I_NEW) { 1936 unlock_new_inode(in); 1937 } 1938 1939 iput(in); 1940 } 1941 1942 return err; 1943 } 1944 1945 void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) 1946 { 1947 if (ctl->folio) { 1948 folio_release_kmap(ctl->folio, ctl->dentries); 1949 ctl->folio = NULL; 1950 } 1951 } 1952 1953 static int fill_readdir_cache(struct inode *dir, struct dentry *dn, 1954 struct ceph_readdir_cache_control *ctl, 1955 struct ceph_mds_request *req) 1956 { 1957 struct ceph_client *cl = ceph_inode_to_client(dir); 1958 struct ceph_inode_info *ci = ceph_inode(dir); 1959 unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); 1960 unsigned idx = ctl->index % nsize; 1961 pgoff_t pgoff = ctl->index / nsize; 1962 1963 if (!ctl->folio || pgoff != ctl->folio->index) { 1964 ceph_readdir_cache_release(ctl); 1965 fgf_t fgf = FGP_LOCK; 1966 1967 if (idx == 0) 1968 fgf |= FGP_ACCESSED | FGP_CREAT; 1969 1970 ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, 1971 fgf, mapping_gfp_mask(&dir->i_data)); 1972 if (IS_ERR(ctl->folio)) { 1973 int err = PTR_ERR(ctl->folio); 1974 1975 ctl->folio = NULL; 1976 ctl->index = -1; 1977 return idx == 0 ? err : 0; 1978 } 1979 /* reading/filling the cache are serialized by 1980 * i_rwsem, no need to use folio lock */ 1981 folio_unlock(ctl->folio); 1982 ctl->dentries = kmap_local_folio(ctl->folio, 0); 1983 if (idx == 0) 1984 memset(ctl->dentries, 0, PAGE_SIZE); 1985 } 1986 1987 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && 1988 req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { 1989 doutc(cl, "dn %p idx %d\n", dn, ctl->index); 1990 ctl->dentries[idx] = dn; 1991 ctl->index++; 1992 } else { 1993 doutc(cl, "disable readdir cache\n"); 1994 ctl->index = -1; 1995 } 1996 return 0; 1997 } 1998 1999 int ceph_readdir_prepopulate(struct ceph_mds_request *req, 2000 struct ceph_mds_session *session) 2001 { 2002 struct dentry *parent = req->r_dentry; 2003 struct inode *inode = d_inode(parent); 2004 struct ceph_inode_info *ci = ceph_inode(inode); 2005 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 2006 struct ceph_client *cl = session->s_mdsc->fsc->client; 2007 struct qstr dname; 2008 struct dentry *dn; 2009 struct inode *in; 2010 int err = 0, skipped = 0, ret, i; 2011 u32 frag = le32_to_cpu(req->r_args.readdir.frag); 2012 u32 last_hash = 0; 2013 u32 fpos_offset; 2014 struct ceph_readdir_cache_control cache_ctl = {}; 2015 2016 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) 2017 return readdir_prepopulate_inodes_only(req, session); 2018 2019 if (rinfo->hash_order) { 2020 if (req->r_path2) { 2021 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, 2022 req->r_path2, 2023 strlen(req->r_path2)); 2024 last_hash = ceph_frag_value(last_hash); 2025 } else if (rinfo->offset_hash) { 2026 /* mds understands offset_hash */ 2027 WARN_ON_ONCE(req->r_readdir_offset != 2); 2028 last_hash = le32_to_cpu(req->r_args.readdir.offset_hash); 2029 } 2030 } 2031 2032 if (rinfo->dir_dir && 2033 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 2034 doutc(cl, "got new frag %x -> %x\n", frag, 2035 le32_to_cpu(rinfo->dir_dir->frag)); 2036 frag = le32_to_cpu(rinfo->dir_dir->frag); 2037 if (!rinfo->hash_order) 2038 req->r_readdir_offset = 2; 2039 } 2040 2041 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 2042 doutc(cl, "%d items under SNAPDIR dn %p\n", 2043 rinfo->dir_nr, parent); 2044 } else { 2045 doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent); 2046 if (rinfo->dir_dir) 2047 ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); 2048 2049 if (ceph_frag_is_leftmost(frag) && 2050 req->r_readdir_offset == 2 && 2051 !(rinfo->hash_order && last_hash)) { 2052 /* note dir version at start of readdir so we can 2053 * tell if any dentries get dropped */ 2054 req->r_dir_release_cnt = 2055 atomic64_read(&ci->i_release_count); 2056 req->r_dir_ordered_cnt = 2057 atomic64_read(&ci->i_ordered_count); 2058 req->r_readdir_cache_idx = 0; 2059 } 2060 } 2061 2062 cache_ctl.index = req->r_readdir_cache_idx; 2063 fpos_offset = req->r_readdir_offset; 2064 2065 /* FIXME: release caps/leases if error occurs */ 2066 for (i = 0; i < rinfo->dir_nr; i++) { 2067 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; 2068 struct ceph_vino tvino; 2069 2070 dname.name = rde->name; 2071 dname.len = rde->name_len; 2072 dname.hash = full_name_hash(parent, dname.name, dname.len); 2073 2074 tvino.ino = le64_to_cpu(rde->inode.in->ino); 2075 tvino.snap = le64_to_cpu(rde->inode.in->snapid); 2076 2077 if (rinfo->hash_order) { 2078 u32 hash = ceph_frag_value(rde->raw_hash); 2079 if (hash != last_hash) 2080 fpos_offset = 2; 2081 last_hash = hash; 2082 rde->offset = ceph_make_fpos(hash, fpos_offset++, true); 2083 } else { 2084 rde->offset = ceph_make_fpos(frag, fpos_offset++, false); 2085 } 2086 2087 retry_lookup: 2088 dn = d_lookup(parent, &dname); 2089 doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", 2090 parent, dname.len, dname.name, dn); 2091 2092 if (!dn) { 2093 dn = d_alloc(parent, &dname); 2094 doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, 2095 dname.len, dname.name, dn); 2096 if (!dn) { 2097 doutc(cl, "d_alloc badness\n"); 2098 err = -ENOMEM; 2099 goto out; 2100 } 2101 if (rde->is_nokey) { 2102 spin_lock(&dn->d_lock); 2103 dn->d_flags |= DCACHE_NOKEY_NAME; 2104 spin_unlock(&dn->d_lock); 2105 } 2106 } else if (d_really_is_positive(dn) && 2107 (ceph_ino(d_inode(dn)) != tvino.ino || 2108 ceph_snap(d_inode(dn)) != tvino.snap)) { 2109 struct ceph_dentry_info *di = ceph_dentry(dn); 2110 doutc(cl, " dn %p points to wrong inode %p\n", 2111 dn, d_inode(dn)); 2112 2113 spin_lock(&dn->d_lock); 2114 if (di->offset > 0 && 2115 di->lease_shared_gen == 2116 atomic_read(&ci->i_shared_gen)) { 2117 __ceph_dir_clear_ordered(ci); 2118 di->offset = 0; 2119 } 2120 spin_unlock(&dn->d_lock); 2121 2122 d_delete(dn); 2123 dput(dn); 2124 goto retry_lookup; 2125 } 2126 2127 /* inode */ 2128 if (d_really_is_positive(dn)) { 2129 in = d_inode(dn); 2130 } else { 2131 in = ceph_get_inode(parent->d_sb, tvino, NULL); 2132 if (IS_ERR(in)) { 2133 doutc(cl, "new_inode badness\n"); 2134 d_drop(dn); 2135 dput(dn); 2136 err = PTR_ERR(in); 2137 goto out; 2138 } 2139 } 2140 2141 ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, 2142 -1, &req->r_caps_reservation); 2143 if (ret < 0) { 2144 pr_err_client(cl, "badness on %p %llx.%llx\n", in, 2145 ceph_vinop(in)); 2146 if (d_really_is_negative(dn)) { 2147 if (inode_state_read_once(in) & I_NEW) { 2148 ihold(in); 2149 discard_new_inode(in); 2150 } 2151 iput(in); 2152 } 2153 d_drop(dn); 2154 err = ret; 2155 goto next_item; 2156 } 2157 if (inode_state_read_once(in) & I_NEW) 2158 unlock_new_inode(in); 2159 2160 if (d_really_is_negative(dn)) { 2161 if (ceph_security_xattr_deadlock(in)) { 2162 doutc(cl, " skip splicing dn %p to inode %p" 2163 " (security xattr deadlock)\n", dn, in); 2164 iput(in); 2165 skipped++; 2166 goto next_item; 2167 } 2168 2169 err = splice_dentry(&dn, in); 2170 if (err < 0) 2171 goto next_item; 2172 } 2173 2174 ceph_dentry(dn)->offset = rde->offset; 2175 2176 update_dentry_lease(d_inode(parent), dn, 2177 rde->lease, req->r_session, 2178 req->r_request_started); 2179 2180 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 2181 ret = fill_readdir_cache(d_inode(parent), dn, 2182 &cache_ctl, req); 2183 if (ret < 0) 2184 err = ret; 2185 } 2186 next_item: 2187 dput(dn); 2188 } 2189 out: 2190 if (err == 0 && skipped == 0) { 2191 set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); 2192 req->r_readdir_cache_idx = cache_ctl.index; 2193 } 2194 ceph_readdir_cache_release(&cache_ctl); 2195 doutc(cl, "done\n"); 2196 return err; 2197 } 2198 2199 bool ceph_inode_set_size(struct inode *inode, loff_t size) 2200 { 2201 struct ceph_client *cl = ceph_inode_to_client(inode); 2202 struct ceph_inode_info *ci = ceph_inode(inode); 2203 bool ret; 2204 2205 spin_lock(&ci->i_ceph_lock); 2206 doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); 2207 i_size_write(inode, size); 2208 ceph_fscache_update(inode); 2209 inode->i_blocks = calc_inode_blocks(size); 2210 2211 ret = __ceph_should_report_size(ci); 2212 2213 spin_unlock(&ci->i_ceph_lock); 2214 2215 return ret; 2216 } 2217 2218 void ceph_queue_inode_work(struct inode *inode, int work_bit) 2219 { 2220 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 2221 struct ceph_client *cl = fsc->client; 2222 struct ceph_inode_info *ci = ceph_inode(inode); 2223 set_bit(work_bit, &ci->i_work_mask); 2224 2225 ihold(inode); 2226 if (queue_work(fsc->inode_wq, &ci->i_work)) { 2227 doutc(cl, "%p %llx.%llx mask=%lx\n", inode, 2228 ceph_vinop(inode), ci->i_work_mask); 2229 } else { 2230 doutc(cl, "%p %llx.%llx already queued, mask=%lx\n", 2231 inode, ceph_vinop(inode), ci->i_work_mask); 2232 iput(inode); 2233 } 2234 } 2235 2236 static void ceph_do_invalidate_pages(struct inode *inode) 2237 { 2238 struct ceph_client *cl = ceph_inode_to_client(inode); 2239 struct ceph_inode_info *ci = ceph_inode(inode); 2240 u32 orig_gen; 2241 int check = 0; 2242 2243 ceph_fscache_invalidate(inode, false); 2244 2245 mutex_lock(&ci->i_truncate_mutex); 2246 2247 if (ceph_inode_is_shutdown(inode)) { 2248 pr_warn_ratelimited_client(cl, 2249 "%p %llx.%llx is shut down\n", inode, 2250 ceph_vinop(inode)); 2251 mapping_set_error(inode->i_mapping, -EIO); 2252 truncate_pagecache(inode, 0); 2253 mutex_unlock(&ci->i_truncate_mutex); 2254 goto out; 2255 } 2256 2257 spin_lock(&ci->i_ceph_lock); 2258 doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode, 2259 ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking); 2260 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 2261 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 2262 check = 1; 2263 spin_unlock(&ci->i_ceph_lock); 2264 mutex_unlock(&ci->i_truncate_mutex); 2265 goto out; 2266 } 2267 orig_gen = ci->i_rdcache_gen; 2268 spin_unlock(&ci->i_ceph_lock); 2269 2270 if (invalidate_inode_pages2(inode->i_mapping) < 0) { 2271 pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n", 2272 ceph_vinop(inode)); 2273 } 2274 2275 spin_lock(&ci->i_ceph_lock); 2276 if (orig_gen == ci->i_rdcache_gen && 2277 orig_gen == ci->i_rdcache_revoking) { 2278 doutc(cl, "%p %llx.%llx gen %d successful\n", inode, 2279 ceph_vinop(inode), ci->i_rdcache_gen); 2280 ci->i_rdcache_revoking--; 2281 check = 1; 2282 } else { 2283 doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n", 2284 inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen, 2285 ci->i_rdcache_revoking); 2286 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 2287 check = 1; 2288 } 2289 spin_unlock(&ci->i_ceph_lock); 2290 mutex_unlock(&ci->i_truncate_mutex); 2291 out: 2292 if (check) 2293 ceph_check_caps(ci, 0); 2294 } 2295 2296 /* 2297 * Make sure any pending truncation is applied before doing anything 2298 * that may depend on it. 2299 */ 2300 void __ceph_do_pending_vmtruncate(struct inode *inode) 2301 { 2302 struct ceph_client *cl = ceph_inode_to_client(inode); 2303 struct ceph_inode_info *ci = ceph_inode(inode); 2304 u64 to; 2305 int wrbuffer_refs, finish = 0; 2306 2307 mutex_lock(&ci->i_truncate_mutex); 2308 retry: 2309 spin_lock(&ci->i_ceph_lock); 2310 if (ci->i_truncate_pending == 0) { 2311 doutc(cl, "%p %llx.%llx none pending\n", inode, 2312 ceph_vinop(inode)); 2313 spin_unlock(&ci->i_ceph_lock); 2314 mutex_unlock(&ci->i_truncate_mutex); 2315 return; 2316 } 2317 2318 /* 2319 * make sure any dirty snapped pages are flushed before we 2320 * possibly truncate them.. so write AND block! 2321 */ 2322 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 2323 spin_unlock(&ci->i_ceph_lock); 2324 doutc(cl, "%p %llx.%llx flushing snaps first\n", inode, 2325 ceph_vinop(inode)); 2326 filemap_write_and_wait_range(&inode->i_data, 0, 2327 inode->i_sb->s_maxbytes); 2328 goto retry; 2329 } 2330 2331 /* there should be no reader or writer */ 2332 WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); 2333 2334 to = ci->i_truncate_pagecache_size; 2335 wrbuffer_refs = ci->i_wrbuffer_ref; 2336 doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode), 2337 ci->i_truncate_pending, to); 2338 spin_unlock(&ci->i_ceph_lock); 2339 2340 ceph_fscache_resize(inode, to); 2341 truncate_pagecache(inode, to); 2342 2343 spin_lock(&ci->i_ceph_lock); 2344 if (to == ci->i_truncate_pagecache_size) { 2345 ci->i_truncate_pending = 0; 2346 finish = 1; 2347 } 2348 spin_unlock(&ci->i_ceph_lock); 2349 if (!finish) 2350 goto retry; 2351 2352 mutex_unlock(&ci->i_truncate_mutex); 2353 2354 if (wrbuffer_refs == 0) 2355 ceph_check_caps(ci, 0); 2356 2357 wake_up_all(&ci->i_cap_wq); 2358 } 2359 2360 static void ceph_inode_work(struct work_struct *work) 2361 { 2362 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 2363 i_work); 2364 struct inode *inode = &ci->netfs.inode; 2365 struct ceph_client *cl = ceph_inode_to_client(inode); 2366 2367 if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { 2368 doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode)); 2369 filemap_fdatawrite(&inode->i_data); 2370 } 2371 if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) 2372 ceph_do_invalidate_pages(inode); 2373 2374 if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask)) 2375 __ceph_do_pending_vmtruncate(inode); 2376 2377 if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) 2378 ceph_check_caps(ci, 0); 2379 2380 if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) 2381 ceph_flush_snaps(ci, NULL); 2382 2383 iput(inode); 2384 } 2385 2386 static const char *ceph_encrypted_get_link(struct dentry *dentry, 2387 struct inode *inode, 2388 struct delayed_call *done) 2389 { 2390 struct ceph_inode_info *ci = ceph_inode(inode); 2391 2392 if (!dentry) 2393 return ERR_PTR(-ECHILD); 2394 2395 return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), 2396 done); 2397 } 2398 2399 static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, 2400 const struct path *path, 2401 struct kstat *stat, u32 request_mask, 2402 unsigned int query_flags) 2403 { 2404 int ret; 2405 2406 ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); 2407 if (ret) 2408 return ret; 2409 return fscrypt_symlink_getattr(path, stat); 2410 } 2411 2412 /* 2413 * symlinks 2414 */ 2415 static const struct inode_operations ceph_symlink_iops = { 2416 .get_link = simple_get_link, 2417 .setattr = ceph_setattr, 2418 .getattr = ceph_getattr, 2419 .listxattr = ceph_listxattr, 2420 }; 2421 2422 static const struct inode_operations ceph_encrypted_symlink_iops = { 2423 .get_link = ceph_encrypted_get_link, 2424 .setattr = ceph_setattr, 2425 .getattr = ceph_encrypted_symlink_getattr, 2426 .listxattr = ceph_listxattr, 2427 }; 2428 2429 /* 2430 * Transfer the encrypted last block to the MDS and the MDS 2431 * will help update it when truncating a smaller size. 2432 * 2433 * We don't support a PAGE_SIZE that is smaller than the 2434 * CEPH_FSCRYPT_BLOCK_SIZE. 2435 */ 2436 static int fill_fscrypt_truncate(struct inode *inode, 2437 struct ceph_mds_request *req, 2438 struct iattr *attr) 2439 { 2440 struct ceph_client *cl = ceph_inode_to_client(inode); 2441 struct ceph_inode_info *ci = ceph_inode(inode); 2442 int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; 2443 loff_t pos, orig_pos = round_down(attr->ia_size, 2444 CEPH_FSCRYPT_BLOCK_SIZE); 2445 u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; 2446 struct ceph_pagelist *pagelist = NULL; 2447 struct kvec iov = {0}; 2448 struct iov_iter iter; 2449 struct page *page = NULL; 2450 struct ceph_fscrypt_truncate_size_header header; 2451 int retry_op = 0; 2452 int len = CEPH_FSCRYPT_BLOCK_SIZE; 2453 loff_t i_size = i_size_read(inode); 2454 int got, ret, issued; 2455 u64 objver; 2456 2457 ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); 2458 if (ret < 0) 2459 return ret; 2460 2461 issued = __ceph_caps_issued(ci, NULL); 2462 2463 doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n", 2464 i_size, attr->ia_size, ceph_cap_string(got), 2465 ceph_cap_string(issued)); 2466 2467 /* Try to writeback the dirty pagecaches */ 2468 if (issued & (CEPH_CAP_FILE_BUFFER)) { 2469 loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; 2470 2471 ret = filemap_write_and_wait_range(inode->i_mapping, 2472 orig_pos, lend); 2473 if (ret < 0) 2474 goto out; 2475 } 2476 2477 page = __page_cache_alloc(GFP_KERNEL); 2478 if (page == NULL) { 2479 ret = -ENOMEM; 2480 goto out; 2481 } 2482 2483 pagelist = ceph_pagelist_alloc(GFP_KERNEL); 2484 if (!pagelist) { 2485 ret = -ENOMEM; 2486 goto out; 2487 } 2488 2489 iov.iov_base = kmap_local_page(page); 2490 iov.iov_len = len; 2491 iov_iter_kvec(&iter, READ, &iov, 1, len); 2492 2493 pos = orig_pos; 2494 ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); 2495 if (ret < 0) 2496 goto out; 2497 2498 /* Insert the header first */ 2499 header.ver = 1; 2500 header.compat = 1; 2501 header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); 2502 2503 /* 2504 * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, 2505 * because in MDS it may need this to do the truncate. 2506 */ 2507 header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); 2508 2509 /* 2510 * If we hit a hole here, we should just skip filling 2511 * the fscrypt for the request, because once the fscrypt 2512 * is enabled, the file will be split into many blocks 2513 * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there 2514 * has a hole, the hole size should be multiple of block 2515 * size. 2516 * 2517 * If the Rados object doesn't exist, it will be set to 0. 2518 */ 2519 if (!objver) { 2520 doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size); 2521 2522 header.data_len = cpu_to_le32(8 + 8 + 4); 2523 header.file_offset = 0; 2524 ret = 0; 2525 } else { 2526 header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); 2527 header.file_offset = cpu_to_le64(orig_pos); 2528 2529 doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff, 2530 CEPH_FSCRYPT_BLOCK_SIZE); 2531 2532 /* truncate and zero out the extra contents for the last block */ 2533 memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); 2534 2535 /* encrypt the last block */ 2536 ret = ceph_fscrypt_encrypt_block_inplace(inode, page, 2537 CEPH_FSCRYPT_BLOCK_SIZE, 2538 0, block); 2539 if (ret) 2540 goto out; 2541 } 2542 2543 /* Insert the header */ 2544 ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); 2545 if (ret) 2546 goto out; 2547 2548 if (header.block_size) { 2549 /* Append the last block contents to pagelist */ 2550 ret = ceph_pagelist_append(pagelist, iov.iov_base, 2551 CEPH_FSCRYPT_BLOCK_SIZE); 2552 if (ret) 2553 goto out; 2554 } 2555 req->r_pagelist = pagelist; 2556 out: 2557 doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode, 2558 ceph_vinop(inode), ceph_cap_string(got)); 2559 ceph_put_cap_refs(ci, got); 2560 if (iov.iov_base) 2561 kunmap_local(iov.iov_base); 2562 if (page) 2563 __free_pages(page, 0); 2564 if (ret && pagelist) 2565 ceph_pagelist_release(pagelist); 2566 return ret; 2567 } 2568 2569 int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, 2570 struct iattr *attr, struct ceph_iattr *cia) 2571 { 2572 struct ceph_inode_info *ci = ceph_inode(inode); 2573 unsigned int ia_valid = attr->ia_valid; 2574 struct ceph_mds_request *req; 2575 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 2576 struct ceph_client *cl = ceph_inode_to_client(inode); 2577 struct ceph_cap_flush *prealloc_cf; 2578 loff_t isize = i_size_read(inode); 2579 int issued; 2580 int release = 0, dirtied = 0; 2581 int mask = 0; 2582 int err = 0; 2583 int inode_dirty_flags = 0; 2584 bool lock_snap_rwsem = false; 2585 bool fill_fscrypt; 2586 int truncate_retry = 20; /* The RMW will take around 50ms */ 2587 struct dentry *dentry; 2588 char *path; 2589 bool do_sync = false; 2590 2591 dentry = d_find_alias(inode); 2592 if (!dentry) { 2593 do_sync = true; 2594 } else { 2595 struct ceph_path_info path_info = {0}; 2596 path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); 2597 if (IS_ERR(path)) { 2598 do_sync = true; 2599 err = 0; 2600 } else { 2601 err = ceph_mds_check_access(mdsc, path, MAY_WRITE); 2602 } 2603 ceph_mdsc_free_path_info(&path_info); 2604 dput(dentry); 2605 2606 /* For none EACCES cases will let the MDS do the mds auth check */ 2607 if (err == -EACCES) { 2608 return err; 2609 } else if (err < 0) { 2610 do_sync = true; 2611 err = 0; 2612 } 2613 } 2614 2615 retry: 2616 prealloc_cf = ceph_alloc_cap_flush(); 2617 if (!prealloc_cf) 2618 return -ENOMEM; 2619 2620 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, 2621 USE_AUTH_MDS); 2622 if (IS_ERR(req)) { 2623 ceph_free_cap_flush(prealloc_cf); 2624 return PTR_ERR(req); 2625 } 2626 2627 fill_fscrypt = false; 2628 spin_lock(&ci->i_ceph_lock); 2629 issued = __ceph_caps_issued(ci, NULL); 2630 2631 if (!ci->i_head_snapc && 2632 (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { 2633 lock_snap_rwsem = true; 2634 if (!down_read_trylock(&mdsc->snap_rwsem)) { 2635 spin_unlock(&ci->i_ceph_lock); 2636 down_read(&mdsc->snap_rwsem); 2637 spin_lock(&ci->i_ceph_lock); 2638 issued = __ceph_caps_issued(ci, NULL); 2639 } 2640 } 2641 2642 doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode), 2643 ceph_cap_string(issued)); 2644 #if IS_ENABLED(CONFIG_FS_ENCRYPTION) 2645 if (cia && cia->fscrypt_auth) { 2646 u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); 2647 2648 if (len > sizeof(*cia->fscrypt_auth)) { 2649 err = -EINVAL; 2650 spin_unlock(&ci->i_ceph_lock); 2651 goto out; 2652 } 2653 2654 doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode, 2655 ceph_vinop(inode), ci->fscrypt_auth_len, len); 2656 2657 /* It should never be re-set once set */ 2658 WARN_ON_ONCE(ci->fscrypt_auth); 2659 2660 if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { 2661 dirtied |= CEPH_CAP_AUTH_EXCL; 2662 kfree(ci->fscrypt_auth); 2663 ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; 2664 ci->fscrypt_auth_len = len; 2665 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2666 ci->fscrypt_auth_len != len || 2667 memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { 2668 req->r_fscrypt_auth = cia->fscrypt_auth; 2669 mask |= CEPH_SETATTR_FSCRYPT_AUTH; 2670 release |= CEPH_CAP_AUTH_SHARED; 2671 } 2672 cia->fscrypt_auth = NULL; 2673 } 2674 #else 2675 if (cia && cia->fscrypt_auth) { 2676 err = -EINVAL; 2677 spin_unlock(&ci->i_ceph_lock); 2678 goto out; 2679 } 2680 #endif /* CONFIG_FS_ENCRYPTION */ 2681 2682 if (ia_valid & ATTR_UID) { 2683 kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); 2684 2685 doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode, 2686 ceph_vinop(inode), 2687 from_kuid(&init_user_ns, inode->i_uid), 2688 from_kuid(&init_user_ns, attr->ia_uid)); 2689 if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { 2690 inode->i_uid = fsuid; 2691 dirtied |= CEPH_CAP_AUTH_EXCL; 2692 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2693 !uid_eq(fsuid, inode->i_uid)) { 2694 req->r_args.setattr.uid = cpu_to_le32( 2695 from_kuid(&init_user_ns, fsuid)); 2696 mask |= CEPH_SETATTR_UID; 2697 release |= CEPH_CAP_AUTH_SHARED; 2698 } 2699 } 2700 if (ia_valid & ATTR_GID) { 2701 kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); 2702 2703 doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode, 2704 ceph_vinop(inode), 2705 from_kgid(&init_user_ns, inode->i_gid), 2706 from_kgid(&init_user_ns, attr->ia_gid)); 2707 if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { 2708 inode->i_gid = fsgid; 2709 dirtied |= CEPH_CAP_AUTH_EXCL; 2710 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2711 !gid_eq(fsgid, inode->i_gid)) { 2712 req->r_args.setattr.gid = cpu_to_le32( 2713 from_kgid(&init_user_ns, fsgid)); 2714 mask |= CEPH_SETATTR_GID; 2715 release |= CEPH_CAP_AUTH_SHARED; 2716 } 2717 } 2718 if (ia_valid & ATTR_MODE) { 2719 doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode, 2720 ceph_vinop(inode), inode->i_mode, attr->ia_mode); 2721 if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { 2722 inode->i_mode = attr->ia_mode; 2723 dirtied |= CEPH_CAP_AUTH_EXCL; 2724 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2725 attr->ia_mode != inode->i_mode) { 2726 inode->i_mode = attr->ia_mode; 2727 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 2728 mask |= CEPH_SETATTR_MODE; 2729 release |= CEPH_CAP_AUTH_SHARED; 2730 } 2731 } 2732 2733 if (ia_valid & ATTR_ATIME) { 2734 struct timespec64 atime = inode_get_atime(inode); 2735 2736 doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n", 2737 inode, ceph_vinop(inode), &atime, &attr->ia_atime); 2738 if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { 2739 ci->i_time_warp_seq++; 2740 inode_set_atime_to_ts(inode, attr->ia_atime); 2741 dirtied |= CEPH_CAP_FILE_EXCL; 2742 } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && 2743 timespec64_compare(&atime, 2744 &attr->ia_atime) < 0) { 2745 inode_set_atime_to_ts(inode, attr->ia_atime); 2746 dirtied |= CEPH_CAP_FILE_WR; 2747 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2748 !timespec64_equal(&atime, &attr->ia_atime)) { 2749 ceph_encode_timespec64(&req->r_args.setattr.atime, 2750 &attr->ia_atime); 2751 mask |= CEPH_SETATTR_ATIME; 2752 release |= CEPH_CAP_FILE_SHARED | 2753 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2754 } 2755 } 2756 if (ia_valid & ATTR_SIZE) { 2757 doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode, 2758 ceph_vinop(inode), isize, attr->ia_size); 2759 /* 2760 * Only when the new size is smaller and not aligned to 2761 * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. 2762 */ 2763 if (IS_ENCRYPTED(inode) && attr->ia_size < isize && 2764 (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { 2765 mask |= CEPH_SETATTR_SIZE; 2766 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2767 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2768 set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); 2769 mask |= CEPH_SETATTR_FSCRYPT_FILE; 2770 req->r_args.setattr.size = 2771 cpu_to_le64(round_up(attr->ia_size, 2772 CEPH_FSCRYPT_BLOCK_SIZE)); 2773 req->r_args.setattr.old_size = 2774 cpu_to_le64(round_up(isize, 2775 CEPH_FSCRYPT_BLOCK_SIZE)); 2776 req->r_fscrypt_file = attr->ia_size; 2777 fill_fscrypt = true; 2778 } else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { 2779 if (attr->ia_size > isize) { 2780 i_size_write(inode, attr->ia_size); 2781 inode->i_blocks = calc_inode_blocks(attr->ia_size); 2782 ci->i_reported_size = attr->ia_size; 2783 dirtied |= CEPH_CAP_FILE_EXCL; 2784 ia_valid |= ATTR_MTIME; 2785 } 2786 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2787 attr->ia_size != isize) { 2788 mask |= CEPH_SETATTR_SIZE; 2789 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2790 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2791 if (IS_ENCRYPTED(inode) && attr->ia_size) { 2792 set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); 2793 mask |= CEPH_SETATTR_FSCRYPT_FILE; 2794 req->r_args.setattr.size = 2795 cpu_to_le64(round_up(attr->ia_size, 2796 CEPH_FSCRYPT_BLOCK_SIZE)); 2797 req->r_args.setattr.old_size = 2798 cpu_to_le64(round_up(isize, 2799 CEPH_FSCRYPT_BLOCK_SIZE)); 2800 req->r_fscrypt_file = attr->ia_size; 2801 } else { 2802 req->r_args.setattr.size = cpu_to_le64(attr->ia_size); 2803 req->r_args.setattr.old_size = cpu_to_le64(isize); 2804 req->r_fscrypt_file = 0; 2805 } 2806 } 2807 } 2808 if (ia_valid & ATTR_MTIME) { 2809 struct timespec64 mtime = inode_get_mtime(inode); 2810 2811 doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n", 2812 inode, ceph_vinop(inode), &mtime, &attr->ia_mtime); 2813 if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { 2814 ci->i_time_warp_seq++; 2815 inode_set_mtime_to_ts(inode, attr->ia_mtime); 2816 dirtied |= CEPH_CAP_FILE_EXCL; 2817 } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && 2818 timespec64_compare(&mtime, &attr->ia_mtime) < 0) { 2819 inode_set_mtime_to_ts(inode, attr->ia_mtime); 2820 dirtied |= CEPH_CAP_FILE_WR; 2821 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2822 !timespec64_equal(&mtime, &attr->ia_mtime)) { 2823 ceph_encode_timespec64(&req->r_args.setattr.mtime, 2824 &attr->ia_mtime); 2825 mask |= CEPH_SETATTR_MTIME; 2826 release |= CEPH_CAP_FILE_SHARED | 2827 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2828 } 2829 } 2830 2831 /* these do nothing */ 2832 if (ia_valid & ATTR_CTIME) { 2833 struct timespec64 ictime = inode_get_ctime(inode); 2834 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| 2835 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; 2836 doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n", 2837 inode, ceph_vinop(inode), &ictime, &attr->ia_ctime, 2838 only ? "ctime only" : "ignored"); 2839 if (only) { 2840 /* 2841 * if kernel wants to dirty ctime but nothing else, 2842 * we need to choose a cap to dirty under, or do 2843 * a almost-no-op setattr 2844 */ 2845 if (issued & CEPH_CAP_AUTH_EXCL) 2846 dirtied |= CEPH_CAP_AUTH_EXCL; 2847 else if (issued & CEPH_CAP_FILE_EXCL) 2848 dirtied |= CEPH_CAP_FILE_EXCL; 2849 else if (issued & CEPH_CAP_XATTR_EXCL) 2850 dirtied |= CEPH_CAP_XATTR_EXCL; 2851 else 2852 mask |= CEPH_SETATTR_CTIME; 2853 } 2854 } 2855 if (ia_valid & ATTR_FILE) 2856 doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode, 2857 ceph_vinop(inode)); 2858 2859 if (dirtied) { 2860 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 2861 &prealloc_cf); 2862 inode_set_ctime_to_ts(inode, attr->ia_ctime); 2863 inode_inc_iversion_raw(inode); 2864 } 2865 2866 release &= issued; 2867 spin_unlock(&ci->i_ceph_lock); 2868 if (lock_snap_rwsem) { 2869 up_read(&mdsc->snap_rwsem); 2870 lock_snap_rwsem = false; 2871 } 2872 2873 if (inode_dirty_flags) 2874 __mark_inode_dirty(inode, inode_dirty_flags); 2875 2876 if (mask) { 2877 req->r_inode = inode; 2878 ihold(inode); 2879 req->r_inode_drop = release; 2880 req->r_args.setattr.mask = cpu_to_le32(mask); 2881 req->r_num_caps = 1; 2882 req->r_stamp = attr->ia_ctime; 2883 if (fill_fscrypt) { 2884 err = fill_fscrypt_truncate(inode, req, attr); 2885 if (err) 2886 goto out; 2887 } 2888 2889 /* 2890 * The truncate request will return -EAGAIN when the 2891 * last block has been updated just before the MDS 2892 * successfully gets the xlock for the FILE lock. To 2893 * avoid corrupting the file contents we need to retry 2894 * it. 2895 */ 2896 err = ceph_mdsc_do_request(mdsc, NULL, req); 2897 if (err == -EAGAIN && truncate_retry--) { 2898 doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n", 2899 inode, ceph_vinop(inode), err, 2900 ceph_cap_string(dirtied), mask); 2901 ceph_mdsc_put_request(req); 2902 ceph_free_cap_flush(prealloc_cf); 2903 goto retry; 2904 } 2905 } 2906 out: 2907 doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode, 2908 ceph_vinop(inode), err, ceph_cap_string(dirtied), mask); 2909 2910 ceph_mdsc_put_request(req); 2911 ceph_free_cap_flush(prealloc_cf); 2912 2913 if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) 2914 __ceph_do_pending_vmtruncate(inode); 2915 2916 return err; 2917 } 2918 2919 /* 2920 * setattr 2921 */ 2922 int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 2923 struct iattr *attr) 2924 { 2925 struct inode *inode = d_inode(dentry); 2926 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 2927 int err; 2928 2929 if (ceph_snap(inode) != CEPH_NOSNAP) 2930 return -EROFS; 2931 2932 if (ceph_inode_is_shutdown(inode)) 2933 return -ESTALE; 2934 2935 err = fscrypt_prepare_setattr(dentry, attr); 2936 if (err) 2937 return err; 2938 2939 err = setattr_prepare(idmap, dentry, attr); 2940 if (err != 0) 2941 return err; 2942 2943 if ((attr->ia_valid & ATTR_SIZE) && 2944 attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) 2945 return -EFBIG; 2946 2947 if ((attr->ia_valid & ATTR_SIZE) && 2948 ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) 2949 return -EDQUOT; 2950 2951 err = __ceph_setattr(idmap, inode, attr, NULL); 2952 2953 if (err >= 0 && (attr->ia_valid & ATTR_MODE)) 2954 err = posix_acl_chmod(idmap, dentry, attr->ia_mode); 2955 2956 return err; 2957 } 2958 2959 int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) 2960 { 2961 int issued = ceph_caps_issued(ceph_inode(inode)); 2962 2963 /* 2964 * If any 'x' caps is issued we can just choose the auth MDS 2965 * instead of the random replica MDSes. Because only when the 2966 * Locker is in LOCK_EXEC state will the loner client could 2967 * get the 'x' caps. And if we send the getattr requests to 2968 * any replica MDS it must auth pin and tries to rdlock from 2969 * the auth MDS, and then the auth MDS need to do the Locker 2970 * state transition to LOCK_SYNC. And after that the lock state 2971 * will change back. 2972 * 2973 * This cost much when doing the Locker state transition and 2974 * usually will need to revoke caps from clients. 2975 * 2976 * And for the 'Xs' caps for getxattr we will also choose the 2977 * auth MDS, because the MDS side code is buggy due to setxattr 2978 * won't notify the replica MDSes when the values changed and 2979 * the replica MDS will return the old values. Though we will 2980 * fix it in MDS code, but this still makes sense for old ceph. 2981 */ 2982 if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) 2983 || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) 2984 return USE_AUTH_MDS; 2985 else 2986 return USE_ANY_MDS; 2987 } 2988 2989 /* 2990 * Verify that we have a lease on the given mask. If not, 2991 * do a getattr against an mds. 2992 */ 2993 int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 2994 int mask, bool force) 2995 { 2996 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); 2997 struct ceph_client *cl = fsc->client; 2998 struct ceph_mds_client *mdsc = fsc->mdsc; 2999 struct ceph_mds_request *req; 3000 int mode; 3001 int err; 3002 3003 if (ceph_snap(inode) == CEPH_SNAPDIR) { 3004 doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode, 3005 ceph_vinop(inode)); 3006 return 0; 3007 } 3008 3009 doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode, 3010 ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode); 3011 if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) 3012 return 0; 3013 3014 mode = ceph_try_to_choose_auth_mds(inode, mask); 3015 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); 3016 if (IS_ERR(req)) 3017 return PTR_ERR(req); 3018 req->r_inode = inode; 3019 ihold(inode); 3020 req->r_num_caps = 1; 3021 req->r_args.getattr.mask = cpu_to_le32(mask); 3022 req->r_locked_page = locked_page; 3023 err = ceph_mdsc_do_request(mdsc, NULL, req); 3024 if (locked_page && err == 0) { 3025 u64 inline_version = req->r_reply_info.targeti.inline_version; 3026 if (inline_version == 0) { 3027 /* the reply is supposed to contain inline data */ 3028 err = -EINVAL; 3029 } else if (inline_version == CEPH_INLINE_NONE || 3030 inline_version == 1) { 3031 err = -ENODATA; 3032 } else { 3033 err = req->r_reply_info.targeti.inline_len; 3034 } 3035 } 3036 ceph_mdsc_put_request(req); 3037 doutc(cl, "result=%d\n", err); 3038 return err; 3039 } 3040 3041 int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, 3042 size_t size) 3043 { 3044 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); 3045 struct ceph_client *cl = fsc->client; 3046 struct ceph_mds_client *mdsc = fsc->mdsc; 3047 struct ceph_mds_request *req; 3048 int mode = USE_AUTH_MDS; 3049 int err; 3050 char *xattr_value; 3051 size_t xattr_value_len; 3052 3053 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); 3054 if (IS_ERR(req)) { 3055 err = -ENOMEM; 3056 goto out; 3057 } 3058 3059 req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; 3060 req->r_path2 = kstrdup(name, GFP_NOFS); 3061 if (!req->r_path2) { 3062 err = -ENOMEM; 3063 goto put; 3064 } 3065 3066 ihold(inode); 3067 req->r_inode = inode; 3068 err = ceph_mdsc_do_request(mdsc, NULL, req); 3069 if (err < 0) 3070 goto put; 3071 3072 xattr_value = req->r_reply_info.xattr_info.xattr_value; 3073 xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; 3074 3075 doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); 3076 3077 err = (int)xattr_value_len; 3078 if (size == 0) 3079 goto put; 3080 3081 if (xattr_value_len > size) { 3082 err = -ERANGE; 3083 goto put; 3084 } 3085 3086 memcpy(value, xattr_value, xattr_value_len); 3087 put: 3088 ceph_mdsc_put_request(req); 3089 out: 3090 doutc(cl, "result=%d\n", err); 3091 return err; 3092 } 3093 3094 3095 /* 3096 * Check inode permissions. We verify we have a valid value for 3097 * the AUTH cap, then call the generic handler. 3098 */ 3099 int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, 3100 int mask) 3101 { 3102 int err; 3103 3104 if (mask & MAY_NOT_BLOCK) 3105 return -ECHILD; 3106 3107 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); 3108 3109 if (!err) 3110 err = generic_permission(idmap, inode, mask); 3111 return err; 3112 } 3113 3114 /* Craft a mask of needed caps given a set of requested statx attrs. */ 3115 static int statx_to_caps(u32 want, umode_t mode) 3116 { 3117 int mask = 0; 3118 3119 if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) 3120 mask |= CEPH_CAP_AUTH_SHARED; 3121 3122 if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { 3123 /* 3124 * The link count for directories depends on inode->i_subdirs, 3125 * and that is only updated when Fs caps are held. 3126 */ 3127 if (S_ISDIR(mode)) 3128 mask |= CEPH_CAP_FILE_SHARED; 3129 else 3130 mask |= CEPH_CAP_LINK_SHARED; 3131 } 3132 3133 if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) 3134 mask |= CEPH_CAP_FILE_SHARED; 3135 3136 if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) 3137 mask |= CEPH_CAP_XATTR_SHARED; 3138 3139 return mask; 3140 } 3141 3142 /* 3143 * Get all the attributes. If we have sufficient caps for the requested attrs, 3144 * then we can avoid talking to the MDS at all. 3145 */ 3146 int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, 3147 struct kstat *stat, u32 request_mask, unsigned int flags) 3148 { 3149 struct inode *inode = d_inode(path->dentry); 3150 struct super_block *sb = inode->i_sb; 3151 struct ceph_inode_info *ci = ceph_inode(inode); 3152 u32 valid_mask = STATX_BASIC_STATS; 3153 int err = 0; 3154 3155 if (ceph_inode_is_shutdown(inode)) 3156 return -ESTALE; 3157 3158 /* Skip the getattr altogether if we're asked not to sync */ 3159 if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { 3160 err = ceph_do_getattr(inode, 3161 statx_to_caps(request_mask, inode->i_mode), 3162 flags & AT_STATX_FORCE_SYNC); 3163 if (err) 3164 return err; 3165 } 3166 3167 generic_fillattr(idmap, request_mask, inode, stat); 3168 stat->ino = ceph_present_inode(inode); 3169 3170 /* 3171 * btime on newly-allocated inodes is 0, so if this is still set to 3172 * that, then assume that it's not valid. 3173 */ 3174 if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { 3175 stat->btime = ci->i_btime; 3176 valid_mask |= STATX_BTIME; 3177 } 3178 3179 if (request_mask & STATX_CHANGE_COOKIE) { 3180 stat->change_cookie = inode_peek_iversion_raw(inode); 3181 valid_mask |= STATX_CHANGE_COOKIE; 3182 } 3183 3184 if (ceph_snap(inode) == CEPH_NOSNAP) 3185 stat->dev = sb->s_dev; 3186 else 3187 stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; 3188 3189 if (S_ISDIR(inode->i_mode)) { 3190 if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) { 3191 stat->size = ci->i_rbytes; 3192 } else if (ceph_snap(inode) == CEPH_SNAPDIR) { 3193 struct ceph_inode_info *pci; 3194 struct ceph_snap_realm *realm; 3195 struct inode *parent; 3196 3197 parent = ceph_lookup_inode(sb, ceph_ino(inode)); 3198 if (IS_ERR(parent)) 3199 return PTR_ERR(parent); 3200 3201 pci = ceph_inode(parent); 3202 spin_lock(&pci->i_ceph_lock); 3203 realm = pci->i_snap_realm; 3204 if (realm) 3205 stat->size = realm->num_snaps; 3206 else 3207 stat->size = 0; 3208 spin_unlock(&pci->i_ceph_lock); 3209 iput(parent); 3210 } else { 3211 stat->size = ci->i_files + ci->i_subdirs; 3212 } 3213 stat->blocks = 0; 3214 stat->blksize = 65536; 3215 /* 3216 * Some applications rely on the number of st_nlink 3217 * value on directories to be either 0 (if unlinked) 3218 * or 2 + number of subdirectories. 3219 */ 3220 if (stat->nlink == 1) 3221 /* '.' + '..' + subdirs */ 3222 stat->nlink = 1 + 1 + ci->i_subdirs; 3223 } 3224 3225 stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; 3226 if (IS_ENCRYPTED(inode)) 3227 stat->attributes |= STATX_ATTR_ENCRYPTED; 3228 stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | 3229 STATX_ATTR_ENCRYPTED); 3230 3231 stat->result_mask = request_mask & valid_mask; 3232 return err; 3233 } 3234 3235 void ceph_inode_shutdown(struct inode *inode) 3236 { 3237 struct ceph_inode_info *ci = ceph_inode(inode); 3238 struct rb_node *p; 3239 int iputs = 0; 3240 bool invalidate = false; 3241 3242 spin_lock(&ci->i_ceph_lock); 3243 ci->i_ceph_flags |= CEPH_I_SHUTDOWN; 3244 p = rb_first(&ci->i_caps); 3245 while (p) { 3246 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 3247 3248 p = rb_next(p); 3249 iputs += ceph_purge_inode_cap(inode, cap, &invalidate); 3250 } 3251 spin_unlock(&ci->i_ceph_lock); 3252 3253 if (invalidate) 3254 ceph_queue_invalidate(inode); 3255 while (iputs--) 3256 iput(inode); 3257 } 3258