1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 #include "fs.h" 28 #include "accessors.h" 29 #include "extent-tree.h" 30 #include "root-tree.h" 31 #include "tree-checker.h" 32 33 enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) 34 { 35 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 36 return BTRFS_QGROUP_MODE_DISABLED; 37 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) 38 return BTRFS_QGROUP_MODE_SIMPLE; 39 return BTRFS_QGROUP_MODE_FULL; 40 } 41 42 bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) 43 { 44 return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; 45 } 46 47 bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) 48 { 49 return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; 50 } 51 52 /* 53 * Helpers to access qgroup reservation 54 * 55 * Callers should ensure the lock context and type are valid 56 */ 57 58 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 59 { 60 u64 ret = 0; 61 int i; 62 63 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 64 ret += qgroup->rsv.values[i]; 65 66 return ret; 67 } 68 69 #ifdef CONFIG_BTRFS_DEBUG 70 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 71 { 72 if (type == BTRFS_QGROUP_RSV_DATA) 73 return "data"; 74 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 75 return "meta_pertrans"; 76 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 77 return "meta_prealloc"; 78 return NULL; 79 } 80 #endif 81 82 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 83 struct btrfs_qgroup *qgroup, u64 num_bytes, 84 enum btrfs_qgroup_rsv_type type) 85 { 86 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 87 qgroup->rsv.values[type] += num_bytes; 88 } 89 90 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 91 struct btrfs_qgroup *qgroup, u64 num_bytes, 92 enum btrfs_qgroup_rsv_type type) 93 { 94 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 95 if (qgroup->rsv.values[type] >= num_bytes) { 96 qgroup->rsv.values[type] -= num_bytes; 97 return; 98 } 99 #ifdef CONFIG_BTRFS_DEBUG 100 WARN_RATELIMIT(1, 101 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 102 qgroup->qgroupid, qgroup_rsv_type_str(type), 103 qgroup->rsv.values[type], num_bytes); 104 #endif 105 qgroup->rsv.values[type] = 0; 106 } 107 108 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 109 struct btrfs_qgroup *dest, 110 struct btrfs_qgroup *src) 111 { 112 int i; 113 114 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 115 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 116 } 117 118 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 119 struct btrfs_qgroup *dest, 120 struct btrfs_qgroup *src) 121 { 122 int i; 123 124 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 125 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 126 } 127 128 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 129 
int mod) 130 { 131 if (qg->old_refcnt < seq) 132 qg->old_refcnt = seq; 133 qg->old_refcnt += mod; 134 } 135 136 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 137 int mod) 138 { 139 if (qg->new_refcnt < seq) 140 qg->new_refcnt = seq; 141 qg->new_refcnt += mod; 142 } 143 144 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 145 { 146 if (qg->old_refcnt < seq) 147 return 0; 148 return qg->old_refcnt - seq; 149 } 150 151 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 152 { 153 if (qg->new_refcnt < seq) 154 return 0; 155 return qg->new_refcnt - seq; 156 } 157 158 /* 159 * glue structure to represent the relations between qgroups. 160 */ 161 struct btrfs_qgroup_list { 162 struct list_head next_group; 163 struct list_head next_member; 164 struct btrfs_qgroup *group; 165 struct btrfs_qgroup *member; 166 }; 167 168 static int 169 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 170 int init_flags); 171 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 172 173 /* must be called with qgroup_ioctl_lock held */ 174 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 175 u64 qgroupid) 176 { 177 struct rb_node *n = fs_info->qgroup_tree.rb_node; 178 struct btrfs_qgroup *qgroup; 179 180 while (n) { 181 qgroup = rb_entry(n, struct btrfs_qgroup, node); 182 if (qgroup->qgroupid < qgroupid) 183 n = n->rb_left; 184 else if (qgroup->qgroupid > qgroupid) 185 n = n->rb_right; 186 else 187 return qgroup; 188 } 189 return NULL; 190 } 191 192 /* 193 * Add qgroup to the filesystem's qgroup tree. 194 * 195 * Must be called with qgroup_lock held and @prealloc preallocated. 196 * 197 * The control on the lifespan of @prealloc would be transferred to this 198 * function, thus caller should no longer touch @prealloc. 199 */ 200 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 201 struct btrfs_qgroup *prealloc, 202 u64 qgroupid) 203 { 204 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 205 struct rb_node *parent = NULL; 206 struct btrfs_qgroup *qgroup; 207 208 /* Caller must have pre-allocated @prealloc. 
*/ 209 ASSERT(prealloc); 210 211 while (*p) { 212 parent = *p; 213 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 214 215 if (qgroup->qgroupid < qgroupid) { 216 p = &(*p)->rb_left; 217 } else if (qgroup->qgroupid > qgroupid) { 218 p = &(*p)->rb_right; 219 } else { 220 kfree(prealloc); 221 return qgroup; 222 } 223 } 224 225 qgroup = prealloc; 226 qgroup->qgroupid = qgroupid; 227 INIT_LIST_HEAD(&qgroup->groups); 228 INIT_LIST_HEAD(&qgroup->members); 229 INIT_LIST_HEAD(&qgroup->dirty); 230 INIT_LIST_HEAD(&qgroup->iterator); 231 INIT_LIST_HEAD(&qgroup->nested_iterator); 232 233 rb_link_node(&qgroup->node, parent, p); 234 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 235 236 return qgroup; 237 } 238 239 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 240 struct btrfs_qgroup *qgroup) 241 { 242 struct btrfs_qgroup_list *list; 243 244 list_del(&qgroup->dirty); 245 while (!list_empty(&qgroup->groups)) { 246 list = list_first_entry(&qgroup->groups, 247 struct btrfs_qgroup_list, next_group); 248 list_del(&list->next_group); 249 list_del(&list->next_member); 250 kfree(list); 251 } 252 253 while (!list_empty(&qgroup->members)) { 254 list = list_first_entry(&qgroup->members, 255 struct btrfs_qgroup_list, next_member); 256 list_del(&list->next_group); 257 list_del(&list->next_member); 258 kfree(list); 259 } 260 } 261 262 /* must be called with qgroup_lock held */ 263 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 264 { 265 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 266 267 if (!qgroup) 268 return -ENOENT; 269 270 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 271 __del_qgroup_rb(fs_info, qgroup); 272 return 0; 273 } 274 275 /* 276 * Add relation specified by two qgroups. 277 * 278 * Must be called with qgroup_lock held, the ownership of @prealloc is 279 * transferred to this function and caller should not touch it anymore. 280 * 281 * Return: 0 on success 282 * -ENOENT if one of the qgroups is NULL 283 * <0 other errors 284 */ 285 static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, 286 struct btrfs_qgroup *member, 287 struct btrfs_qgroup *parent) 288 { 289 if (!member || !parent) { 290 kfree(prealloc); 291 return -ENOENT; 292 } 293 294 prealloc->group = parent; 295 prealloc->member = member; 296 list_add_tail(&prealloc->next_group, &member->groups); 297 list_add_tail(&prealloc->next_member, &parent->members); 298 299 return 0; 300 } 301 302 /* 303 * Add relation specified by two qgroup ids. 304 * 305 * Must be called with qgroup_lock held. 
306 * 307 * Return: 0 on success 308 * -ENOENT if one of the ids does not exist 309 * <0 other errors 310 */ 311 static int add_relation_rb(struct btrfs_fs_info *fs_info, 312 struct btrfs_qgroup_list *prealloc, 313 u64 memberid, u64 parentid) 314 { 315 struct btrfs_qgroup *member; 316 struct btrfs_qgroup *parent; 317 318 member = find_qgroup_rb(fs_info, memberid); 319 parent = find_qgroup_rb(fs_info, parentid); 320 321 return __add_relation_rb(prealloc, member, parent); 322 } 323 324 /* Must be called with qgroup_lock held */ 325 static int del_relation_rb(struct btrfs_fs_info *fs_info, 326 u64 memberid, u64 parentid) 327 { 328 struct btrfs_qgroup *member; 329 struct btrfs_qgroup *parent; 330 struct btrfs_qgroup_list *list; 331 332 member = find_qgroup_rb(fs_info, memberid); 333 parent = find_qgroup_rb(fs_info, parentid); 334 if (!member || !parent) 335 return -ENOENT; 336 337 list_for_each_entry(list, &member->groups, next_group) { 338 if (list->group == parent) { 339 list_del(&list->next_group); 340 list_del(&list->next_member); 341 kfree(list); 342 return 0; 343 } 344 } 345 return -ENOENT; 346 } 347 348 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 349 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 350 u64 rfer, u64 excl) 351 { 352 struct btrfs_qgroup *qgroup; 353 354 qgroup = find_qgroup_rb(fs_info, qgroupid); 355 if (!qgroup) 356 return -EINVAL; 357 if (qgroup->rfer != rfer || qgroup->excl != excl) 358 return -EINVAL; 359 return 0; 360 } 361 #endif 362 363 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) 364 { 365 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 366 return; 367 fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | 368 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 369 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 370 } 371 372 static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, 373 struct extent_buffer *leaf, int slot, 374 struct btrfs_qgroup_status_item *ptr) 375 { 376 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); 377 ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); 378 fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); 379 } 380 381 /* 382 * The full config is read in one go, only called from open_ctree() 383 * It doesn't use any locking, as at this point we're still single-threaded 384 */ 385 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 386 { 387 struct btrfs_key key; 388 struct btrfs_key found_key; 389 struct btrfs_root *quota_root = fs_info->quota_root; 390 struct btrfs_path *path = NULL; 391 struct extent_buffer *l; 392 int slot; 393 int ret = 0; 394 u64 flags = 0; 395 u64 rescan_progress = 0; 396 397 if (!fs_info->quota_root) 398 return 0; 399 400 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 401 if (!fs_info->qgroup_ulist) { 402 ret = -ENOMEM; 403 goto out; 404 } 405 406 path = btrfs_alloc_path(); 407 if (!path) { 408 ret = -ENOMEM; 409 goto out; 410 } 411 412 ret = btrfs_sysfs_add_qgroups(fs_info); 413 if (ret < 0) 414 goto out; 415 /* default this to quota off, in case no status key is found */ 416 fs_info->qgroup_flags = 0; 417 418 /* 419 * pass 1: read status, all qgroup infos and limits 420 */ 421 key.objectid = 0; 422 key.type = 0; 423 key.offset = 0; 424 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 425 if (ret) 426 goto out; 427 428 while (1) { 429 struct btrfs_qgroup *qgroup; 430 431 slot = path->slots[0]; 432 l = path->nodes[0]; 433 btrfs_item_key_to_cpu(l, &found_key, slot); 434 435 if (found_key.type == 
BTRFS_QGROUP_STATUS_KEY) { 436 struct btrfs_qgroup_status_item *ptr; 437 438 ptr = btrfs_item_ptr(l, slot, 439 struct btrfs_qgroup_status_item); 440 441 if (btrfs_qgroup_status_version(l, ptr) != 442 BTRFS_QGROUP_STATUS_VERSION) { 443 btrfs_err(fs_info, 444 "old qgroup version, quota disabled"); 445 goto out; 446 } 447 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); 448 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { 449 qgroup_read_enable_gen(fs_info, l, slot, ptr); 450 } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { 451 qgroup_mark_inconsistent(fs_info); 452 btrfs_err(fs_info, 453 "qgroup generation mismatch, marked as inconsistent"); 454 } 455 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 456 goto next1; 457 } 458 459 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 460 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 461 goto next1; 462 463 qgroup = find_qgroup_rb(fs_info, found_key.offset); 464 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 465 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 466 btrfs_err(fs_info, "inconsistent qgroup config"); 467 qgroup_mark_inconsistent(fs_info); 468 } 469 if (!qgroup) { 470 struct btrfs_qgroup *prealloc; 471 struct btrfs_root *tree_root = fs_info->tree_root; 472 473 prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); 474 if (!prealloc) { 475 ret = -ENOMEM; 476 goto out; 477 } 478 qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); 479 /* 480 * If a qgroup exists for a subvolume ID, it is possible 481 * that subvolume has been deleted, in which case 482 * re-using that ID would lead to incorrect accounting. 483 * 484 * Ensure that we skip any such subvol ids. 485 * 486 * We don't need to lock because this is only called 487 * during mount before we start doing things like creating 488 * subvolumes. 489 */ 490 if (is_fstree(qgroup->qgroupid) && 491 qgroup->qgroupid > tree_root->free_objectid) 492 /* 493 * Don't need to check against BTRFS_LAST_FREE_OBJECTID, 494 * as it will get checked on the next call to 495 * btrfs_get_free_objectid. 
496 */ 497 tree_root->free_objectid = qgroup->qgroupid + 1; 498 } 499 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 500 if (ret < 0) 501 goto out; 502 503 switch (found_key.type) { 504 case BTRFS_QGROUP_INFO_KEY: { 505 struct btrfs_qgroup_info_item *ptr; 506 507 ptr = btrfs_item_ptr(l, slot, 508 struct btrfs_qgroup_info_item); 509 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 510 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 511 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 512 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 513 /* generation currently unused */ 514 break; 515 } 516 case BTRFS_QGROUP_LIMIT_KEY: { 517 struct btrfs_qgroup_limit_item *ptr; 518 519 ptr = btrfs_item_ptr(l, slot, 520 struct btrfs_qgroup_limit_item); 521 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 522 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 523 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 524 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 525 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 526 break; 527 } 528 } 529 next1: 530 ret = btrfs_next_item(quota_root, path); 531 if (ret < 0) 532 goto out; 533 if (ret) 534 break; 535 } 536 btrfs_release_path(path); 537 538 /* 539 * pass 2: read all qgroup relations 540 */ 541 key.objectid = 0; 542 key.type = BTRFS_QGROUP_RELATION_KEY; 543 key.offset = 0; 544 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 545 if (ret) 546 goto out; 547 while (1) { 548 struct btrfs_qgroup_list *list = NULL; 549 550 slot = path->slots[0]; 551 l = path->nodes[0]; 552 btrfs_item_key_to_cpu(l, &found_key, slot); 553 554 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 555 goto next2; 556 557 if (found_key.objectid > found_key.offset) { 558 /* parent <- member, not needed to build config */ 559 /* FIXME should we omit the key completely? */ 560 goto next2; 561 } 562 563 list = kzalloc(sizeof(*list), GFP_KERNEL); 564 if (!list) { 565 ret = -ENOMEM; 566 goto out; 567 } 568 ret = add_relation_rb(fs_info, list, found_key.objectid, 569 found_key.offset); 570 list = NULL; 571 if (ret == -ENOENT) { 572 btrfs_warn(fs_info, 573 "orphan qgroup relation 0x%llx->0x%llx", 574 found_key.objectid, found_key.offset); 575 ret = 0; /* ignore the error */ 576 } 577 if (ret) 578 goto out; 579 next2: 580 ret = btrfs_next_item(quota_root, path); 581 if (ret < 0) 582 goto out; 583 if (ret) 584 break; 585 } 586 out: 587 btrfs_free_path(path); 588 fs_info->qgroup_flags |= flags; 589 if (ret >= 0) { 590 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) 591 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 592 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 593 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 594 } else { 595 ulist_free(fs_info->qgroup_ulist); 596 fs_info->qgroup_ulist = NULL; 597 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 598 btrfs_sysfs_del_qgroups(fs_info); 599 } 600 601 return ret < 0 ? ret : 0; 602 } 603 604 /* 605 * Called in close_ctree() when quota is still enabled. This verifies we don't 606 * leak some reserved space. 607 * 608 * Return false if no reserved space is left. 609 * Return true if some reserved space is leaked. 610 */ 611 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 612 { 613 struct rb_node *node; 614 bool ret = false; 615 616 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) 617 return ret; 618 /* 619 * Since we're unmounting, there is no race and no need to grab qgroup 620 * lock. 
And here we don't go post-order to provide a more user 621 * friendly sorted result. 622 */ 623 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 624 struct btrfs_qgroup *qgroup; 625 int i; 626 627 qgroup = rb_entry(node, struct btrfs_qgroup, node); 628 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 629 if (qgroup->rsv.values[i]) { 630 ret = true; 631 btrfs_warn(fs_info, 632 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 633 btrfs_qgroup_level(qgroup->qgroupid), 634 btrfs_qgroup_subvolid(qgroup->qgroupid), 635 i, qgroup->rsv.values[i]); 636 } 637 } 638 } 639 return ret; 640 } 641 642 /* 643 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 644 * first two are in single-threaded paths.And for the third one, we have set 645 * quota_root to be null with qgroup_lock held before, so it is safe to clean 646 * up the in-memory structures without qgroup_lock held. 647 */ 648 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 649 { 650 struct rb_node *n; 651 struct btrfs_qgroup *qgroup; 652 653 while ((n = rb_first(&fs_info->qgroup_tree))) { 654 qgroup = rb_entry(n, struct btrfs_qgroup, node); 655 rb_erase(n, &fs_info->qgroup_tree); 656 __del_qgroup_rb(fs_info, qgroup); 657 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 658 kfree(qgroup); 659 } 660 /* 661 * We call btrfs_free_qgroup_config() when unmounting 662 * filesystem and disabling quota, so we set qgroup_ulist 663 * to be null here to avoid double free. 664 */ 665 ulist_free(fs_info->qgroup_ulist); 666 fs_info->qgroup_ulist = NULL; 667 btrfs_sysfs_del_qgroups(fs_info); 668 } 669 670 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 671 u64 dst) 672 { 673 int ret; 674 struct btrfs_root *quota_root = trans->fs_info->quota_root; 675 struct btrfs_path *path; 676 struct btrfs_key key; 677 678 path = btrfs_alloc_path(); 679 if (!path) 680 return -ENOMEM; 681 682 key.objectid = src; 683 key.type = BTRFS_QGROUP_RELATION_KEY; 684 key.offset = dst; 685 686 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 687 688 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 689 690 btrfs_free_path(path); 691 return ret; 692 } 693 694 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 695 u64 dst) 696 { 697 int ret; 698 struct btrfs_root *quota_root = trans->fs_info->quota_root; 699 struct btrfs_path *path; 700 struct btrfs_key key; 701 702 path = btrfs_alloc_path(); 703 if (!path) 704 return -ENOMEM; 705 706 key.objectid = src; 707 key.type = BTRFS_QGROUP_RELATION_KEY; 708 key.offset = dst; 709 710 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 711 if (ret < 0) 712 goto out; 713 714 if (ret > 0) { 715 ret = -ENOENT; 716 goto out; 717 } 718 719 ret = btrfs_del_item(trans, quota_root, path); 720 out: 721 btrfs_free_path(path); 722 return ret; 723 } 724 725 static int add_qgroup_item(struct btrfs_trans_handle *trans, 726 struct btrfs_root *quota_root, u64 qgroupid) 727 { 728 int ret; 729 struct btrfs_path *path; 730 struct btrfs_qgroup_info_item *qgroup_info; 731 struct btrfs_qgroup_limit_item *qgroup_limit; 732 struct extent_buffer *leaf; 733 struct btrfs_key key; 734 735 if (btrfs_is_testing(quota_root->fs_info)) 736 return 0; 737 738 path = btrfs_alloc_path(); 739 if (!path) 740 return -ENOMEM; 741 742 key.objectid = 0; 743 key.type = BTRFS_QGROUP_INFO_KEY; 744 key.offset = qgroupid; 745 746 /* 747 * Avoid a transaction abort by catching -EEXIST here. 
In that 748 * case, we proceed by re-initializing the existing structure 749 * on disk. 750 */ 751 752 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 753 sizeof(*qgroup_info)); 754 if (ret && ret != -EEXIST) 755 goto out; 756 757 leaf = path->nodes[0]; 758 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 759 struct btrfs_qgroup_info_item); 760 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 761 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 762 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 763 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 764 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 765 766 btrfs_mark_buffer_dirty(trans, leaf); 767 768 btrfs_release_path(path); 769 770 key.type = BTRFS_QGROUP_LIMIT_KEY; 771 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 772 sizeof(*qgroup_limit)); 773 if (ret && ret != -EEXIST) 774 goto out; 775 776 leaf = path->nodes[0]; 777 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 778 struct btrfs_qgroup_limit_item); 779 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 780 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 781 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 782 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 783 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 784 785 btrfs_mark_buffer_dirty(trans, leaf); 786 787 ret = 0; 788 out: 789 btrfs_free_path(path); 790 return ret; 791 } 792 793 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 794 { 795 int ret; 796 struct btrfs_root *quota_root = trans->fs_info->quota_root; 797 struct btrfs_path *path; 798 struct btrfs_key key; 799 800 path = btrfs_alloc_path(); 801 if (!path) 802 return -ENOMEM; 803 804 key.objectid = 0; 805 key.type = BTRFS_QGROUP_INFO_KEY; 806 key.offset = qgroupid; 807 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 808 if (ret < 0) 809 goto out; 810 811 if (ret > 0) { 812 ret = -ENOENT; 813 goto out; 814 } 815 816 ret = btrfs_del_item(trans, quota_root, path); 817 if (ret) 818 goto out; 819 820 btrfs_release_path(path); 821 822 key.type = BTRFS_QGROUP_LIMIT_KEY; 823 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 824 if (ret < 0) 825 goto out; 826 827 if (ret > 0) { 828 ret = -ENOENT; 829 goto out; 830 } 831 832 ret = btrfs_del_item(trans, quota_root, path); 833 834 out: 835 btrfs_free_path(path); 836 return ret; 837 } 838 839 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 840 struct btrfs_qgroup *qgroup) 841 { 842 struct btrfs_root *quota_root = trans->fs_info->quota_root; 843 struct btrfs_path *path; 844 struct btrfs_key key; 845 struct extent_buffer *l; 846 struct btrfs_qgroup_limit_item *qgroup_limit; 847 int ret; 848 int slot; 849 850 key.objectid = 0; 851 key.type = BTRFS_QGROUP_LIMIT_KEY; 852 key.offset = qgroup->qgroupid; 853 854 path = btrfs_alloc_path(); 855 if (!path) 856 return -ENOMEM; 857 858 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 859 if (ret > 0) 860 ret = -ENOENT; 861 862 if (ret) 863 goto out; 864 865 l = path->nodes[0]; 866 slot = path->slots[0]; 867 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 868 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 869 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 870 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 871 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 872 btrfs_set_qgroup_limit_rsv_excl(l, 
qgroup_limit, qgroup->rsv_excl); 873 874 btrfs_mark_buffer_dirty(trans, l); 875 876 out: 877 btrfs_free_path(path); 878 return ret; 879 } 880 881 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 882 struct btrfs_qgroup *qgroup) 883 { 884 struct btrfs_fs_info *fs_info = trans->fs_info; 885 struct btrfs_root *quota_root = fs_info->quota_root; 886 struct btrfs_path *path; 887 struct btrfs_key key; 888 struct extent_buffer *l; 889 struct btrfs_qgroup_info_item *qgroup_info; 890 int ret; 891 int slot; 892 893 if (btrfs_is_testing(fs_info)) 894 return 0; 895 896 key.objectid = 0; 897 key.type = BTRFS_QGROUP_INFO_KEY; 898 key.offset = qgroup->qgroupid; 899 900 path = btrfs_alloc_path(); 901 if (!path) 902 return -ENOMEM; 903 904 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 905 if (ret > 0) 906 ret = -ENOENT; 907 908 if (ret) 909 goto out; 910 911 l = path->nodes[0]; 912 slot = path->slots[0]; 913 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 914 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 915 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 916 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 917 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 918 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 919 920 btrfs_mark_buffer_dirty(trans, l); 921 922 out: 923 btrfs_free_path(path); 924 return ret; 925 } 926 927 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 928 { 929 struct btrfs_fs_info *fs_info = trans->fs_info; 930 struct btrfs_root *quota_root = fs_info->quota_root; 931 struct btrfs_path *path; 932 struct btrfs_key key; 933 struct extent_buffer *l; 934 struct btrfs_qgroup_status_item *ptr; 935 int ret; 936 int slot; 937 938 key.objectid = 0; 939 key.type = BTRFS_QGROUP_STATUS_KEY; 940 key.offset = 0; 941 942 path = btrfs_alloc_path(); 943 if (!path) 944 return -ENOMEM; 945 946 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 947 if (ret > 0) 948 ret = -ENOENT; 949 950 if (ret) 951 goto out; 952 953 l = path->nodes[0]; 954 slot = path->slots[0]; 955 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 956 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & 957 BTRFS_QGROUP_STATUS_FLAGS_MASK); 958 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 959 btrfs_set_qgroup_status_rescan(l, ptr, 960 fs_info->qgroup_rescan_progress.objectid); 961 962 btrfs_mark_buffer_dirty(trans, l); 963 964 out: 965 btrfs_free_path(path); 966 return ret; 967 } 968 969 /* 970 * called with qgroup_lock held 971 */ 972 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 973 struct btrfs_root *root) 974 { 975 struct btrfs_path *path; 976 struct btrfs_key key; 977 struct extent_buffer *leaf = NULL; 978 int ret; 979 int nr = 0; 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return -ENOMEM; 984 985 key.objectid = 0; 986 key.offset = 0; 987 key.type = 0; 988 989 while (1) { 990 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 991 if (ret < 0) 992 goto out; 993 leaf = path->nodes[0]; 994 nr = btrfs_header_nritems(leaf); 995 if (!nr) 996 break; 997 /* 998 * delete the leaf one by one 999 * since the whole tree is going 1000 * to be deleted. 
1001 */ 1002 path->slots[0] = 0; 1003 ret = btrfs_del_items(trans, root, path, 0, nr); 1004 if (ret) 1005 goto out; 1006 1007 btrfs_release_path(path); 1008 } 1009 ret = 0; 1010 out: 1011 btrfs_free_path(path); 1012 return ret; 1013 } 1014 1015 int btrfs_quota_enable(struct btrfs_fs_info *fs_info, 1016 struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) 1017 { 1018 struct btrfs_root *quota_root; 1019 struct btrfs_root *tree_root = fs_info->tree_root; 1020 struct btrfs_path *path = NULL; 1021 struct btrfs_qgroup_status_item *ptr; 1022 struct extent_buffer *leaf; 1023 struct btrfs_key key; 1024 struct btrfs_key found_key; 1025 struct btrfs_qgroup *qgroup = NULL; 1026 struct btrfs_qgroup *prealloc = NULL; 1027 struct btrfs_trans_handle *trans = NULL; 1028 struct ulist *ulist = NULL; 1029 const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); 1030 int ret = 0; 1031 int slot; 1032 1033 /* 1034 * We need to have subvol_sem write locked, to prevent races between 1035 * concurrent tasks trying to enable quotas, because we will unlock 1036 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 1037 * and before setting BTRFS_FS_QUOTA_ENABLED. 1038 */ 1039 lockdep_assert_held_write(&fs_info->subvol_sem); 1040 1041 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 1042 btrfs_err(fs_info, 1043 "qgroups are currently unsupported in extent tree v2"); 1044 return -EINVAL; 1045 } 1046 1047 mutex_lock(&fs_info->qgroup_ioctl_lock); 1048 if (fs_info->quota_root) 1049 goto out; 1050 1051 ulist = ulist_alloc(GFP_KERNEL); 1052 if (!ulist) { 1053 ret = -ENOMEM; 1054 goto out; 1055 } 1056 1057 ret = btrfs_sysfs_add_qgroups(fs_info); 1058 if (ret < 0) 1059 goto out; 1060 1061 /* 1062 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 1063 * avoid lock acquisition inversion problems (reported by lockdep) between 1064 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 1065 * start a transaction. 1066 * After we started the transaction lock qgroup_ioctl_lock again and 1067 * check if someone else created the quota root in the meanwhile. If so, 1068 * just return success and release the transaction handle. 1069 * 1070 * Also we don't need to worry about someone else calling 1071 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 1072 * that function returns 0 (success) when the sysfs entries already exist. 1073 */ 1074 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1075 1076 /* 1077 * 1 for quota root item 1078 * 1 for BTRFS_QGROUP_STATUS item 1079 * 1080 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 1081 * per subvolume. However those are not currently reserved since it 1082 * would be a lot of overkill. 
1083 */ 1084 trans = btrfs_start_transaction(tree_root, 2); 1085 1086 mutex_lock(&fs_info->qgroup_ioctl_lock); 1087 if (IS_ERR(trans)) { 1088 ret = PTR_ERR(trans); 1089 trans = NULL; 1090 goto out; 1091 } 1092 1093 if (fs_info->quota_root) 1094 goto out; 1095 1096 fs_info->qgroup_ulist = ulist; 1097 ulist = NULL; 1098 1099 /* 1100 * initially create the quota tree 1101 */ 1102 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1103 if (IS_ERR(quota_root)) { 1104 ret = PTR_ERR(quota_root); 1105 btrfs_abort_transaction(trans, ret); 1106 goto out; 1107 } 1108 1109 path = btrfs_alloc_path(); 1110 if (!path) { 1111 ret = -ENOMEM; 1112 btrfs_abort_transaction(trans, ret); 1113 goto out_free_root; 1114 } 1115 1116 key.objectid = 0; 1117 key.type = BTRFS_QGROUP_STATUS_KEY; 1118 key.offset = 0; 1119 1120 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1121 sizeof(*ptr)); 1122 if (ret) { 1123 btrfs_abort_transaction(trans, ret); 1124 goto out_free_path; 1125 } 1126 1127 leaf = path->nodes[0]; 1128 ptr = btrfs_item_ptr(leaf, path->slots[0], 1129 struct btrfs_qgroup_status_item); 1130 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1131 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1132 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; 1133 if (simple) { 1134 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; 1135 btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); 1136 } else { 1137 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1138 } 1139 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & 1140 BTRFS_QGROUP_STATUS_FLAGS_MASK); 1141 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1142 1143 btrfs_mark_buffer_dirty(trans, leaf); 1144 1145 key.objectid = 0; 1146 key.type = BTRFS_ROOT_REF_KEY; 1147 key.offset = 0; 1148 1149 btrfs_release_path(path); 1150 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1151 if (ret > 0) 1152 goto out_add_root; 1153 if (ret < 0) { 1154 btrfs_abort_transaction(trans, ret); 1155 goto out_free_path; 1156 } 1157 1158 while (1) { 1159 slot = path->slots[0]; 1160 leaf = path->nodes[0]; 1161 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1162 1163 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1164 1165 /* Release locks on tree_root before we access quota_root */ 1166 btrfs_release_path(path); 1167 1168 /* We should not have a stray @prealloc pointer. */ 1169 ASSERT(prealloc == NULL); 1170 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 1171 if (!prealloc) { 1172 ret = -ENOMEM; 1173 btrfs_abort_transaction(trans, ret); 1174 goto out_free_path; 1175 } 1176 1177 ret = add_qgroup_item(trans, quota_root, 1178 found_key.offset); 1179 if (ret) { 1180 btrfs_abort_transaction(trans, ret); 1181 goto out_free_path; 1182 } 1183 1184 qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); 1185 prealloc = NULL; 1186 if (IS_ERR(qgroup)) { 1187 ret = PTR_ERR(qgroup); 1188 btrfs_abort_transaction(trans, ret); 1189 goto out_free_path; 1190 } 1191 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1192 if (ret < 0) { 1193 btrfs_abort_transaction(trans, ret); 1194 goto out_free_path; 1195 } 1196 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1197 path, 1, 0); 1198 if (ret < 0) { 1199 btrfs_abort_transaction(trans, ret); 1200 goto out_free_path; 1201 } 1202 if (ret > 0) { 1203 /* 1204 * Shouldn't happen, but in case it does we 1205 * don't need to do the btrfs_next_item, just 1206 * continue. 
1207 */ 1208 continue; 1209 } 1210 } 1211 ret = btrfs_next_item(tree_root, path); 1212 if (ret < 0) { 1213 btrfs_abort_transaction(trans, ret); 1214 goto out_free_path; 1215 } 1216 if (ret) 1217 break; 1218 } 1219 1220 out_add_root: 1221 btrfs_release_path(path); 1222 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1223 if (ret) { 1224 btrfs_abort_transaction(trans, ret); 1225 goto out_free_path; 1226 } 1227 1228 ASSERT(prealloc == NULL); 1229 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 1230 if (!prealloc) { 1231 ret = -ENOMEM; 1232 goto out_free_path; 1233 } 1234 qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); 1235 prealloc = NULL; 1236 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1237 if (ret < 0) { 1238 btrfs_abort_transaction(trans, ret); 1239 goto out_free_path; 1240 } 1241 1242 fs_info->qgroup_enable_gen = trans->transid; 1243 1244 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1245 /* 1246 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1247 * a deadlock with tasks concurrently doing other qgroup operations, such 1248 * adding/removing qgroups or adding/deleting qgroup relations for example, 1249 * because all qgroup operations first start or join a transaction and then 1250 * lock the qgroup_ioctl_lock mutex. 1251 * We are safe from a concurrent task trying to enable quotas, by calling 1252 * this function, since we are serialized by fs_info->subvol_sem. 1253 */ 1254 ret = btrfs_commit_transaction(trans); 1255 trans = NULL; 1256 mutex_lock(&fs_info->qgroup_ioctl_lock); 1257 if (ret) 1258 goto out_free_path; 1259 1260 /* 1261 * Set quota enabled flag after committing the transaction, to avoid 1262 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1263 * creation. 1264 */ 1265 spin_lock(&fs_info->qgroup_lock); 1266 fs_info->quota_root = quota_root; 1267 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1268 if (simple) 1269 btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); 1270 spin_unlock(&fs_info->qgroup_lock); 1271 1272 /* Skip rescan for simple qgroups. */ 1273 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 1274 goto out_free_path; 1275 1276 ret = qgroup_rescan_init(fs_info, 0, 1); 1277 if (!ret) { 1278 qgroup_rescan_zero_tracking(fs_info); 1279 fs_info->qgroup_rescan_running = true; 1280 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1281 &fs_info->qgroup_rescan_work); 1282 } else { 1283 /* 1284 * We have set both BTRFS_FS_QUOTA_ENABLED and 1285 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with 1286 * -EINPROGRESS. That can happen because someone started the 1287 * rescan worker by calling quota rescan ioctl before we 1288 * attempted to initialize the rescan worker. Failure due to 1289 * quotas disabled in the meanwhile is not possible, because 1290 * we are holding a write lock on fs_info->subvol_sem, which 1291 * is also acquired when disabling quotas. 1292 * Ignore such error, and any other error would need to undo 1293 * everything we did in the transaction we just committed. 
1294 */ 1295 ASSERT(ret == -EINPROGRESS); 1296 ret = 0; 1297 } 1298 1299 out_free_path: 1300 btrfs_free_path(path); 1301 out_free_root: 1302 if (ret) 1303 btrfs_put_root(quota_root); 1304 out: 1305 if (ret) { 1306 ulist_free(fs_info->qgroup_ulist); 1307 fs_info->qgroup_ulist = NULL; 1308 btrfs_sysfs_del_qgroups(fs_info); 1309 } 1310 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1311 if (ret && trans) 1312 btrfs_end_transaction(trans); 1313 else if (trans) 1314 ret = btrfs_end_transaction(trans); 1315 ulist_free(ulist); 1316 kfree(prealloc); 1317 return ret; 1318 } 1319 1320 /* 1321 * It is possible to have outstanding ordered extents which reserved bytes 1322 * before we disabled. We need to fully flush delalloc, ordered extents, and a 1323 * commit to ensure that we don't leak such reservations, only to have them 1324 * come back if we re-enable. 1325 * 1326 * - enable simple quotas 1327 * - reserve space 1328 * - release it, store rsv_bytes in OE 1329 * - disable quotas 1330 * - enable simple quotas (qgroup rsv are all 0) 1331 * - OE finishes 1332 * - run delayed refs 1333 * - free rsv_bytes, resulting in miscounting or even underflow 1334 */ 1335 static int flush_reservations(struct btrfs_fs_info *fs_info) 1336 { 1337 struct btrfs_trans_handle *trans; 1338 int ret; 1339 1340 ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); 1341 if (ret) 1342 return ret; 1343 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 1344 trans = btrfs_join_transaction(fs_info->tree_root); 1345 if (IS_ERR(trans)) 1346 return PTR_ERR(trans); 1347 ret = btrfs_commit_transaction(trans); 1348 1349 return ret; 1350 } 1351 1352 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1353 { 1354 struct btrfs_root *quota_root; 1355 struct btrfs_trans_handle *trans = NULL; 1356 int ret = 0; 1357 1358 /* 1359 * We need to have subvol_sem write locked to prevent races with 1360 * snapshot creation. 1361 */ 1362 lockdep_assert_held_write(&fs_info->subvol_sem); 1363 1364 /* 1365 * Relocation will mess with backrefs, so make sure we have the 1366 * cleaner_mutex held to protect us from relocate. 1367 */ 1368 lockdep_assert_held(&fs_info->cleaner_mutex); 1369 1370 mutex_lock(&fs_info->qgroup_ioctl_lock); 1371 if (!fs_info->quota_root) 1372 goto out; 1373 1374 /* 1375 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1376 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1377 * to lock that mutex while holding a transaction handle and the rescan 1378 * worker needs to commit a transaction. 1379 */ 1380 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1381 1382 /* 1383 * Request qgroup rescan worker to complete and wait for it. This wait 1384 * must be done before transaction start for quota disable since it may 1385 * deadlock with transaction by the qgroup rescan worker. 1386 */ 1387 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1388 btrfs_qgroup_wait_for_completion(fs_info, false); 1389 1390 /* 1391 * We have nothing held here and no trans handle, just return the error 1392 * if there is one. 1393 */ 1394 ret = flush_reservations(fs_info); 1395 if (ret) 1396 return ret; 1397 1398 /* 1399 * 1 For the root item 1400 * 1401 * We should also reserve enough items for the quota tree deletion in 1402 * btrfs_clean_quota_tree but this is not done. 1403 * 1404 * Also, we must always start a transaction without holding the mutex 1405 * qgroup_ioctl_lock, see btrfs_quota_enable(). 
1406 */ 1407 trans = btrfs_start_transaction(fs_info->tree_root, 1); 1408 1409 mutex_lock(&fs_info->qgroup_ioctl_lock); 1410 if (IS_ERR(trans)) { 1411 ret = PTR_ERR(trans); 1412 trans = NULL; 1413 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1414 goto out; 1415 } 1416 1417 if (!fs_info->quota_root) 1418 goto out; 1419 1420 spin_lock(&fs_info->qgroup_lock); 1421 quota_root = fs_info->quota_root; 1422 fs_info->quota_root = NULL; 1423 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 1424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; 1425 fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; 1426 spin_unlock(&fs_info->qgroup_lock); 1427 1428 btrfs_free_qgroup_config(fs_info); 1429 1430 ret = btrfs_clean_quota_tree(trans, quota_root); 1431 if (ret) { 1432 btrfs_abort_transaction(trans, ret); 1433 goto out; 1434 } 1435 1436 ret = btrfs_del_root(trans, "a_root->root_key); 1437 if (ret) { 1438 btrfs_abort_transaction(trans, ret); 1439 goto out; 1440 } 1441 1442 spin_lock(&fs_info->trans_lock); 1443 list_del("a_root->dirty_list); 1444 spin_unlock(&fs_info->trans_lock); 1445 1446 btrfs_tree_lock(quota_root->node); 1447 btrfs_clear_buffer_dirty(trans, quota_root->node); 1448 btrfs_tree_unlock(quota_root->node); 1449 btrfs_free_tree_block(trans, btrfs_root_id(quota_root), 1450 quota_root->node, 0, 1); 1451 1452 btrfs_put_root(quota_root); 1453 1454 out: 1455 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1456 if (ret && trans) 1457 btrfs_end_transaction(trans); 1458 else if (trans) 1459 ret = btrfs_commit_transaction(trans); 1460 return ret; 1461 } 1462 1463 static void qgroup_dirty(struct btrfs_fs_info *fs_info, 1464 struct btrfs_qgroup *qgroup) 1465 { 1466 if (list_empty(&qgroup->dirty)) 1467 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1468 } 1469 1470 static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) 1471 { 1472 if (!list_empty(&qgroup->iterator)) 1473 return; 1474 1475 list_add_tail(&qgroup->iterator, head); 1476 } 1477 1478 static void qgroup_iterator_clean(struct list_head *head) 1479 { 1480 while (!list_empty(head)) { 1481 struct btrfs_qgroup *qgroup; 1482 1483 qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); 1484 list_del_init(&qgroup->iterator); 1485 } 1486 } 1487 1488 /* 1489 * The easy accounting, we're updating qgroup relationship whose child qgroup 1490 * only has exclusive extents. 1491 * 1492 * In this case, all exclusive extents will also be exclusive for parent, so 1493 * excl/rfer just get added/removed. 1494 * 1495 * So is qgroup reservation space, which should also be added/removed to 1496 * parent. 1497 * Or when child tries to release reservation space, parent will underflow its 1498 * reservation (for relationship adding case). 1499 * 1500 * Caller should hold fs_info->qgroup_lock. 
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	/* Walk the qgroup and all of its ancestors, updating each one. */
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;

		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		qgroup->excl_cmpr += sign * num_bytes;

		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup_dirty(fs_info, qgroup);

		/* Append parent qgroups to @qgroup_list. */
		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
	ret = 0;
out:
	qgroup_iterator_clean(&qgroup_list);
	return ret;
}

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, updating all parent qgroups is enough.
 * Otherwise a full rescan is needed.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for a quick update.
 * Return >0 if a full rescan is needed; the INCONSISTENT flag is set in that case.
 * Return <0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   u64 src, u64 dst, int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
		if (ret < 0)
			goto out;
		ret = 0;
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}

int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct btrfs_qgroup_list *prealloc = NULL;
	int ret = 0;

	/* Check the level of src and dst first. */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* Check if such a qgroup relation exists already. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	prealloc = kzalloc(sizeof(*list), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}
	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = __add_relation_rb(prealloc, member, parent);
	prealloc = NULL;
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	kfree(prealloc);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	bool found = false;
	int ret = 0;
	int ret2;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, then only try to delete
	 * the stale relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* Check if such a qgroup relation exists already. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0. */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *prealloc = NULL;
	int ret = 0;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
	prealloc = NULL;

	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	kfree(prealloc);
	return ret;
}

static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
{
	return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
		qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
		qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
		ret = -EBUSY;
		goto out;
	}

	/* Check that this qgroup has no children. */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we want to clear the limit on this qgroup.
	 * To meet this requirement, we treat -1 as a special value
	 * which tells the kernel to clear the limit on this qgroup.
1828 */ 1829 const u64 CLEAR_VALUE = -1; 1830 1831 mutex_lock(&fs_info->qgroup_ioctl_lock); 1832 if (!fs_info->quota_root) { 1833 ret = -ENOTCONN; 1834 goto out; 1835 } 1836 1837 qgroup = find_qgroup_rb(fs_info, qgroupid); 1838 if (!qgroup) { 1839 ret = -ENOENT; 1840 goto out; 1841 } 1842 1843 spin_lock(&fs_info->qgroup_lock); 1844 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1845 if (limit->max_rfer == CLEAR_VALUE) { 1846 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1847 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1848 qgroup->max_rfer = 0; 1849 } else { 1850 qgroup->max_rfer = limit->max_rfer; 1851 } 1852 } 1853 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1854 if (limit->max_excl == CLEAR_VALUE) { 1855 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1856 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1857 qgroup->max_excl = 0; 1858 } else { 1859 qgroup->max_excl = limit->max_excl; 1860 } 1861 } 1862 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1863 if (limit->rsv_rfer == CLEAR_VALUE) { 1864 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1865 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1866 qgroup->rsv_rfer = 0; 1867 } else { 1868 qgroup->rsv_rfer = limit->rsv_rfer; 1869 } 1870 } 1871 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1872 if (limit->rsv_excl == CLEAR_VALUE) { 1873 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1874 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1875 qgroup->rsv_excl = 0; 1876 } else { 1877 qgroup->rsv_excl = limit->rsv_excl; 1878 } 1879 } 1880 qgroup->lim_flags |= limit->flags; 1881 1882 spin_unlock(&fs_info->qgroup_lock); 1883 1884 ret = update_qgroup_limit_item(trans, qgroup); 1885 if (ret) { 1886 qgroup_mark_inconsistent(fs_info); 1887 btrfs_info(fs_info, "unable to update quota limit for %llu", 1888 qgroupid); 1889 } 1890 1891 out: 1892 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1893 return ret; 1894 } 1895 1896 /* 1897 * Inform qgroup to trace one dirty extent, its info is recorded in @record. 1898 * So qgroup can account it at transaction committing time. 1899 * 1900 * No lock version, caller must acquire delayed ref lock and allocated memory, 1901 * then call btrfs_qgroup_trace_extent_post() after exiting lock context. 1902 * 1903 * Return 0 for success insert 1904 * Return >0 for existing record, caller can free @record safely. 
1905 * Error is not possible 1906 */ 1907 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1908 struct btrfs_delayed_ref_root *delayed_refs, 1909 struct btrfs_qgroup_extent_record *record) 1910 { 1911 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1912 struct rb_node *parent_node = NULL; 1913 struct btrfs_qgroup_extent_record *entry; 1914 u64 bytenr = record->bytenr; 1915 1916 if (!btrfs_qgroup_full_accounting(fs_info)) 1917 return 1; 1918 1919 lockdep_assert_held(&delayed_refs->lock); 1920 trace_btrfs_qgroup_trace_extent(fs_info, record); 1921 1922 while (*p) { 1923 parent_node = *p; 1924 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1925 node); 1926 if (bytenr < entry->bytenr) { 1927 p = &(*p)->rb_left; 1928 } else if (bytenr > entry->bytenr) { 1929 p = &(*p)->rb_right; 1930 } else { 1931 if (record->data_rsv && !entry->data_rsv) { 1932 entry->data_rsv = record->data_rsv; 1933 entry->data_rsv_refroot = 1934 record->data_rsv_refroot; 1935 } 1936 return 1; 1937 } 1938 } 1939 1940 rb_link_node(&record->node, parent_node, p); 1941 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1942 return 0; 1943 } 1944 1945 /* 1946 * Post handler after qgroup_trace_extent_nolock(). 1947 * 1948 * NOTE: Current qgroup does the expensive backref walk at transaction 1949 * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming 1950 * new transaction. 1951 * This is designed to allow btrfs_find_all_roots() to get correct new_roots 1952 * result. 1953 * 1954 * However for old_roots there is no need to do backref walk at that time, 1955 * since we search commit roots to walk backref and result will always be 1956 * correct. 1957 * 1958 * Due to the nature of no lock version, we can't do backref there. 1959 * So we must call btrfs_qgroup_trace_extent_post() after exiting 1960 * spinlock context. 1961 * 1962 * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result 1963 * using current root, then we can move all expensive backref walk out of 1964 * transaction committing, but not now as qgroup accounting will be wrong again. 1965 */ 1966 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1967 struct btrfs_qgroup_extent_record *qrecord) 1968 { 1969 struct btrfs_backref_walk_ctx ctx = { 0 }; 1970 int ret; 1971 1972 if (!btrfs_qgroup_full_accounting(trans->fs_info)) 1973 return 0; 1974 /* 1975 * We are always called in a context where we are already holding a 1976 * transaction handle. Often we are called when adding a data delayed 1977 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1978 * in which case we will be holding a write lock on extent buffer from a 1979 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1980 * acquire fs_info->commit_root_sem, because that is a higher level lock 1981 * that must be acquired before locking any extent buffers. 1982 * 1983 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1984 * but we can't pass it a non-NULL transaction handle, because otherwise 1985 * it would not use commit roots and would lock extent buffers, causing 1986 * a deadlock if it ends up trying to read lock the same extent buffer 1987 * that was previously write locked at btrfs_truncate_inode_items(). 1988 * 1989 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1990 * explicitly tell it to not acquire the commit_root_sem - if we are 1991 * holding a transaction handle we don't need its protection. 
1992 */ 1993 ASSERT(trans != NULL); 1994 1995 if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 1996 return 0; 1997 1998 ctx.bytenr = qrecord->bytenr; 1999 ctx.fs_info = trans->fs_info; 2000 2001 ret = btrfs_find_all_roots(&ctx, true); 2002 if (ret < 0) { 2003 qgroup_mark_inconsistent(trans->fs_info); 2004 btrfs_warn(trans->fs_info, 2005 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 2006 ret); 2007 return 0; 2008 } 2009 2010 /* 2011 * Here we don't need to get the lock of 2012 * trans->transaction->delayed_refs, since inserted qrecord won't 2013 * be deleted, only qrecord->node may be modified (new qrecord insert) 2014 * 2015 * So modifying qrecord->old_roots is safe here 2016 */ 2017 qrecord->old_roots = ctx.roots; 2018 return 0; 2019 } 2020 2021 /* 2022 * Inform qgroup to trace one dirty extent, specified by @bytenr and 2023 * @num_bytes. 2024 * So qgroup can account it at commit trans time. 2025 * 2026 * Better encapsulated version, with memory allocation and backref walk for 2027 * commit roots. 2028 * So this can sleep. 2029 * 2030 * Return 0 if the operation is done. 2031 * Return <0 for error, like memory allocation failure or invalid parameter 2032 * (NULL trans) 2033 */ 2034 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2035 u64 num_bytes) 2036 { 2037 struct btrfs_fs_info *fs_info = trans->fs_info; 2038 struct btrfs_qgroup_extent_record *record; 2039 struct btrfs_delayed_ref_root *delayed_refs; 2040 int ret; 2041 2042 if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) 2043 return 0; 2044 record = kzalloc(sizeof(*record), GFP_NOFS); 2045 if (!record) 2046 return -ENOMEM; 2047 2048 delayed_refs = &trans->transaction->delayed_refs; 2049 record->bytenr = bytenr; 2050 record->num_bytes = num_bytes; 2051 record->old_roots = NULL; 2052 2053 spin_lock(&delayed_refs->lock); 2054 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 2055 spin_unlock(&delayed_refs->lock); 2056 if (ret > 0) { 2057 kfree(record); 2058 return 0; 2059 } 2060 return btrfs_qgroup_trace_extent_post(trans, record); 2061 } 2062 2063 /* 2064 * Inform qgroup to trace all leaf items of data 2065 * 2066 * Return 0 for success 2067 * Return <0 for error(ENOMEM) 2068 */ 2069 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 2070 struct extent_buffer *eb) 2071 { 2072 struct btrfs_fs_info *fs_info = trans->fs_info; 2073 int nr = btrfs_header_nritems(eb); 2074 int i, extent_type, ret; 2075 struct btrfs_key key; 2076 struct btrfs_file_extent_item *fi; 2077 u64 bytenr, num_bytes; 2078 2079 /* We can be called directly from walk_up_proc() */ 2080 if (!btrfs_qgroup_full_accounting(fs_info)) 2081 return 0; 2082 2083 for (i = 0; i < nr; i++) { 2084 btrfs_item_key_to_cpu(eb, &key, i); 2085 2086 if (key.type != BTRFS_EXTENT_DATA_KEY) 2087 continue; 2088 2089 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2090 /* filter out non qgroup-accountable extents */ 2091 extent_type = btrfs_file_extent_type(eb, fi); 2092 2093 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 2094 continue; 2095 2096 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 2097 if (!bytenr) 2098 continue; 2099 2100 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 2101 2102 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 2103 if (ret) 2104 return ret; 2105 } 2106 cond_resched(); 2107 return 0; 2108 } 2109 2110 /* 2111 * Walk up the tree from the bottom, freeing leaves and any interior 2112 * nodes 
which have had all slots visited. If a node (leaf or 2113 * interior) is freed, the node above it will have it's slot 2114 * incremented. The root node will never be freed. 2115 * 2116 * At the end of this function, we should have a path which has all 2117 * slots incremented to the next position for a search. If we need to 2118 * read a new node it will be NULL and the node above it will have the 2119 * correct slot selected for a later read. 2120 * 2121 * If we increment the root nodes slot counter past the number of 2122 * elements, 1 is returned to signal completion of the search. 2123 */ 2124 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 2125 { 2126 int level = 0; 2127 int nr, slot; 2128 struct extent_buffer *eb; 2129 2130 if (root_level == 0) 2131 return 1; 2132 2133 while (level <= root_level) { 2134 eb = path->nodes[level]; 2135 nr = btrfs_header_nritems(eb); 2136 path->slots[level]++; 2137 slot = path->slots[level]; 2138 if (slot >= nr || level == 0) { 2139 /* 2140 * Don't free the root - we will detect this 2141 * condition after our loop and return a 2142 * positive value for caller to stop walking the tree. 2143 */ 2144 if (level != root_level) { 2145 btrfs_tree_unlock_rw(eb, path->locks[level]); 2146 path->locks[level] = 0; 2147 2148 free_extent_buffer(eb); 2149 path->nodes[level] = NULL; 2150 path->slots[level] = 0; 2151 } 2152 } else { 2153 /* 2154 * We have a valid slot to walk back down 2155 * from. Stop here so caller can process these 2156 * new nodes. 2157 */ 2158 break; 2159 } 2160 2161 level++; 2162 } 2163 2164 eb = path->nodes[root_level]; 2165 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2166 return 1; 2167 2168 return 0; 2169 } 2170 2171 /* 2172 * Helper function to trace a subtree tree block swap. 2173 * 2174 * The swap will happen in highest tree block, but there may be a lot of 2175 * tree blocks involved. 2176 * 2177 * For example: 2178 * OO = Old tree blocks 2179 * NN = New tree blocks allocated during balance 2180 * 2181 * File tree (257) Reloc tree for 257 2182 * L2 OO NN 2183 * / \ / \ 2184 * L1 OO OO (a) OO NN (a) 2185 * / \ / \ / \ / \ 2186 * L0 OO OO OO OO OO OO NN NN 2187 * (b) (c) (b) (c) 2188 * 2189 * When calling qgroup_trace_extent_swap(), we will pass: 2190 * @src_eb = OO(a) 2191 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 2192 * @dst_level = 0 2193 * @root_level = 1 2194 * 2195 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 2196 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 2197 * 2198 * The main work of qgroup_trace_extent_swap() can be split into 3 parts: 2199 * 2200 * 1) Tree search from @src_eb 2201 * It should acts as a simplified btrfs_search_slot(). 2202 * The key for search can be extracted from @dst_path->nodes[dst_level] 2203 * (first key). 2204 * 2205 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty 2206 * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. 2207 * They should be marked during previous (@dst_level = 1) iteration. 2208 * 2209 * 3) Mark file extents in leaves dirty 2210 * We don't have good way to pick out new file extents only. 2211 * So we still follow the old method by scanning all file extents in 2212 * the leave. 2213 * 2214 * This function can free us from keeping two paths, thus later we only need 2215 * to care about how to iterate all new tree blocks in reloc tree. 
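 *
 * For the example above, the @dst_level = 0 call ends up recording the
 * tree blocks OO(c) and NN(c) through btrfs_qgroup_trace_extent(), and,
 * when @trace_leaf is set, also every file extent in both leaves through
 * btrfs_qgroup_trace_leaf_items().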
2216 */ 2217 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, 2218 struct extent_buffer *src_eb, 2219 struct btrfs_path *dst_path, 2220 int dst_level, int root_level, 2221 bool trace_leaf) 2222 { 2223 struct btrfs_key key; 2224 struct btrfs_path *src_path; 2225 struct btrfs_fs_info *fs_info = trans->fs_info; 2226 u32 nodesize = fs_info->nodesize; 2227 int cur_level = root_level; 2228 int ret; 2229 2230 BUG_ON(dst_level > root_level); 2231 /* Level mismatch */ 2232 if (btrfs_header_level(src_eb) != root_level) 2233 return -EINVAL; 2234 2235 src_path = btrfs_alloc_path(); 2236 if (!src_path) { 2237 ret = -ENOMEM; 2238 goto out; 2239 } 2240 2241 if (dst_level) 2242 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2243 else 2244 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2245 2246 /* For src_path */ 2247 atomic_inc(&src_eb->refs); 2248 src_path->nodes[root_level] = src_eb; 2249 src_path->slots[root_level] = dst_path->slots[root_level]; 2250 src_path->locks[root_level] = 0; 2251 2252 /* A simplified version of btrfs_search_slot() */ 2253 while (cur_level >= dst_level) { 2254 struct btrfs_key src_key; 2255 struct btrfs_key dst_key; 2256 2257 if (src_path->nodes[cur_level] == NULL) { 2258 struct extent_buffer *eb; 2259 int parent_slot; 2260 2261 eb = src_path->nodes[cur_level + 1]; 2262 parent_slot = src_path->slots[cur_level + 1]; 2263 2264 eb = btrfs_read_node_slot(eb, parent_slot); 2265 if (IS_ERR(eb)) { 2266 ret = PTR_ERR(eb); 2267 goto out; 2268 } 2269 2270 src_path->nodes[cur_level] = eb; 2271 2272 btrfs_tree_read_lock(eb); 2273 src_path->locks[cur_level] = BTRFS_READ_LOCK; 2274 } 2275 2276 src_path->slots[cur_level] = dst_path->slots[cur_level]; 2277 if (cur_level) { 2278 btrfs_node_key_to_cpu(dst_path->nodes[cur_level], 2279 &dst_key, dst_path->slots[cur_level]); 2280 btrfs_node_key_to_cpu(src_path->nodes[cur_level], 2281 &src_key, src_path->slots[cur_level]); 2282 } else { 2283 btrfs_item_key_to_cpu(dst_path->nodes[cur_level], 2284 &dst_key, dst_path->slots[cur_level]); 2285 btrfs_item_key_to_cpu(src_path->nodes[cur_level], 2286 &src_key, src_path->slots[cur_level]); 2287 } 2288 /* Content mismatch, something went wrong */ 2289 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { 2290 ret = -ENOENT; 2291 goto out; 2292 } 2293 cur_level--; 2294 } 2295 2296 /* 2297 * Now both @dst_path and @src_path have been populated, record the tree 2298 * blocks for qgroup accounting. 2299 */ 2300 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2301 nodesize); 2302 if (ret < 0) 2303 goto out; 2304 ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, 2305 nodesize); 2306 if (ret < 0) 2307 goto out; 2308 2309 /* Record leaf file extents */ 2310 if (dst_level == 0 && trace_leaf) { 2311 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2312 if (ret < 0) 2313 goto out; 2314 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2315 } 2316 out: 2317 btrfs_free_path(src_path); 2318 return ret; 2319 } 2320 2321 /* 2322 * Helper function to do recursive generation-aware depth-first search, to 2323 * locate all new tree blocks in a subtree of reloc tree. 2324 * 2325 * E.g. 
(OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2326 * reloc tree 2327 * L2 NN (a) 2328 * / \ 2329 * L1 OO NN (b) 2330 * / \ / \ 2331 * L0 OO OO OO NN 2332 * (c) (d) 2333 * If we pass: 2334 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2335 * @cur_level = 1 2336 * @root_level = 1 2337 * 2338 * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace 2339 * above tree blocks along with their counter parts in file tree. 2340 * While during search, old tree blocks OO(c) will be skipped as tree block swap 2341 * won't affect OO(c). 2342 */ 2343 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, 2344 struct extent_buffer *src_eb, 2345 struct btrfs_path *dst_path, 2346 int cur_level, int root_level, 2347 u64 last_snapshot, bool trace_leaf) 2348 { 2349 struct btrfs_fs_info *fs_info = trans->fs_info; 2350 struct extent_buffer *eb; 2351 bool need_cleanup = false; 2352 int ret = 0; 2353 int i; 2354 2355 /* Level sanity check */ 2356 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2357 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2358 root_level < cur_level) { 2359 btrfs_err_rl(fs_info, 2360 "%s: bad levels, cur_level=%d root_level=%d", 2361 __func__, cur_level, root_level); 2362 return -EUCLEAN; 2363 } 2364 2365 /* Read the tree block if needed */ 2366 if (dst_path->nodes[cur_level] == NULL) { 2367 int parent_slot; 2368 u64 child_gen; 2369 2370 /* 2371 * dst_path->nodes[root_level] must be initialized before 2372 * calling this function. 2373 */ 2374 if (cur_level == root_level) { 2375 btrfs_err_rl(fs_info, 2376 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2377 __func__, root_level, root_level, cur_level); 2378 return -EUCLEAN; 2379 } 2380 2381 /* 2382 * We need to get child blockptr/gen from parent before we can 2383 * read it. 
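		 * A child whose generation is older than @last_snapshot was
		 * created before the snapshot and thus cannot be part of the
		 * block swap, so it is skipped right below instead of read.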
2384 */ 2385 eb = dst_path->nodes[cur_level + 1]; 2386 parent_slot = dst_path->slots[cur_level + 1]; 2387 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2388 2389 /* This node is old, no need to trace */ 2390 if (child_gen < last_snapshot) 2391 goto out; 2392 2393 eb = btrfs_read_node_slot(eb, parent_slot); 2394 if (IS_ERR(eb)) { 2395 ret = PTR_ERR(eb); 2396 goto out; 2397 } 2398 2399 dst_path->nodes[cur_level] = eb; 2400 dst_path->slots[cur_level] = 0; 2401 2402 btrfs_tree_read_lock(eb); 2403 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2404 need_cleanup = true; 2405 } 2406 2407 /* Now record this tree block and its counter part for qgroups */ 2408 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2409 root_level, trace_leaf); 2410 if (ret < 0) 2411 goto cleanup; 2412 2413 eb = dst_path->nodes[cur_level]; 2414 2415 if (cur_level > 0) { 2416 /* Iterate all child tree blocks */ 2417 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2418 /* Skip old tree blocks as they won't be swapped */ 2419 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2420 continue; 2421 dst_path->slots[cur_level] = i; 2422 2423 /* Recursive call (at most 7 times) */ 2424 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2425 dst_path, cur_level - 1, root_level, 2426 last_snapshot, trace_leaf); 2427 if (ret < 0) 2428 goto cleanup; 2429 } 2430 } 2431 2432 cleanup: 2433 if (need_cleanup) { 2434 /* Clean up */ 2435 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2436 dst_path->locks[cur_level]); 2437 free_extent_buffer(dst_path->nodes[cur_level]); 2438 dst_path->nodes[cur_level] = NULL; 2439 dst_path->slots[cur_level] = 0; 2440 dst_path->locks[cur_level] = 0; 2441 } 2442 out: 2443 return ret; 2444 } 2445 2446 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2447 struct extent_buffer *src_eb, 2448 struct extent_buffer *dst_eb, 2449 u64 last_snapshot, bool trace_leaf) 2450 { 2451 struct btrfs_fs_info *fs_info = trans->fs_info; 2452 struct btrfs_path *dst_path = NULL; 2453 int level; 2454 int ret; 2455 2456 if (!btrfs_qgroup_full_accounting(fs_info)) 2457 return 0; 2458 2459 /* Wrong parameter order */ 2460 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2461 btrfs_err_rl(fs_info, 2462 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2463 btrfs_header_generation(src_eb), 2464 btrfs_header_generation(dst_eb)); 2465 return -EUCLEAN; 2466 } 2467 2468 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2469 ret = -EIO; 2470 goto out; 2471 } 2472 2473 level = btrfs_header_level(dst_eb); 2474 dst_path = btrfs_alloc_path(); 2475 if (!dst_path) { 2476 ret = -ENOMEM; 2477 goto out; 2478 } 2479 /* For dst_path */ 2480 atomic_inc(&dst_eb->refs); 2481 dst_path->nodes[level] = dst_eb; 2482 dst_path->slots[level] = 0; 2483 dst_path->locks[level] = 0; 2484 2485 /* Do the generation aware breadth-first search */ 2486 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2487 level, last_snapshot, trace_leaf); 2488 if (ret < 0) 2489 goto out; 2490 ret = 0; 2491 2492 out: 2493 btrfs_free_path(dst_path); 2494 if (ret < 0) 2495 qgroup_mark_inconsistent(fs_info); 2496 return ret; 2497 } 2498 2499 /* 2500 * Inform qgroup to trace a whole subtree, including all its child tree 2501 * blocks and data. 2502 * The root tree block is specified by @root_eb. 2503 * 2504 * Normally used by relocation(tree block swap) and subvolume deletion. 
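 *
 * "Trace" here means that the root node, every tree block read while
 * walking down, and (through btrfs_qgroup_trace_leaf_items()) every data
 * extent referenced from the visited leaves are recorded as dirty extents
 * via btrfs_qgroup_trace_extent(), to be accounted at commit time.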
2505 * 2506 * Return 0 for success 2507 * Return <0 for error(ENOMEM or tree search error) 2508 */ 2509 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2510 struct extent_buffer *root_eb, 2511 u64 root_gen, int root_level) 2512 { 2513 struct btrfs_fs_info *fs_info = trans->fs_info; 2514 int ret = 0; 2515 int level; 2516 u8 drop_subptree_thres; 2517 struct extent_buffer *eb = root_eb; 2518 struct btrfs_path *path = NULL; 2519 2520 ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); 2521 ASSERT(root_eb != NULL); 2522 2523 if (!btrfs_qgroup_full_accounting(fs_info)) 2524 return 0; 2525 2526 spin_lock(&fs_info->qgroup_lock); 2527 drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; 2528 spin_unlock(&fs_info->qgroup_lock); 2529 2530 /* 2531 * This function only gets called for snapshot drop, if we hit a high 2532 * node here, it means we are going to change ownership for quite a lot 2533 * of extents, which will greatly slow down btrfs_commit_transaction(). 2534 * 2535 * So here if we find a high tree here, we just skip the accounting and 2536 * mark qgroup inconsistent. 2537 */ 2538 if (root_level >= drop_subptree_thres) { 2539 qgroup_mark_inconsistent(fs_info); 2540 return 0; 2541 } 2542 2543 if (!extent_buffer_uptodate(root_eb)) { 2544 struct btrfs_tree_parent_check check = { 2545 .has_first_key = false, 2546 .transid = root_gen, 2547 .level = root_level 2548 }; 2549 2550 ret = btrfs_read_extent_buffer(root_eb, &check); 2551 if (ret) 2552 goto out; 2553 } 2554 2555 if (root_level == 0) { 2556 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2557 goto out; 2558 } 2559 2560 path = btrfs_alloc_path(); 2561 if (!path) 2562 return -ENOMEM; 2563 2564 /* 2565 * Walk down the tree. Missing extent blocks are filled in as 2566 * we go. Metadata is accounted every time we read a new 2567 * extent block. 2568 * 2569 * When we reach a leaf, we account for file extent items in it, 2570 * walk back up the tree (adjusting slot pointers as we go) 2571 * and restart the search process. 2572 */ 2573 atomic_inc(&root_eb->refs); /* For path */ 2574 path->nodes[root_level] = root_eb; 2575 path->slots[root_level] = 0; 2576 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2577 walk_down: 2578 level = root_level; 2579 while (level >= 0) { 2580 if (path->nodes[level] == NULL) { 2581 int parent_slot; 2582 u64 child_bytenr; 2583 2584 /* 2585 * We need to get child blockptr from parent before we 2586 * can read it. 
2587 */ 2588 eb = path->nodes[level + 1]; 2589 parent_slot = path->slots[level + 1]; 2590 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2591 2592 eb = btrfs_read_node_slot(eb, parent_slot); 2593 if (IS_ERR(eb)) { 2594 ret = PTR_ERR(eb); 2595 goto out; 2596 } 2597 2598 path->nodes[level] = eb; 2599 path->slots[level] = 0; 2600 2601 btrfs_tree_read_lock(eb); 2602 path->locks[level] = BTRFS_READ_LOCK; 2603 2604 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2605 fs_info->nodesize); 2606 if (ret) 2607 goto out; 2608 } 2609 2610 if (level == 0) { 2611 ret = btrfs_qgroup_trace_leaf_items(trans, 2612 path->nodes[level]); 2613 if (ret) 2614 goto out; 2615 2616 /* Nonzero return here means we completed our search */ 2617 ret = adjust_slots_upwards(path, root_level); 2618 if (ret) 2619 break; 2620 2621 /* Restart search with new slots */ 2622 goto walk_down; 2623 } 2624 2625 level--; 2626 } 2627 2628 ret = 0; 2629 out: 2630 btrfs_free_path(path); 2631 2632 return ret; 2633 } 2634 2635 static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) 2636 { 2637 if (!list_empty(&qgroup->nested_iterator)) 2638 return; 2639 2640 list_add_tail(&qgroup->nested_iterator, head); 2641 } 2642 2643 static void qgroup_iterator_nested_clean(struct list_head *head) 2644 { 2645 while (!list_empty(head)) { 2646 struct btrfs_qgroup *qgroup; 2647 2648 qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); 2649 list_del_init(&qgroup->nested_iterator); 2650 } 2651 } 2652 2653 #define UPDATE_NEW 0 2654 #define UPDATE_OLD 1 2655 /* 2656 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2657 */ 2658 static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2659 struct ulist *roots, struct list_head *qgroups, 2660 u64 seq, int update_old) 2661 { 2662 struct ulist_node *unode; 2663 struct ulist_iterator uiter; 2664 struct btrfs_qgroup *qg; 2665 2666 if (!roots) 2667 return; 2668 ULIST_ITER_INIT(&uiter); 2669 while ((unode = ulist_next(roots, &uiter))) { 2670 LIST_HEAD(tmp); 2671 2672 qg = find_qgroup_rb(fs_info, unode->val); 2673 if (!qg) 2674 continue; 2675 2676 qgroup_iterator_nested_add(qgroups, qg); 2677 qgroup_iterator_add(&tmp, qg); 2678 list_for_each_entry(qg, &tmp, iterator) { 2679 struct btrfs_qgroup_list *glist; 2680 2681 if (update_old) 2682 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2683 else 2684 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2685 2686 list_for_each_entry(glist, &qg->groups, next_group) { 2687 qgroup_iterator_nested_add(qgroups, glist->group); 2688 qgroup_iterator_add(&tmp, glist->group); 2689 } 2690 } 2691 qgroup_iterator_clean(&tmp); 2692 } 2693 } 2694 2695 /* 2696 * Update qgroup rfer/excl counters. 2697 * Rfer update is easy, codes can explain themselves. 2698 * 2699 * Excl update is tricky, the update is split into 2 parts. 2700 * Part 1: Possible exclusive <-> sharing detect: 2701 * | A | !A | 2702 * ------------------------------------- 2703 * B | * | - | 2704 * ------------------------------------- 2705 * !B | + | ** | 2706 * ------------------------------------- 2707 * 2708 * Conditions: 2709 * A: cur_old_roots < nr_old_roots (not exclusive before) 2710 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2711 * B: cur_new_roots < nr_new_roots (not exclusive now) 2712 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2713 * 2714 * Results: 2715 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2716 * *: Definitely not changed. **: Possible unchanged. 
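 *
 * Worked example: an extent was referenced by subvolumes 257 and 258
 * before the operation (nr_old_roots == 2) and only by 257 afterwards
 * (nr_new_roots == 1).
 * For qgroup 0/257: cur_old_count == 1 < nr_old_roots (condition A) and
 * cur_new_count == 1 == nr_new_roots (condition !B), i.e. result "+":
 * the extent became exclusive, so excl/excl_cmpr grow by num_bytes while
 * rfer is unchanged (the extent is referenced both before and after).
 * For qgroup 0/258: cur_old_count == 1 (A) and cur_new_count == 0 <
 * nr_new_roots (B), i.e. result "*": excl is unchanged, while the rfer
 * part above subtracts num_bytes because the qgroup no longer refers to
 * the extent at all.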
2717 * 2718 * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 2719 * 2720 * To make the logic clear, we first use condition A and B to split 2721 * combination into 4 results. 2722 * 2723 * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 2724 * only on variant maybe 0. 2725 * 2726 * Lastly, check result **, since there are 2 variants maybe 0, split them 2727 * again(2x2). 2728 * But this time we don't need to consider other things, the codes and logic 2729 * is easy to understand now. 2730 */ 2731 static void qgroup_update_counters(struct btrfs_fs_info *fs_info, 2732 struct list_head *qgroups, u64 nr_old_roots, 2733 u64 nr_new_roots, u64 num_bytes, u64 seq) 2734 { 2735 struct btrfs_qgroup *qg; 2736 2737 list_for_each_entry(qg, qgroups, nested_iterator) { 2738 u64 cur_new_count, cur_old_count; 2739 bool dirty = false; 2740 2741 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2742 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2743 2744 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2745 cur_new_count); 2746 2747 /* Rfer update part */ 2748 if (cur_old_count == 0 && cur_new_count > 0) { 2749 qg->rfer += num_bytes; 2750 qg->rfer_cmpr += num_bytes; 2751 dirty = true; 2752 } 2753 if (cur_old_count > 0 && cur_new_count == 0) { 2754 qg->rfer -= num_bytes; 2755 qg->rfer_cmpr -= num_bytes; 2756 dirty = true; 2757 } 2758 2759 /* Excl update part */ 2760 /* Exclusive/none -> shared case */ 2761 if (cur_old_count == nr_old_roots && 2762 cur_new_count < nr_new_roots) { 2763 /* Exclusive -> shared */ 2764 if (cur_old_count != 0) { 2765 qg->excl -= num_bytes; 2766 qg->excl_cmpr -= num_bytes; 2767 dirty = true; 2768 } 2769 } 2770 2771 /* Shared -> exclusive/none case */ 2772 if (cur_old_count < nr_old_roots && 2773 cur_new_count == nr_new_roots) { 2774 /* Shared->exclusive */ 2775 if (cur_new_count != 0) { 2776 qg->excl += num_bytes; 2777 qg->excl_cmpr += num_bytes; 2778 dirty = true; 2779 } 2780 } 2781 2782 /* Exclusive/none -> exclusive/none case */ 2783 if (cur_old_count == nr_old_roots && 2784 cur_new_count == nr_new_roots) { 2785 if (cur_old_count == 0) { 2786 /* None -> exclusive/none */ 2787 2788 if (cur_new_count != 0) { 2789 /* None -> exclusive */ 2790 qg->excl += num_bytes; 2791 qg->excl_cmpr += num_bytes; 2792 dirty = true; 2793 } 2794 /* None -> none, nothing changed */ 2795 } else { 2796 /* Exclusive -> exclusive/none */ 2797 2798 if (cur_new_count == 0) { 2799 /* Exclusive -> none */ 2800 qg->excl -= num_bytes; 2801 qg->excl_cmpr -= num_bytes; 2802 dirty = true; 2803 } 2804 /* Exclusive -> exclusive, nothing changed */ 2805 } 2806 } 2807 2808 if (dirty) 2809 qgroup_dirty(fs_info, qg); 2810 } 2811 } 2812 2813 /* 2814 * Check if the @roots potentially is a list of fs tree roots 2815 * 2816 * Return 0 for definitely not a fs/subvol tree roots ulist 2817 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2818 * one as well) 2819 */ 2820 static int maybe_fs_roots(struct ulist *roots) 2821 { 2822 struct ulist_node *unode; 2823 struct ulist_iterator uiter; 2824 2825 /* Empty one, still possible for fs roots */ 2826 if (!roots || roots->nnodes == 0) 2827 return 1; 2828 2829 ULIST_ITER_INIT(&uiter); 2830 unode = ulist_next(roots, &uiter); 2831 if (!unode) 2832 return 1; 2833 2834 /* 2835 * If it contains fs tree roots, then it must belong to fs/subvol 2836 * trees. 2837 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
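 *
 * Checking only the first root id is therefore enough: whatever
 * is_fstree() returns for it holds for every other entry (for subvolume
 * trees the root id is either BTRFS_FS_TREE_OBJECTID or at least
 * BTRFS_FIRST_FREE_OBJECTID, assuming the usual helper definition).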
2838 */ 2839 return is_fstree(unode->val); 2840 } 2841 2842 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2843 u64 num_bytes, struct ulist *old_roots, 2844 struct ulist *new_roots) 2845 { 2846 struct btrfs_fs_info *fs_info = trans->fs_info; 2847 LIST_HEAD(qgroups); 2848 u64 seq; 2849 u64 nr_new_roots = 0; 2850 u64 nr_old_roots = 0; 2851 int ret = 0; 2852 2853 /* 2854 * If quotas get disabled meanwhile, the resources need to be freed and 2855 * we can't just exit here. 2856 */ 2857 if (!btrfs_qgroup_full_accounting(fs_info) || 2858 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2859 goto out_free; 2860 2861 if (new_roots) { 2862 if (!maybe_fs_roots(new_roots)) 2863 goto out_free; 2864 nr_new_roots = new_roots->nnodes; 2865 } 2866 if (old_roots) { 2867 if (!maybe_fs_roots(old_roots)) 2868 goto out_free; 2869 nr_old_roots = old_roots->nnodes; 2870 } 2871 2872 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2873 if (nr_old_roots == 0 && nr_new_roots == 0) 2874 goto out_free; 2875 2876 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2877 num_bytes, nr_old_roots, nr_new_roots); 2878 2879 mutex_lock(&fs_info->qgroup_rescan_lock); 2880 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2881 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2882 mutex_unlock(&fs_info->qgroup_rescan_lock); 2883 ret = 0; 2884 goto out_free; 2885 } 2886 } 2887 mutex_unlock(&fs_info->qgroup_rescan_lock); 2888 2889 spin_lock(&fs_info->qgroup_lock); 2890 seq = fs_info->qgroup_seq; 2891 2892 /* Update old refcnts using old_roots */ 2893 qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); 2894 2895 /* Update new refcnts using new_roots */ 2896 qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); 2897 2898 qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, 2899 num_bytes, seq); 2900 2901 /* 2902 * We're done using the iterator, release all its qgroups while holding 2903 * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() 2904 * and trigger use-after-free accesses to qgroups. 
2905 */ 2906 qgroup_iterator_nested_clean(&qgroups); 2907 2908 /* 2909 * Bump qgroup_seq to avoid seq overlap 2910 */ 2911 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2912 spin_unlock(&fs_info->qgroup_lock); 2913 out_free: 2914 ulist_free(old_roots); 2915 ulist_free(new_roots); 2916 return ret; 2917 } 2918 2919 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2920 { 2921 struct btrfs_fs_info *fs_info = trans->fs_info; 2922 struct btrfs_qgroup_extent_record *record; 2923 struct btrfs_delayed_ref_root *delayed_refs; 2924 struct ulist *new_roots = NULL; 2925 struct rb_node *node; 2926 u64 num_dirty_extents = 0; 2927 u64 qgroup_to_skip; 2928 int ret = 0; 2929 2930 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 2931 return 0; 2932 2933 delayed_refs = &trans->transaction->delayed_refs; 2934 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2935 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2936 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2937 node); 2938 2939 num_dirty_extents++; 2940 trace_btrfs_qgroup_account_extents(fs_info, record); 2941 2942 if (!ret && !(fs_info->qgroup_flags & 2943 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { 2944 struct btrfs_backref_walk_ctx ctx = { 0 }; 2945 2946 ctx.bytenr = record->bytenr; 2947 ctx.fs_info = fs_info; 2948 2949 /* 2950 * Old roots should be searched when inserting qgroup 2951 * extent record. 2952 * 2953 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, 2954 * we may have some record inserted during 2955 * NO_ACCOUNTING (thus no old_roots populated), but 2956 * later we start rescan, which clears NO_ACCOUNTING, 2957 * leaving some inserted records without old_roots 2958 * populated. 2959 * 2960 * Those cases are rare and should not cause too much 2961 * time spent during commit_transaction(). 2962 */ 2963 if (!record->old_roots) { 2964 /* Search commit root to find old_roots */ 2965 ret = btrfs_find_all_roots(&ctx, false); 2966 if (ret < 0) 2967 goto cleanup; 2968 record->old_roots = ctx.roots; 2969 ctx.roots = NULL; 2970 } 2971 2972 /* 2973 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2974 * which doesn't lock tree or delayed_refs and search 2975 * current root. It's safe inside commit_transaction(). 2976 */ 2977 ctx.trans = trans; 2978 ctx.time_seq = BTRFS_SEQ_LAST; 2979 ret = btrfs_find_all_roots(&ctx, false); 2980 if (ret < 0) 2981 goto cleanup; 2982 new_roots = ctx.roots; 2983 if (qgroup_to_skip) { 2984 ulist_del(new_roots, qgroup_to_skip, 0); 2985 ulist_del(record->old_roots, qgroup_to_skip, 2986 0); 2987 } 2988 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2989 record->num_bytes, 2990 record->old_roots, 2991 new_roots); 2992 record->old_roots = NULL; 2993 new_roots = NULL; 2994 } 2995 /* Free the reserved data space */ 2996 btrfs_qgroup_free_refroot(fs_info, 2997 record->data_rsv_refroot, 2998 record->data_rsv, 2999 BTRFS_QGROUP_RSV_DATA); 3000 cleanup: 3001 ulist_free(record->old_roots); 3002 ulist_free(new_roots); 3003 new_roots = NULL; 3004 rb_erase(node, &delayed_refs->dirty_extent_root); 3005 kfree(record); 3006 3007 } 3008 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 3009 num_dirty_extents); 3010 return ret; 3011 } 3012 3013 /* 3014 * Writes all changed qgroups to disk. 3015 * Called by the transaction commit path and the qgroup assign ioctl. 
3016 */ 3017 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 3018 { 3019 struct btrfs_fs_info *fs_info = trans->fs_info; 3020 int ret = 0; 3021 3022 /* 3023 * In case we are called from the qgroup assign ioctl, assert that we 3024 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota 3025 * disable operation (ioctl) and access a freed quota root. 3026 */ 3027 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) 3028 lockdep_assert_held(&fs_info->qgroup_ioctl_lock); 3029 3030 if (!fs_info->quota_root) 3031 return ret; 3032 3033 spin_lock(&fs_info->qgroup_lock); 3034 while (!list_empty(&fs_info->dirty_qgroups)) { 3035 struct btrfs_qgroup *qgroup; 3036 qgroup = list_first_entry(&fs_info->dirty_qgroups, 3037 struct btrfs_qgroup, dirty); 3038 list_del_init(&qgroup->dirty); 3039 spin_unlock(&fs_info->qgroup_lock); 3040 ret = update_qgroup_info_item(trans, qgroup); 3041 if (ret) 3042 qgroup_mark_inconsistent(fs_info); 3043 ret = update_qgroup_limit_item(trans, qgroup); 3044 if (ret) 3045 qgroup_mark_inconsistent(fs_info); 3046 spin_lock(&fs_info->qgroup_lock); 3047 } 3048 if (btrfs_qgroup_enabled(fs_info)) 3049 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 3050 else 3051 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 3052 spin_unlock(&fs_info->qgroup_lock); 3053 3054 ret = update_qgroup_status_item(trans); 3055 if (ret) 3056 qgroup_mark_inconsistent(fs_info); 3057 3058 return ret; 3059 } 3060 3061 int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, 3062 struct btrfs_qgroup_inherit *inherit, 3063 size_t size) 3064 { 3065 if (!btrfs_qgroup_enabled(fs_info)) 3066 return 0; 3067 if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) 3068 return -EOPNOTSUPP; 3069 if (size < sizeof(*inherit) || size > PAGE_SIZE) 3070 return -EINVAL; 3071 3072 /* 3073 * In the past we allowed btrfs_qgroup_inherit to specify to copy 3074 * rfer/excl numbers directly from other qgroups. This behavior has 3075 * been disabled in userspace for a very long time, but here we should 3076 * also disable it in kernel, as this behavior is known to mark qgroup 3077 * inconsistent, and a rescan would wipe out the changes anyway. 3078 * 3079 * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. 3080 */ 3081 if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) 3082 return -EINVAL; 3083 3084 if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) 3085 return -EINVAL; 3086 3087 /* 3088 * Now check all the remaining qgroups, they should all: 3089 * 3090 * - Exist 3091 * - Be higher level qgroups. 
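	 *
	 *   (A qgroupid encodes the level in its upper 16 bits, so e.g.
	 *   "1/100" is level 1, id 100. Level 0 entries are the
	 *   per-subvolume qgroups and are rejected by the
	 *   btrfs_qgroup_level() check below.)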
3092 */ 3093 for (int i = 0; i < inherit->num_qgroups; i++) { 3094 struct btrfs_qgroup *qgroup; 3095 u64 qgroupid = inherit->qgroups[i]; 3096 3097 if (btrfs_qgroup_level(qgroupid) == 0) 3098 return -EINVAL; 3099 3100 spin_lock(&fs_info->qgroup_lock); 3101 qgroup = find_qgroup_rb(fs_info, qgroupid); 3102 if (!qgroup) { 3103 spin_unlock(&fs_info->qgroup_lock); 3104 return -ENOENT; 3105 } 3106 spin_unlock(&fs_info->qgroup_lock); 3107 } 3108 return 0; 3109 } 3110 3111 static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, 3112 u64 inode_rootid, 3113 struct btrfs_qgroup_inherit **inherit) 3114 { 3115 int i = 0; 3116 u64 num_qgroups = 0; 3117 struct btrfs_qgroup *inode_qg; 3118 struct btrfs_qgroup_list *qg_list; 3119 struct btrfs_qgroup_inherit *res; 3120 size_t struct_sz; 3121 u64 *qgids; 3122 3123 if (*inherit) 3124 return -EEXIST; 3125 3126 inode_qg = find_qgroup_rb(fs_info, inode_rootid); 3127 if (!inode_qg) 3128 return -ENOENT; 3129 3130 num_qgroups = list_count_nodes(&inode_qg->groups); 3131 3132 if (!num_qgroups) 3133 return 0; 3134 3135 struct_sz = struct_size(res, qgroups, num_qgroups); 3136 if (struct_sz == SIZE_MAX) 3137 return -ERANGE; 3138 3139 res = kzalloc(struct_sz, GFP_NOFS); 3140 if (!res) 3141 return -ENOMEM; 3142 res->num_qgroups = num_qgroups; 3143 qgids = res->qgroups; 3144 3145 list_for_each_entry(qg_list, &inode_qg->groups, next_group) 3146 qgids[i++] = qg_list->group->qgroupid; 3147 3148 *inherit = res; 3149 return 0; 3150 } 3151 3152 /* 3153 * Check if we can skip rescan when inheriting qgroups. If @src has a single 3154 * @parent, and that @parent is owning all its bytes exclusively, we can skip 3155 * the full rescan, by just adding nodesize to the @parent's excl/rfer. 3156 * 3157 * Return <0 for fatal errors (like srcid/parentid has no qgroup). 3158 * Return 0 if a quick inherit is done. 3159 * Return >0 if a quick inherit is not possible, and a full rescan is needed. 3160 */ 3161 static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, 3162 u64 srcid, u64 parentid) 3163 { 3164 struct btrfs_qgroup *src; 3165 struct btrfs_qgroup *parent; 3166 struct btrfs_qgroup_list *list; 3167 int nr_parents = 0; 3168 3169 src = find_qgroup_rb(fs_info, srcid); 3170 if (!src) 3171 return -ENOENT; 3172 parent = find_qgroup_rb(fs_info, parentid); 3173 if (!parent) 3174 return -ENOENT; 3175 3176 /* 3177 * Source has no parent qgroup, but our new qgroup would have one. 3178 * Qgroup numbers would become inconsistent. 3179 */ 3180 if (list_empty(&src->groups)) 3181 return 1; 3182 3183 list_for_each_entry(list, &src->groups, next_group) { 3184 /* The parent is not the same, quick update is not possible. */ 3185 if (list->group->qgroupid != parentid) 3186 return 1; 3187 nr_parents++; 3188 /* 3189 * More than one parent qgroup, we can't be sure about accounting 3190 * consistency. 3191 */ 3192 if (nr_parents > 1) 3193 return 1; 3194 } 3195 3196 /* 3197 * The parent is not exclusively owning all its bytes. We're not sure 3198 * if the source has any bytes not fully owned by the parent. 3199 */ 3200 if (parent->excl != parent->rfer) 3201 return 1; 3202 3203 parent->excl += fs_info->nodesize; 3204 parent->rfer += fs_info->nodesize; 3205 return 0; 3206 } 3207 3208 /* 3209 * Copy the accounting information between qgroups. This is necessary 3210 * when a snapshot or a subvolume is created. Throwing an error will 3211 * cause a transaction abort so we take extra care here to only error 3212 * when a readonly fs is a reasonable outcome. 
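 *
 * Note on @inherit: the structure is followed in memory by a flat u64
 * array, which is why the code below walks it as
 * i_qgroups = (u64 *)(inherit + 1). As consumed below, the layout is:
 *
 *	qgroups[num_qgroups]            qgroup ids to add the new subvolume to
 *	(src, dst) x num_ref_copies     pairs for copying rfer numbers
 *	(src, dst) x num_excl_copies    pairs for copying excl numbers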
3213 */ 3214 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 3215 u64 objectid, u64 inode_rootid, 3216 struct btrfs_qgroup_inherit *inherit) 3217 { 3218 int ret = 0; 3219 int i; 3220 u64 *i_qgroups; 3221 bool committing = false; 3222 struct btrfs_fs_info *fs_info = trans->fs_info; 3223 struct btrfs_root *quota_root; 3224 struct btrfs_qgroup *srcgroup; 3225 struct btrfs_qgroup *dstgroup; 3226 struct btrfs_qgroup *prealloc; 3227 struct btrfs_qgroup_list **qlist_prealloc = NULL; 3228 bool free_inherit = false; 3229 bool need_rescan = false; 3230 u32 level_size = 0; 3231 u64 nums; 3232 3233 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 3234 if (!prealloc) 3235 return -ENOMEM; 3236 3237 /* 3238 * There are only two callers of this function. 3239 * 3240 * One in create_subvol() in the ioctl context, which needs to hold 3241 * the qgroup_ioctl_lock. 3242 * 3243 * The other one in create_pending_snapshot() where no other qgroup 3244 * code can modify the fs as they all need to either start a new trans 3245 * or hold a trans handler, thus we don't need to hold 3246 * qgroup_ioctl_lock. 3247 * This would avoid long and complex lock chain and make lockdep happy. 3248 */ 3249 spin_lock(&fs_info->trans_lock); 3250 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 3251 committing = true; 3252 spin_unlock(&fs_info->trans_lock); 3253 3254 if (!committing) 3255 mutex_lock(&fs_info->qgroup_ioctl_lock); 3256 if (!btrfs_qgroup_enabled(fs_info)) 3257 goto out; 3258 3259 quota_root = fs_info->quota_root; 3260 if (!quota_root) { 3261 ret = -EINVAL; 3262 goto out; 3263 } 3264 3265 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { 3266 ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); 3267 if (ret) 3268 goto out; 3269 free_inherit = true; 3270 } 3271 3272 if (inherit) { 3273 i_qgroups = (u64 *)(inherit + 1); 3274 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 3275 2 * inherit->num_excl_copies; 3276 for (i = 0; i < nums; ++i) { 3277 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 3278 3279 /* 3280 * Zero out invalid groups so we can ignore 3281 * them later. 
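			 * "Invalid" means the qgroup does not exist, or its
			 * level (the upper 16 bits of the qgroupid, compared
			 * via the >> 48 below) is not above the level of the
			 * new subvolume's own level 0 qgroup.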
3282 */ 3283 if (!srcgroup || 3284 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 3285 *i_qgroups = 0ULL; 3286 3287 ++i_qgroups; 3288 } 3289 } 3290 3291 /* 3292 * create a tracking group for the subvol itself 3293 */ 3294 ret = add_qgroup_item(trans, quota_root, objectid); 3295 if (ret) 3296 goto out; 3297 3298 /* 3299 * add qgroup to all inherited groups 3300 */ 3301 if (inherit) { 3302 i_qgroups = (u64 *)(inherit + 1); 3303 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 3304 if (*i_qgroups == 0) 3305 continue; 3306 ret = add_qgroup_relation_item(trans, objectid, 3307 *i_qgroups); 3308 if (ret && ret != -EEXIST) 3309 goto out; 3310 ret = add_qgroup_relation_item(trans, *i_qgroups, 3311 objectid); 3312 if (ret && ret != -EEXIST) 3313 goto out; 3314 } 3315 ret = 0; 3316 3317 qlist_prealloc = kcalloc(inherit->num_qgroups, 3318 sizeof(struct btrfs_qgroup_list *), 3319 GFP_NOFS); 3320 if (!qlist_prealloc) { 3321 ret = -ENOMEM; 3322 goto out; 3323 } 3324 for (int i = 0; i < inherit->num_qgroups; i++) { 3325 qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), 3326 GFP_NOFS); 3327 if (!qlist_prealloc[i]) { 3328 ret = -ENOMEM; 3329 goto out; 3330 } 3331 } 3332 } 3333 3334 spin_lock(&fs_info->qgroup_lock); 3335 3336 dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); 3337 prealloc = NULL; 3338 3339 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 3340 dstgroup->lim_flags = inherit->lim.flags; 3341 dstgroup->max_rfer = inherit->lim.max_rfer; 3342 dstgroup->max_excl = inherit->lim.max_excl; 3343 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 3344 dstgroup->rsv_excl = inherit->lim.rsv_excl; 3345 3346 qgroup_dirty(fs_info, dstgroup); 3347 } 3348 3349 if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { 3350 srcgroup = find_qgroup_rb(fs_info, srcid); 3351 if (!srcgroup) 3352 goto unlock; 3353 3354 /* 3355 * We call inherit after we clone the root in order to make sure 3356 * our counts don't go crazy, so at this point the only 3357 * difference between the two roots should be the root node. 3358 */ 3359 level_size = fs_info->nodesize; 3360 dstgroup->rfer = srcgroup->rfer; 3361 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 3362 dstgroup->excl = level_size; 3363 dstgroup->excl_cmpr = level_size; 3364 srcgroup->excl = level_size; 3365 srcgroup->excl_cmpr = level_size; 3366 3367 /* inherit the limit info */ 3368 dstgroup->lim_flags = srcgroup->lim_flags; 3369 dstgroup->max_rfer = srcgroup->max_rfer; 3370 dstgroup->max_excl = srcgroup->max_excl; 3371 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 3372 dstgroup->rsv_excl = srcgroup->rsv_excl; 3373 3374 qgroup_dirty(fs_info, dstgroup); 3375 qgroup_dirty(fs_info, srcgroup); 3376 3377 /* 3378 * If the source qgroup has parent but the new one doesn't, 3379 * we need a full rescan. 3380 */ 3381 if (!inherit && !list_empty(&srcgroup->groups)) 3382 need_rescan = true; 3383 } 3384 3385 if (!inherit) 3386 goto unlock; 3387 3388 i_qgroups = (u64 *)(inherit + 1); 3389 for (i = 0; i < inherit->num_qgroups; ++i) { 3390 if (*i_qgroups) { 3391 ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, 3392 *i_qgroups); 3393 qlist_prealloc[i] = NULL; 3394 if (ret) 3395 goto unlock; 3396 } 3397 if (srcid) { 3398 /* Check if we can do a quick inherit. 
*/ 3399 ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); 3400 if (ret < 0) 3401 goto unlock; 3402 if (ret > 0) 3403 need_rescan = true; 3404 ret = 0; 3405 } 3406 ++i_qgroups; 3407 } 3408 3409 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 3410 struct btrfs_qgroup *src; 3411 struct btrfs_qgroup *dst; 3412 3413 if (!i_qgroups[0] || !i_qgroups[1]) 3414 continue; 3415 3416 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3417 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3418 3419 if (!src || !dst) { 3420 ret = -EINVAL; 3421 goto unlock; 3422 } 3423 3424 dst->rfer = src->rfer - level_size; 3425 dst->rfer_cmpr = src->rfer_cmpr - level_size; 3426 3427 /* Manually tweaking numbers certainly needs a rescan */ 3428 need_rescan = true; 3429 } 3430 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 3431 struct btrfs_qgroup *src; 3432 struct btrfs_qgroup *dst; 3433 3434 if (!i_qgroups[0] || !i_qgroups[1]) 3435 continue; 3436 3437 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3438 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3439 3440 if (!src || !dst) { 3441 ret = -EINVAL; 3442 goto unlock; 3443 } 3444 3445 dst->excl = src->excl + level_size; 3446 dst->excl_cmpr = src->excl_cmpr + level_size; 3447 need_rescan = true; 3448 } 3449 3450 unlock: 3451 spin_unlock(&fs_info->qgroup_lock); 3452 if (!ret) 3453 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 3454 out: 3455 if (!committing) 3456 mutex_unlock(&fs_info->qgroup_ioctl_lock); 3457 if (need_rescan) 3458 qgroup_mark_inconsistent(fs_info); 3459 if (qlist_prealloc) { 3460 for (int i = 0; i < inherit->num_qgroups; i++) 3461 kfree(qlist_prealloc[i]); 3462 kfree(qlist_prealloc); 3463 } 3464 if (free_inherit) 3465 kfree(inherit); 3466 kfree(prealloc); 3467 return ret; 3468 } 3469 3470 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3471 { 3472 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3473 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3474 return false; 3475 3476 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3477 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3478 return false; 3479 3480 return true; 3481 } 3482 3483 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3484 enum btrfs_qgroup_rsv_type type) 3485 { 3486 struct btrfs_qgroup *qgroup; 3487 struct btrfs_fs_info *fs_info = root->fs_info; 3488 u64 ref_root = btrfs_root_id(root); 3489 int ret = 0; 3490 LIST_HEAD(qgroup_list); 3491 3492 if (!is_fstree(ref_root)) 3493 return 0; 3494 3495 if (num_bytes == 0) 3496 return 0; 3497 3498 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3499 capable(CAP_SYS_RESOURCE)) 3500 enforce = false; 3501 3502 spin_lock(&fs_info->qgroup_lock); 3503 if (!fs_info->quota_root) 3504 goto out; 3505 3506 qgroup = find_qgroup_rb(fs_info, ref_root); 3507 if (!qgroup) 3508 goto out; 3509 3510 qgroup_iterator_add(&qgroup_list, qgroup); 3511 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3512 struct btrfs_qgroup_list *glist; 3513 3514 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { 3515 ret = -EDQUOT; 3516 goto out; 3517 } 3518 3519 list_for_each_entry(glist, &qgroup->groups, next_group) 3520 qgroup_iterator_add(&qgroup_list, glist->group); 3521 } 3522 3523 ret = 0; 3524 /* 3525 * no limits exceeded, now record the reservation into all qgroups 3526 */ 3527 list_for_each_entry(qgroup, &qgroup_list, iterator) 3528 qgroup_rsv_add(fs_info, qgroup, num_bytes, type); 3529 3530 out: 3531 
qgroup_iterator_clean(&qgroup_list); 3532 spin_unlock(&fs_info->qgroup_lock); 3533 return ret; 3534 } 3535 3536 /* 3537 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3538 * qgroup). 3539 * 3540 * Will handle all higher level qgroup too. 3541 * 3542 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3543 * This special case is only used for META_PERTRANS type. 3544 */ 3545 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3546 u64 ref_root, u64 num_bytes, 3547 enum btrfs_qgroup_rsv_type type) 3548 { 3549 struct btrfs_qgroup *qgroup; 3550 LIST_HEAD(qgroup_list); 3551 3552 if (!is_fstree(ref_root)) 3553 return; 3554 3555 if (num_bytes == 0) 3556 return; 3557 3558 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3559 WARN(1, "%s: Invalid type to free", __func__); 3560 return; 3561 } 3562 spin_lock(&fs_info->qgroup_lock); 3563 3564 if (!fs_info->quota_root) 3565 goto out; 3566 3567 qgroup = find_qgroup_rb(fs_info, ref_root); 3568 if (!qgroup) 3569 goto out; 3570 3571 if (num_bytes == (u64)-1) 3572 /* 3573 * We're freeing all pertrans rsv, get reserved value from 3574 * level 0 qgroup as real num_bytes to free. 3575 */ 3576 num_bytes = qgroup->rsv.values[type]; 3577 3578 qgroup_iterator_add(&qgroup_list, qgroup); 3579 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3580 struct btrfs_qgroup_list *glist; 3581 3582 qgroup_rsv_release(fs_info, qgroup, num_bytes, type); 3583 list_for_each_entry(glist, &qgroup->groups, next_group) { 3584 qgroup_iterator_add(&qgroup_list, glist->group); 3585 } 3586 } 3587 out: 3588 qgroup_iterator_clean(&qgroup_list); 3589 spin_unlock(&fs_info->qgroup_lock); 3590 } 3591 3592 /* 3593 * Check if the leaf is the last leaf. Which means all node pointers 3594 * are at their last position. 3595 */ 3596 static bool is_last_leaf(struct btrfs_path *path) 3597 { 3598 int i; 3599 3600 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3601 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3602 return false; 3603 } 3604 return true; 3605 } 3606 3607 /* 3608 * returns < 0 on error, 0 when more leafs are to be scanned. 3609 * returns 1 when done. 3610 */ 3611 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3612 struct btrfs_path *path) 3613 { 3614 struct btrfs_fs_info *fs_info = trans->fs_info; 3615 struct btrfs_root *extent_root; 3616 struct btrfs_key found; 3617 struct extent_buffer *scratch_leaf = NULL; 3618 u64 num_bytes; 3619 bool done; 3620 int slot; 3621 int ret; 3622 3623 if (!btrfs_qgroup_full_accounting(fs_info)) 3624 return 1; 3625 3626 mutex_lock(&fs_info->qgroup_rescan_lock); 3627 extent_root = btrfs_extent_root(fs_info, 3628 fs_info->qgroup_rescan_progress.objectid); 3629 ret = btrfs_search_slot_for_read(extent_root, 3630 &fs_info->qgroup_rescan_progress, 3631 path, 1, 0); 3632 3633 btrfs_debug(fs_info, 3634 "current progress key (%llu %u %llu), search_slot ret %d", 3635 fs_info->qgroup_rescan_progress.objectid, 3636 fs_info->qgroup_rescan_progress.type, 3637 fs_info->qgroup_rescan_progress.offset, ret); 3638 3639 if (ret) { 3640 /* 3641 * The rescan is about to end, we will not be scanning any 3642 * further blocks. We cannot unset the RESCAN flag here, because 3643 * we want to commit the transaction if everything went well. 3644 * To make the live accounting work in this phase, we set our 3645 * scan progress pointer such that every real extent objectid 3646 * will be smaller. 
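		 * (btrfs_qgroup_account_extent() skips live accounting only
		 * for bytenr values at or beyond this progress pointer, so an
		 * objectid of (u64)-1 effectively re-enables live accounting
		 * for every real extent from here on.)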
3647 */ 3648 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3649 btrfs_release_path(path); 3650 mutex_unlock(&fs_info->qgroup_rescan_lock); 3651 return ret; 3652 } 3653 done = is_last_leaf(path); 3654 3655 btrfs_item_key_to_cpu(path->nodes[0], &found, 3656 btrfs_header_nritems(path->nodes[0]) - 1); 3657 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3658 3659 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3660 if (!scratch_leaf) { 3661 ret = -ENOMEM; 3662 mutex_unlock(&fs_info->qgroup_rescan_lock); 3663 goto out; 3664 } 3665 slot = path->slots[0]; 3666 btrfs_release_path(path); 3667 mutex_unlock(&fs_info->qgroup_rescan_lock); 3668 3669 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3670 struct btrfs_backref_walk_ctx ctx = { 0 }; 3671 3672 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3673 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3674 found.type != BTRFS_METADATA_ITEM_KEY) 3675 continue; 3676 if (found.type == BTRFS_METADATA_ITEM_KEY) 3677 num_bytes = fs_info->nodesize; 3678 else 3679 num_bytes = found.offset; 3680 3681 ctx.bytenr = found.objectid; 3682 ctx.fs_info = fs_info; 3683 3684 ret = btrfs_find_all_roots(&ctx, false); 3685 if (ret < 0) 3686 goto out; 3687 /* For rescan, just pass old_roots as NULL */ 3688 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3689 num_bytes, NULL, ctx.roots); 3690 if (ret < 0) 3691 goto out; 3692 } 3693 out: 3694 if (scratch_leaf) 3695 free_extent_buffer(scratch_leaf); 3696 3697 if (done && !ret) { 3698 ret = 1; 3699 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3700 } 3701 return ret; 3702 } 3703 3704 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3705 { 3706 if (btrfs_fs_closing(fs_info)) 3707 return true; 3708 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 3709 return true; 3710 if (!btrfs_qgroup_enabled(fs_info)) 3711 return true; 3712 if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3713 return true; 3714 return false; 3715 } 3716 3717 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3718 { 3719 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3720 qgroup_rescan_work); 3721 struct btrfs_path *path; 3722 struct btrfs_trans_handle *trans = NULL; 3723 int ret = 0; 3724 bool stopped = false; 3725 bool did_leaf_rescans = false; 3726 3727 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 3728 return; 3729 3730 path = btrfs_alloc_path(); 3731 if (!path) { 3732 ret = -ENOMEM; 3733 goto out; 3734 } 3735 /* 3736 * Rescan should only search for commit root, and any later difference 3737 * should be recorded by qgroup 3738 */ 3739 path->search_commit_root = 1; 3740 path->skip_locking = 1; 3741 3742 while (!ret && !(stopped = rescan_should_stop(fs_info))) { 3743 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3744 if (IS_ERR(trans)) { 3745 ret = PTR_ERR(trans); 3746 break; 3747 } 3748 3749 ret = qgroup_rescan_leaf(trans, path); 3750 did_leaf_rescans = true; 3751 3752 if (ret > 0) 3753 btrfs_commit_transaction(trans); 3754 else 3755 btrfs_end_transaction(trans); 3756 } 3757 3758 out: 3759 btrfs_free_path(path); 3760 3761 mutex_lock(&fs_info->qgroup_rescan_lock); 3762 if (ret > 0 && 3763 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3764 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3765 } else if (ret < 0 || stopped) { 3766 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3767 } 3768 mutex_unlock(&fs_info->qgroup_rescan_lock); 3769 3770 /* 3771 
* Only update status, since the previous part has already updated the 3772 * qgroup info, and only if we did any actual work. This also prevents 3773 * race with a concurrent quota disable, which has already set 3774 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at 3775 * btrfs_quota_disable(). 3776 */ 3777 if (did_leaf_rescans) { 3778 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3779 if (IS_ERR(trans)) { 3780 ret = PTR_ERR(trans); 3781 trans = NULL; 3782 btrfs_err(fs_info, 3783 "fail to start transaction for status update: %d", 3784 ret); 3785 } 3786 } else { 3787 trans = NULL; 3788 } 3789 3790 mutex_lock(&fs_info->qgroup_rescan_lock); 3791 if (!stopped || 3792 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3793 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3794 if (trans) { 3795 int ret2 = update_qgroup_status_item(trans); 3796 3797 if (ret2 < 0) { 3798 ret = ret2; 3799 btrfs_err(fs_info, "fail to update qgroup status: %d", ret); 3800 } 3801 } 3802 fs_info->qgroup_rescan_running = false; 3803 fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3804 complete_all(&fs_info->qgroup_rescan_completion); 3805 mutex_unlock(&fs_info->qgroup_rescan_lock); 3806 3807 if (!trans) 3808 return; 3809 3810 btrfs_end_transaction(trans); 3811 3812 if (stopped) { 3813 btrfs_info(fs_info, "qgroup scan paused"); 3814 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { 3815 btrfs_info(fs_info, "qgroup scan cancelled"); 3816 } else if (ret >= 0) { 3817 btrfs_info(fs_info, "qgroup scan completed%s", 3818 ret > 0 ? " (inconsistency flag cleared)" : ""); 3819 } else { 3820 btrfs_err(fs_info, "qgroup scan failed with %d", ret); 3821 } 3822 } 3823 3824 /* 3825 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3826 * memory required for the rescan context. 
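 *
 * @init_flags == 0 means we are resuming a rescan that was found on disk
 * during mount (the RESCAN status flag must already be set); any other
 * value requests a brand new rescan and fails with -EINPROGRESS if one is
 * already queued.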
3827 */ 3828 static int 3829 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3830 int init_flags) 3831 { 3832 int ret = 0; 3833 3834 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3835 btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); 3836 return -EINVAL; 3837 } 3838 3839 if (!init_flags) { 3840 /* we're resuming qgroup rescan at mount time */ 3841 if (!(fs_info->qgroup_flags & 3842 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3843 btrfs_debug(fs_info, 3844 "qgroup rescan init failed, qgroup rescan is not queued"); 3845 ret = -EINVAL; 3846 } else if (!(fs_info->qgroup_flags & 3847 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3848 btrfs_debug(fs_info, 3849 "qgroup rescan init failed, qgroup is not enabled"); 3850 ret = -ENOTCONN; 3851 } 3852 3853 if (ret) 3854 return ret; 3855 } 3856 3857 mutex_lock(&fs_info->qgroup_rescan_lock); 3858 3859 if (init_flags) { 3860 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3861 ret = -EINPROGRESS; 3862 } else if (!(fs_info->qgroup_flags & 3863 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3864 btrfs_debug(fs_info, 3865 "qgroup rescan init failed, qgroup is not enabled"); 3866 ret = -ENOTCONN; 3867 } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { 3868 /* Quota disable is in progress */ 3869 ret = -EBUSY; 3870 } 3871 3872 if (ret) { 3873 mutex_unlock(&fs_info->qgroup_rescan_lock); 3874 return ret; 3875 } 3876 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3877 } 3878 3879 memset(&fs_info->qgroup_rescan_progress, 0, 3880 sizeof(fs_info->qgroup_rescan_progress)); 3881 fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 3882 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 3883 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3884 init_completion(&fs_info->qgroup_rescan_completion); 3885 mutex_unlock(&fs_info->qgroup_rescan_lock); 3886 3887 btrfs_init_work(&fs_info->qgroup_rescan_work, 3888 btrfs_qgroup_rescan_worker, NULL); 3889 return 0; 3890 } 3891 3892 static void 3893 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3894 { 3895 struct rb_node *n; 3896 struct btrfs_qgroup *qgroup; 3897 3898 spin_lock(&fs_info->qgroup_lock); 3899 /* clear all current qgroup tracking information */ 3900 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3901 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3902 qgroup->rfer = 0; 3903 qgroup->rfer_cmpr = 0; 3904 qgroup->excl = 0; 3905 qgroup->excl_cmpr = 0; 3906 qgroup_dirty(fs_info, qgroup); 3907 } 3908 spin_unlock(&fs_info->qgroup_lock); 3909 } 3910 3911 int 3912 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3913 { 3914 int ret = 0; 3915 struct btrfs_trans_handle *trans; 3916 3917 ret = qgroup_rescan_init(fs_info, 0, 1); 3918 if (ret) 3919 return ret; 3920 3921 /* 3922 * We have set the rescan_progress to 0, which means no more 3923 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3924 * However, btrfs_qgroup_account_ref may be right after its call 3925 * to btrfs_find_all_roots, in which case it would still do the 3926 * accounting. 3927 * To solve this, we're committing the transaction, which will 3928 * ensure we run all delayed refs and only after that, we are 3929 * going to clear all tracking information for a clean start. 
3930 */ 3931 3932 trans = btrfs_attach_transaction_barrier(fs_info->fs_root); 3933 if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { 3934 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3935 return PTR_ERR(trans); 3936 } else if (trans != ERR_PTR(-ENOENT)) { 3937 ret = btrfs_commit_transaction(trans); 3938 if (ret) { 3939 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3940 return ret; 3941 } 3942 } 3943 3944 qgroup_rescan_zero_tracking(fs_info); 3945 3946 mutex_lock(&fs_info->qgroup_rescan_lock); 3947 fs_info->qgroup_rescan_running = true; 3948 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3949 &fs_info->qgroup_rescan_work); 3950 mutex_unlock(&fs_info->qgroup_rescan_lock); 3951 3952 return 0; 3953 } 3954 3955 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3956 bool interruptible) 3957 { 3958 int running; 3959 int ret = 0; 3960 3961 mutex_lock(&fs_info->qgroup_rescan_lock); 3962 running = fs_info->qgroup_rescan_running; 3963 mutex_unlock(&fs_info->qgroup_rescan_lock); 3964 3965 if (!running) 3966 return 0; 3967 3968 if (interruptible) 3969 ret = wait_for_completion_interruptible( 3970 &fs_info->qgroup_rescan_completion); 3971 else 3972 wait_for_completion(&fs_info->qgroup_rescan_completion); 3973 3974 return ret; 3975 } 3976 3977 /* 3978 * this is only called from open_ctree where we're still single threaded, thus 3979 * locking is omitted here. 3980 */ 3981 void 3982 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3983 { 3984 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3985 mutex_lock(&fs_info->qgroup_rescan_lock); 3986 fs_info->qgroup_rescan_running = true; 3987 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3988 &fs_info->qgroup_rescan_work); 3989 mutex_unlock(&fs_info->qgroup_rescan_lock); 3990 } 3991 } 3992 3993 #define rbtree_iterate_from_safe(node, next, start) \ 3994 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3995 3996 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3997 struct extent_changeset *reserved, u64 start, 3998 u64 len) 3999 { 4000 struct rb_node *node; 4001 struct rb_node *next; 4002 struct ulist_node *entry; 4003 int ret = 0; 4004 4005 node = reserved->range_changed.root.rb_node; 4006 if (!node) 4007 return 0; 4008 while (node) { 4009 entry = rb_entry(node, struct ulist_node, rb_node); 4010 if (entry->val < start) 4011 node = node->rb_right; 4012 else 4013 node = node->rb_left; 4014 } 4015 4016 if (entry->val > start && rb_prev(&entry->rb_node)) 4017 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 4018 rb_node); 4019 4020 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 4021 u64 entry_start; 4022 u64 entry_end; 4023 u64 entry_len; 4024 int clear_ret; 4025 4026 entry = rb_entry(node, struct ulist_node, rb_node); 4027 entry_start = entry->val; 4028 entry_end = entry->aux; 4029 entry_len = entry_end - entry_start + 1; 4030 4031 if (entry_start >= start + len) 4032 break; 4033 if (entry_start + entry_len <= start) 4034 continue; 4035 /* 4036 * Now the entry is in [start, start + len), revert the 4037 * EXTENT_QGROUP_RESERVED bit. 
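 * (Strictly speaking the entry only overlaps [start, start + len); the
 * checks above have already skipped entries that end before @start or
 * begin at or after @start + @len.)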
4038 */ 4039 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 4040 entry_end, EXTENT_QGROUP_RESERVED); 4041 if (!ret && clear_ret < 0) 4042 ret = clear_ret; 4043 4044 ulist_del(&reserved->range_changed, entry->val, entry->aux); 4045 if (likely(reserved->bytes_changed >= entry_len)) { 4046 reserved->bytes_changed -= entry_len; 4047 } else { 4048 WARN_ON(1); 4049 reserved->bytes_changed = 0; 4050 } 4051 } 4052 4053 return ret; 4054 } 4055 4056 /* 4057 * Try to free some space for qgroup. 4058 * 4059 * For qgroup, there are only 3 ways to free qgroup space: 4060 * - Flush nodatacow write 4061 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 4062 * In theory, we should only flush nodatacow inodes, but it's not yet 4063 * possible, so we need to flush the whole root. 4064 * 4065 * - Wait for ordered extents 4066 * When ordered extents are finished, their reserved metadata is finally 4067 * converted to per_trans status, which can be freed by later commit 4068 * transaction. 4069 * 4070 * - Commit transaction 4071 * This would free the meta_per_trans space. 4072 * In theory this shouldn't provide much space, but any more qgroup space 4073 * is needed. 4074 */ 4075 static int try_flush_qgroup(struct btrfs_root *root) 4076 { 4077 struct btrfs_trans_handle *trans; 4078 int ret; 4079 4080 /* Can't hold an open transaction or we run the risk of deadlocking. */ 4081 ASSERT(current->journal_info == NULL); 4082 if (WARN_ON(current->journal_info)) 4083 return 0; 4084 4085 /* 4086 * We don't want to run flush again and again, so if there is a running 4087 * one, we won't try to start a new flush, but exit directly. 4088 */ 4089 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 4090 wait_event(root->qgroup_flush_wait, 4091 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 4092 return 0; 4093 } 4094 4095 ret = btrfs_start_delalloc_snapshot(root, true); 4096 if (ret < 0) 4097 goto out; 4098 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 4099 4100 trans = btrfs_attach_transaction_barrier(root); 4101 if (IS_ERR(trans)) { 4102 ret = PTR_ERR(trans); 4103 if (ret == -ENOENT) 4104 ret = 0; 4105 goto out; 4106 } 4107 4108 ret = btrfs_commit_transaction(trans); 4109 out: 4110 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 4111 wake_up(&root->qgroup_flush_wait); 4112 return ret; 4113 } 4114 4115 static int qgroup_reserve_data(struct btrfs_inode *inode, 4116 struct extent_changeset **reserved_ret, u64 start, 4117 u64 len) 4118 { 4119 struct btrfs_root *root = inode->root; 4120 struct extent_changeset *reserved; 4121 bool new_reserved = false; 4122 u64 orig_reserved; 4123 u64 to_reserve; 4124 int ret; 4125 4126 if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || 4127 !is_fstree(btrfs_root_id(root)) || len == 0) 4128 return 0; 4129 4130 /* @reserved parameter is mandatory for qgroup */ 4131 if (WARN_ON(!reserved_ret)) 4132 return -EINVAL; 4133 if (!*reserved_ret) { 4134 new_reserved = true; 4135 *reserved_ret = extent_changeset_alloc(); 4136 if (!*reserved_ret) 4137 return -ENOMEM; 4138 } 4139 reserved = *reserved_ret; 4140 /* Record already reserved space */ 4141 orig_reserved = reserved->bytes_changed; 4142 ret = set_record_extent_bits(&inode->io_tree, start, 4143 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 4144 4145 /* Newly reserved space */ 4146 to_reserve = reserved->bytes_changed - orig_reserved; 4147 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 4148 to_reserve, QGROUP_RESERVE); 4149 if (ret < 
0)
4150 goto out;
4151 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
4152 if (ret < 0)
4153 goto cleanup;
4154
4155 return ret;
4156
4157 cleanup:
4158 qgroup_unreserve_range(inode, reserved, start, len);
4159 out:
4160 if (new_reserved) {
4161 extent_changeset_free(reserved);
4162 *reserved_ret = NULL;
4163 }
4164 return ret;
4165 }
4166
4167 /*
4168 * Reserve qgroup space for range [start, start + len).
4169 *
4170 * This function will either reserve space from related qgroups or do nothing
4171 * if the range is already reserved.
4172 *
4173 * Return 0 for successful reservation
4174 * Return <0 for error (including -EDQUOT)
4175 *
4176 * NOTE: This function may sleep for memory allocation, dirty page flushing and
4177 * transaction commit, so the caller must not hold any dirty page locked.
4178 */
4179 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
4180 struct extent_changeset **reserved_ret, u64 start,
4181 u64 len)
4182 {
4183 int ret;
4184
4185 ret = qgroup_reserve_data(inode, reserved_ret, start, len);
4186 if (ret <= 0 && ret != -EDQUOT)
4187 return ret;
4188
4189 ret = try_flush_qgroup(inode->root);
4190 if (ret < 0)
4191 return ret;
4192 return qgroup_reserve_data(inode, reserved_ret, start, len);
4193 }
4194
4195 /* Free ranges specified by @reserved, normally in error path */
4196 static int qgroup_free_reserved_data(struct btrfs_inode *inode,
4197 struct extent_changeset *reserved,
4198 u64 start, u64 len, u64 *freed_ret)
4199 {
4200 struct btrfs_root *root = inode->root;
4201 struct ulist_node *unode;
4202 struct ulist_iterator uiter;
4203 struct extent_changeset changeset;
4204 u64 freed = 0;
4205 int ret;
4206
4207 extent_changeset_init(&changeset);
4208 len = round_up(start + len, root->fs_info->sectorsize);
4209 start = round_down(start, root->fs_info->sectorsize);
4210
4211 ULIST_ITER_INIT(&uiter);
4212 while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
4213 u64 range_start = unode->val;
4214 /* unode->aux is the inclusive end */
4215 u64 range_len = unode->aux - range_start + 1;
4216 u64 free_start;
4217 u64 free_len;
4218
4219 extent_changeset_release(&changeset);
4220
4221 /* Only free the part of the range overlapping [start, start + len) */
4222 if (range_start >= start + len ||
4223 range_start + range_len <= start)
4224 continue;
4225 free_start = max(range_start, start);
4226 free_len = min(start + len, range_start + range_len) -
4227 free_start;
4228 /*
4229 * TODO: Also update reserved->ranges_reserved to reflect
4230 * the modification.
4231 *
4232 * However, as long as we free qgroup reserved space according to
4233 * EXTENT_QGROUP_RESERVED, we won't double free.
4234 * So there is no need to rush.
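 *
 * The qgroup numbers are freed based on changeset.bytes_changed below,
 * i.e. only for ranges that really had EXTENT_QGROUP_RESERVED set, which
 * is what prevents a double free even without updating @reserved here.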
4235 */ 4236 ret = clear_record_extent_bits(&inode->io_tree, free_start, 4237 free_start + free_len - 1, 4238 EXTENT_QGROUP_RESERVED, &changeset); 4239 if (ret < 0) 4240 goto out; 4241 freed += changeset.bytes_changed; 4242 } 4243 btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed, 4244 BTRFS_QGROUP_RSV_DATA); 4245 if (freed_ret) 4246 *freed_ret = freed; 4247 ret = 0; 4248 out: 4249 extent_changeset_release(&changeset); 4250 return ret; 4251 } 4252 4253 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 4254 struct extent_changeset *reserved, u64 start, u64 len, 4255 u64 *released, int free) 4256 { 4257 struct extent_changeset changeset; 4258 int trace_op = QGROUP_RELEASE; 4259 int ret; 4260 4261 if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { 4262 extent_changeset_init(&changeset); 4263 return clear_record_extent_bits(&inode->io_tree, start, 4264 start + len - 1, 4265 EXTENT_QGROUP_RESERVED, &changeset); 4266 } 4267 4268 /* In release case, we shouldn't have @reserved */ 4269 WARN_ON(!free && reserved); 4270 if (free && reserved) 4271 return qgroup_free_reserved_data(inode, reserved, start, len, released); 4272 extent_changeset_init(&changeset); 4273 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 4274 EXTENT_QGROUP_RESERVED, &changeset); 4275 if (ret < 0) 4276 goto out; 4277 4278 if (free) 4279 trace_op = QGROUP_FREE; 4280 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 4281 changeset.bytes_changed, trace_op); 4282 if (free) 4283 btrfs_qgroup_free_refroot(inode->root->fs_info, 4284 btrfs_root_id(inode->root), 4285 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 4286 if (released) 4287 *released = changeset.bytes_changed; 4288 out: 4289 extent_changeset_release(&changeset); 4290 return ret; 4291 } 4292 4293 /* 4294 * Free a reserved space range from io_tree and related qgroups 4295 * 4296 * Should be called when a range of pages get invalidated before reaching disk. 4297 * Or for error cleanup case. 4298 * if @reserved is given, only reserved range in [@start, @start + @len) will 4299 * be freed. 4300 * 4301 * For data written to disk, use btrfs_qgroup_release_data(). 4302 * 4303 * NOTE: This function may sleep for memory allocation. 4304 */ 4305 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 4306 struct extent_changeset *reserved, 4307 u64 start, u64 len, u64 *freed) 4308 { 4309 return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); 4310 } 4311 4312 /* 4313 * Release a reserved space range from io_tree only. 4314 * 4315 * Should be called when a range of pages get written to disk and corresponding 4316 * FILE_EXTENT is inserted into corresponding root. 4317 * 4318 * Since new qgroup accounting framework will only update qgroup numbers at 4319 * commit_transaction() time, its reserved space shouldn't be freed from 4320 * related qgroups. 4321 * 4322 * But we should release the range from io_tree, to allow further write to be 4323 * COWed. 4324 * 4325 * NOTE: This function may sleep for memory allocation. 
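 *
 * A rough, illustrative ordering of the data reservation API on the write
 * path (not a literal call chain):
 *
 *   btrfs_qgroup_reserve_data()  - before dirtying the pages
 *   btrfs_qgroup_release_data()  - data reached disk, keep the accounting
 *                                  until transaction commit
 *   btrfs_qgroup_free_data()     - error/invalidation path, instead of the
 *                                  release above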
4326 */ 4327 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) 4328 { 4329 return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); 4330 } 4331 4332 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 4333 enum btrfs_qgroup_rsv_type type) 4334 { 4335 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 4336 type != BTRFS_QGROUP_RSV_META_PERTRANS) 4337 return; 4338 if (num_bytes == 0) 4339 return; 4340 4341 spin_lock(&root->qgroup_meta_rsv_lock); 4342 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 4343 root->qgroup_meta_rsv_prealloc += num_bytes; 4344 else 4345 root->qgroup_meta_rsv_pertrans += num_bytes; 4346 spin_unlock(&root->qgroup_meta_rsv_lock); 4347 } 4348 4349 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 4350 enum btrfs_qgroup_rsv_type type) 4351 { 4352 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 4353 type != BTRFS_QGROUP_RSV_META_PERTRANS) 4354 return 0; 4355 if (num_bytes == 0) 4356 return 0; 4357 4358 spin_lock(&root->qgroup_meta_rsv_lock); 4359 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 4360 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 4361 num_bytes); 4362 root->qgroup_meta_rsv_prealloc -= num_bytes; 4363 } else { 4364 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 4365 num_bytes); 4366 root->qgroup_meta_rsv_pertrans -= num_bytes; 4367 } 4368 spin_unlock(&root->qgroup_meta_rsv_lock); 4369 return num_bytes; 4370 } 4371 4372 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4373 enum btrfs_qgroup_rsv_type type, bool enforce) 4374 { 4375 struct btrfs_fs_info *fs_info = root->fs_info; 4376 int ret; 4377 4378 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || 4379 !is_fstree(btrfs_root_id(root)) || num_bytes == 0) 4380 return 0; 4381 4382 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4383 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 4384 ret = qgroup_reserve(root, num_bytes, enforce, type); 4385 if (ret < 0) 4386 return ret; 4387 /* 4388 * Record what we have reserved into root. 4389 * 4390 * To avoid quota disabled->enabled underflow. 4391 * In that case, we may try to free space we haven't reserved 4392 * (since quota was disabled), so record what we reserved into root. 4393 * And ensure later release won't underflow this number. 
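 *
 * add_root_meta_rsv() and sub_root_meta_rsv() above implement that
 * bookkeeping: the per-root counters are clamped on the release side, so
 * freeing more than was recorded only frees the recorded amount.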
4394 */ 4395 add_root_meta_rsv(root, num_bytes, type); 4396 return ret; 4397 } 4398 4399 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4400 enum btrfs_qgroup_rsv_type type, bool enforce, 4401 bool noflush) 4402 { 4403 int ret; 4404 4405 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4406 if ((ret <= 0 && ret != -EDQUOT) || noflush) 4407 return ret; 4408 4409 ret = try_flush_qgroup(root); 4410 if (ret < 0) 4411 return ret; 4412 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4413 } 4414 4415 /* 4416 * Per-transaction meta reservation should be all freed at transaction commit 4417 * time 4418 */ 4419 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 4420 { 4421 struct btrfs_fs_info *fs_info = root->fs_info; 4422 4423 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || 4424 !is_fstree(btrfs_root_id(root))) 4425 return; 4426 4427 /* TODO: Update trace point to handle such free */ 4428 trace_qgroup_meta_free_all_pertrans(root); 4429 /* Special value -1 means to free all reserved space */ 4430 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, 4431 BTRFS_QGROUP_RSV_META_PERTRANS); 4432 } 4433 4434 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 4435 enum btrfs_qgroup_rsv_type type) 4436 { 4437 struct btrfs_fs_info *fs_info = root->fs_info; 4438 4439 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || 4440 !is_fstree(btrfs_root_id(root))) 4441 return; 4442 4443 /* 4444 * reservation for META_PREALLOC can happen before quota is enabled, 4445 * which can lead to underflow. 4446 * Here ensure we will only free what we really have reserved. 4447 */ 4448 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 4449 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4450 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 4451 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); 4452 } 4453 4454 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 4455 int num_bytes) 4456 { 4457 struct btrfs_qgroup *qgroup; 4458 LIST_HEAD(qgroup_list); 4459 4460 if (num_bytes == 0) 4461 return; 4462 if (!fs_info->quota_root) 4463 return; 4464 4465 spin_lock(&fs_info->qgroup_lock); 4466 qgroup = find_qgroup_rb(fs_info, ref_root); 4467 if (!qgroup) 4468 goto out; 4469 4470 qgroup_iterator_add(&qgroup_list, qgroup); 4471 list_for_each_entry(qgroup, &qgroup_list, iterator) { 4472 struct btrfs_qgroup_list *glist; 4473 4474 qgroup_rsv_release(fs_info, qgroup, num_bytes, 4475 BTRFS_QGROUP_RSV_META_PREALLOC); 4476 if (!sb_rdonly(fs_info->sb)) 4477 qgroup_rsv_add(fs_info, qgroup, num_bytes, 4478 BTRFS_QGROUP_RSV_META_PERTRANS); 4479 4480 list_for_each_entry(glist, &qgroup->groups, next_group) 4481 qgroup_iterator_add(&qgroup_list, glist->group); 4482 } 4483 out: 4484 qgroup_iterator_clean(&qgroup_list); 4485 spin_unlock(&fs_info->qgroup_lock); 4486 } 4487 4488 /* 4489 * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. 4490 * 4491 * This is called when preallocated meta reservation needs to be used. 4492 * Normally after btrfs_join_transaction() call. 
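 *
 * A minimal, illustrative sketch of the expected pattern:
 *
 *   btrfs_qgroup_reserve_meta(root, bytes, BTRFS_QGROUP_RSV_META_PREALLOC, true);
 *   trans = btrfs_join_transaction(root);
 *   btrfs_qgroup_convert_reserved_meta(root, bytes);
 *   ...
 *   btrfs_end_transaction(trans);
 *
 * After the conversion the reservation is accounted as PERTRANS and gets
 * freed together with all other pertrans reservations at transaction commit.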
4493 */ 4494 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 4495 { 4496 struct btrfs_fs_info *fs_info = root->fs_info; 4497 4498 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || 4499 !is_fstree(btrfs_root_id(root))) 4500 return; 4501 /* Same as btrfs_qgroup_free_meta_prealloc() */ 4502 num_bytes = sub_root_meta_rsv(root, num_bytes, 4503 BTRFS_QGROUP_RSV_META_PREALLOC); 4504 trace_qgroup_meta_convert(root, num_bytes); 4505 qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); 4506 if (!sb_rdonly(fs_info->sb)) 4507 add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); 4508 } 4509 4510 /* 4511 * Check qgroup reserved space leaking, normally at destroy inode 4512 * time 4513 */ 4514 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) 4515 { 4516 struct extent_changeset changeset; 4517 struct ulist_node *unode; 4518 struct ulist_iterator iter; 4519 int ret; 4520 4521 extent_changeset_init(&changeset); 4522 ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, 4523 EXTENT_QGROUP_RESERVED, &changeset); 4524 4525 WARN_ON(ret < 0); 4526 if (WARN_ON(changeset.bytes_changed)) { 4527 ULIST_ITER_INIT(&iter); 4528 while ((unode = ulist_next(&changeset.range_changed, &iter))) { 4529 btrfs_warn(inode->root->fs_info, 4530 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", 4531 btrfs_ino(inode), unode->val, unode->aux); 4532 } 4533 btrfs_qgroup_free_refroot(inode->root->fs_info, 4534 btrfs_root_id(inode->root), 4535 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 4536 4537 } 4538 extent_changeset_release(&changeset); 4539 } 4540 4541 void btrfs_qgroup_init_swapped_blocks( 4542 struct btrfs_qgroup_swapped_blocks *swapped_blocks) 4543 { 4544 int i; 4545 4546 spin_lock_init(&swapped_blocks->lock); 4547 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 4548 swapped_blocks->blocks[i] = RB_ROOT; 4549 swapped_blocks->swapped = false; 4550 } 4551 4552 /* 4553 * Delete all swapped blocks record of @root. 4554 * Every record here means we skipped a full subtree scan for qgroup. 4555 * 4556 * Gets called when committing one transaction. 4557 */ 4558 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) 4559 { 4560 struct btrfs_qgroup_swapped_blocks *swapped_blocks; 4561 int i; 4562 4563 swapped_blocks = &root->swapped_blocks; 4564 4565 spin_lock(&swapped_blocks->lock); 4566 if (!swapped_blocks->swapped) 4567 goto out; 4568 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4569 struct rb_root *cur_root = &swapped_blocks->blocks[i]; 4570 struct btrfs_qgroup_swapped_block *entry; 4571 struct btrfs_qgroup_swapped_block *next; 4572 4573 rbtree_postorder_for_each_entry_safe(entry, next, cur_root, 4574 node) 4575 kfree(entry); 4576 swapped_blocks->blocks[i] = RB_ROOT; 4577 } 4578 swapped_blocks->swapped = false; 4579 out: 4580 spin_unlock(&swapped_blocks->lock); 4581 } 4582 4583 /* 4584 * Add subtree roots record into @subvol_root. 
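 * Every record added here stands for one subtree whose full qgroup trace was
 * skipped during the relocation tree swap; the trace is replayed later, when
 * the subvolume tree block gets COWed, see
 * btrfs_qgroup_trace_subtree_after_cow().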
4585 * 4586 * @subvol_root: tree root of the subvolume tree get swapped 4587 * @bg: block group under balance 4588 * @subvol_parent/slot: pointer to the subtree root in subvolume tree 4589 * @reloc_parent/slot: pointer to the subtree root in reloc tree 4590 * BOTH POINTERS ARE BEFORE TREE SWAP 4591 * @last_snapshot: last snapshot generation of the subvolume tree 4592 */ 4593 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, 4594 struct btrfs_root *subvol_root, 4595 struct btrfs_block_group *bg, 4596 struct extent_buffer *subvol_parent, int subvol_slot, 4597 struct extent_buffer *reloc_parent, int reloc_slot, 4598 u64 last_snapshot) 4599 { 4600 struct btrfs_fs_info *fs_info = subvol_root->fs_info; 4601 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; 4602 struct btrfs_qgroup_swapped_block *block; 4603 struct rb_node **cur; 4604 struct rb_node *parent = NULL; 4605 int level = btrfs_header_level(subvol_parent) - 1; 4606 int ret = 0; 4607 4608 if (!btrfs_qgroup_full_accounting(fs_info)) 4609 return 0; 4610 4611 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > 4612 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { 4613 btrfs_err_rl(fs_info, 4614 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", 4615 __func__, 4616 btrfs_node_ptr_generation(subvol_parent, subvol_slot), 4617 btrfs_node_ptr_generation(reloc_parent, reloc_slot)); 4618 return -EUCLEAN; 4619 } 4620 4621 block = kmalloc(sizeof(*block), GFP_NOFS); 4622 if (!block) { 4623 ret = -ENOMEM; 4624 goto out; 4625 } 4626 4627 /* 4628 * @reloc_parent/slot is still before swap, while @block is going to 4629 * record the bytenr after swap, so we do the swap here. 4630 */ 4631 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); 4632 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, 4633 reloc_slot); 4634 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); 4635 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, 4636 subvol_slot); 4637 block->last_snapshot = last_snapshot; 4638 block->level = level; 4639 4640 /* 4641 * If we have bg == NULL, we're called from btrfs_recover_relocation(), 4642 * no one else can modify tree blocks thus we qgroup will not change 4643 * no matter the value of trace_leaf. 4644 */ 4645 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) 4646 block->trace_leaf = true; 4647 else 4648 block->trace_leaf = false; 4649 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); 4650 4651 /* Insert @block into @blocks */ 4652 spin_lock(&blocks->lock); 4653 cur = &blocks->blocks[level].rb_node; 4654 while (*cur) { 4655 struct btrfs_qgroup_swapped_block *entry; 4656 4657 parent = *cur; 4658 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, 4659 node); 4660 4661 if (entry->subvol_bytenr < block->subvol_bytenr) { 4662 cur = &(*cur)->rb_left; 4663 } else if (entry->subvol_bytenr > block->subvol_bytenr) { 4664 cur = &(*cur)->rb_right; 4665 } else { 4666 if (entry->subvol_generation != 4667 block->subvol_generation || 4668 entry->reloc_bytenr != block->reloc_bytenr || 4669 entry->reloc_generation != 4670 block->reloc_generation) { 4671 /* 4672 * Duplicated but mismatch entry found. 4673 * Shouldn't happen. 4674 * 4675 * Marking qgroup inconsistent should be enough 4676 * for end users. 
4677 */ 4678 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 4679 ret = -EEXIST; 4680 } 4681 kfree(block); 4682 goto out_unlock; 4683 } 4684 } 4685 rb_link_node(&block->node, parent, cur); 4686 rb_insert_color(&block->node, &blocks->blocks[level]); 4687 blocks->swapped = true; 4688 out_unlock: 4689 spin_unlock(&blocks->lock); 4690 out: 4691 if (ret < 0) 4692 qgroup_mark_inconsistent(fs_info); 4693 return ret; 4694 } 4695 4696 /* 4697 * Check if the tree block is a subtree root, and if so do the needed 4698 * delayed subtree trace for qgroup. 4699 * 4700 * This is called during btrfs_cow_block(). 4701 */ 4702 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, 4703 struct btrfs_root *root, 4704 struct extent_buffer *subvol_eb) 4705 { 4706 struct btrfs_fs_info *fs_info = root->fs_info; 4707 struct btrfs_tree_parent_check check = { 0 }; 4708 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; 4709 struct btrfs_qgroup_swapped_block *block; 4710 struct extent_buffer *reloc_eb = NULL; 4711 struct rb_node *node; 4712 bool found = false; 4713 bool swapped = false; 4714 int level = btrfs_header_level(subvol_eb); 4715 int ret = 0; 4716 int i; 4717 4718 if (!btrfs_qgroup_full_accounting(fs_info)) 4719 return 0; 4720 if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) 4721 return 0; 4722 4723 spin_lock(&blocks->lock); 4724 if (!blocks->swapped) { 4725 spin_unlock(&blocks->lock); 4726 return 0; 4727 } 4728 node = blocks->blocks[level].rb_node; 4729 4730 while (node) { 4731 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); 4732 if (block->subvol_bytenr < subvol_eb->start) { 4733 node = node->rb_left; 4734 } else if (block->subvol_bytenr > subvol_eb->start) { 4735 node = node->rb_right; 4736 } else { 4737 found = true; 4738 break; 4739 } 4740 } 4741 if (!found) { 4742 spin_unlock(&blocks->lock); 4743 goto out; 4744 } 4745 /* Found one, remove it from @blocks first and update blocks->swapped */ 4746 rb_erase(&block->node, &blocks->blocks[level]); 4747 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4748 if (RB_EMPTY_ROOT(&blocks->blocks[i])) { 4749 swapped = true; 4750 break; 4751 } 4752 } 4753 blocks->swapped = swapped; 4754 spin_unlock(&blocks->lock); 4755 4756 check.level = block->level; 4757 check.transid = block->reloc_generation; 4758 check.has_first_key = true; 4759 memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); 4760 4761 /* Read out reloc subtree root */ 4762 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); 4763 if (IS_ERR(reloc_eb)) { 4764 ret = PTR_ERR(reloc_eb); 4765 reloc_eb = NULL; 4766 goto free_out; 4767 } 4768 if (!extent_buffer_uptodate(reloc_eb)) { 4769 ret = -EIO; 4770 goto free_out; 4771 } 4772 4773 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, 4774 block->last_snapshot, block->trace_leaf); 4775 free_out: 4776 kfree(block); 4777 free_extent_buffer(reloc_eb); 4778 out: 4779 if (ret < 0) { 4780 btrfs_err_rl(fs_info, 4781 "failed to account subtree at bytenr %llu: %d", 4782 subvol_eb->start, ret); 4783 qgroup_mark_inconsistent(fs_info); 4784 } 4785 return ret; 4786 } 4787 4788 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) 4789 { 4790 struct btrfs_qgroup_extent_record *entry; 4791 struct btrfs_qgroup_extent_record *next; 4792 struct rb_root *root; 4793 4794 root = &trans->delayed_refs.dirty_extent_root; 4795 rbtree_postorder_for_each_entry_safe(entry, next, root, node) { 4796 ulist_free(entry->old_roots); 4797 kfree(entry); 4798 } 4799 *root = RB_ROOT; 4800 } 4801 
4802 void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) 4803 { 4804 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) 4805 return; 4806 4807 if (!is_fstree(root)) 4808 return; 4809 4810 btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); 4811 } 4812 4813 int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, 4814 struct btrfs_squota_delta *delta) 4815 { 4816 int ret; 4817 struct btrfs_qgroup *qgroup; 4818 struct btrfs_qgroup *qg; 4819 LIST_HEAD(qgroup_list); 4820 u64 root = delta->root; 4821 u64 num_bytes = delta->num_bytes; 4822 const int sign = (delta->is_inc ? 1 : -1); 4823 4824 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) 4825 return 0; 4826 4827 if (!is_fstree(root)) 4828 return 0; 4829 4830 /* If the extent predates enabling quotas, don't count it. */ 4831 if (delta->generation < fs_info->qgroup_enable_gen) 4832 return 0; 4833 4834 spin_lock(&fs_info->qgroup_lock); 4835 qgroup = find_qgroup_rb(fs_info, root); 4836 if (!qgroup) { 4837 ret = -ENOENT; 4838 goto out; 4839 } 4840 4841 ret = 0; 4842 qgroup_iterator_add(&qgroup_list, qgroup); 4843 list_for_each_entry(qg, &qgroup_list, iterator) { 4844 struct btrfs_qgroup_list *glist; 4845 4846 qg->excl += num_bytes * sign; 4847 qg->rfer += num_bytes * sign; 4848 qgroup_dirty(fs_info, qg); 4849 4850 list_for_each_entry(glist, &qg->groups, next_group) 4851 qgroup_iterator_add(&qgroup_list, glist->group); 4852 } 4853 qgroup_iterator_clean(&qgroup_list); 4854 4855 out: 4856 spin_unlock(&fs_info->qgroup_lock); 4857 return ret; 4858 } 4859