1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 #include "fs.h" 28 #include "accessors.h" 29 #include "extent-tree.h" 30 #include "root-tree.h" 31 #include "tree-checker.h" 32 33 enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) 34 { 35 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 36 return BTRFS_QGROUP_MODE_DISABLED; 37 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) 38 return BTRFS_QGROUP_MODE_SIMPLE; 39 return BTRFS_QGROUP_MODE_FULL; 40 } 41 42 bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) 43 { 44 return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; 45 } 46 47 bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) 48 { 49 return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; 50 } 51 52 /* 53 * Helpers to access qgroup reservation 54 * 55 * Callers should ensure the lock context and type are valid 56 */ 57 58 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 59 { 60 u64 ret = 0; 61 int i; 62 63 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 64 ret += qgroup->rsv.values[i]; 65 66 return ret; 67 } 68 69 #ifdef CONFIG_BTRFS_DEBUG 70 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 71 { 72 if (type == BTRFS_QGROUP_RSV_DATA) 73 return "data"; 74 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 75 return "meta_pertrans"; 76 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 77 return "meta_prealloc"; 78 return NULL; 79 } 80 #endif 81 82 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 83 struct btrfs_qgroup *qgroup, u64 num_bytes, 84 enum btrfs_qgroup_rsv_type type) 85 { 86 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 87 qgroup->rsv.values[type] += num_bytes; 88 } 89 90 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 91 struct btrfs_qgroup *qgroup, u64 num_bytes, 92 enum btrfs_qgroup_rsv_type type) 93 { 94 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 95 if (qgroup->rsv.values[type] >= num_bytes) { 96 qgroup->rsv.values[type] -= num_bytes; 97 return; 98 } 99 #ifdef CONFIG_BTRFS_DEBUG 100 WARN_RATELIMIT(1, 101 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 102 qgroup->qgroupid, qgroup_rsv_type_str(type), 103 qgroup->rsv.values[type], num_bytes); 104 #endif 105 qgroup->rsv.values[type] = 0; 106 } 107 108 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 109 struct btrfs_qgroup *dest, 110 struct btrfs_qgroup *src) 111 { 112 int i; 113 114 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 115 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 116 } 117 118 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 119 struct btrfs_qgroup *dest, 120 struct btrfs_qgroup *src) 121 { 122 int i; 123 124 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 125 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 126 } 127 128 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 129 
int mod) 130 { 131 if (qg->old_refcnt < seq) 132 qg->old_refcnt = seq; 133 qg->old_refcnt += mod; 134 } 135 136 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 137 int mod) 138 { 139 if (qg->new_refcnt < seq) 140 qg->new_refcnt = seq; 141 qg->new_refcnt += mod; 142 } 143 144 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 145 { 146 if (qg->old_refcnt < seq) 147 return 0; 148 return qg->old_refcnt - seq; 149 } 150 151 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 152 { 153 if (qg->new_refcnt < seq) 154 return 0; 155 return qg->new_refcnt - seq; 156 } 157 158 /* 159 * glue structure to represent the relations between qgroups. 160 */ 161 struct btrfs_qgroup_list { 162 struct list_head next_group; 163 struct list_head next_member; 164 struct btrfs_qgroup *group; 165 struct btrfs_qgroup *member; 166 }; 167 168 static int 169 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 170 int init_flags); 171 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 172 173 /* must be called with qgroup_ioctl_lock held */ 174 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 175 u64 qgroupid) 176 { 177 struct rb_node *n = fs_info->qgroup_tree.rb_node; 178 struct btrfs_qgroup *qgroup; 179 180 while (n) { 181 qgroup = rb_entry(n, struct btrfs_qgroup, node); 182 if (qgroup->qgroupid < qgroupid) 183 n = n->rb_left; 184 else if (qgroup->qgroupid > qgroupid) 185 n = n->rb_right; 186 else 187 return qgroup; 188 } 189 return NULL; 190 } 191 192 /* 193 * Add qgroup to the filesystem's qgroup tree. 194 * 195 * Must be called with qgroup_lock held and @prealloc preallocated. 196 * 197 * The control on the lifespan of @prealloc would be transferred to this 198 * function, thus caller should no longer touch @prealloc. 199 */ 200 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 201 struct btrfs_qgroup *prealloc, 202 u64 qgroupid) 203 { 204 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 205 struct rb_node *parent = NULL; 206 struct btrfs_qgroup *qgroup; 207 208 /* Caller must have pre-allocated @prealloc. 
*/ 209 ASSERT(prealloc); 210 211 while (*p) { 212 parent = *p; 213 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 214 215 if (qgroup->qgroupid < qgroupid) { 216 p = &(*p)->rb_left; 217 } else if (qgroup->qgroupid > qgroupid) { 218 p = &(*p)->rb_right; 219 } else { 220 kfree(prealloc); 221 return qgroup; 222 } 223 } 224 225 qgroup = prealloc; 226 qgroup->qgroupid = qgroupid; 227 INIT_LIST_HEAD(&qgroup->groups); 228 INIT_LIST_HEAD(&qgroup->members); 229 INIT_LIST_HEAD(&qgroup->dirty); 230 INIT_LIST_HEAD(&qgroup->iterator); 231 INIT_LIST_HEAD(&qgroup->nested_iterator); 232 233 rb_link_node(&qgroup->node, parent, p); 234 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 235 236 return qgroup; 237 } 238 239 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 240 struct btrfs_qgroup *qgroup) 241 { 242 struct btrfs_qgroup_list *list; 243 244 list_del(&qgroup->dirty); 245 while (!list_empty(&qgroup->groups)) { 246 list = list_first_entry(&qgroup->groups, 247 struct btrfs_qgroup_list, next_group); 248 list_del(&list->next_group); 249 list_del(&list->next_member); 250 kfree(list); 251 } 252 253 while (!list_empty(&qgroup->members)) { 254 list = list_first_entry(&qgroup->members, 255 struct btrfs_qgroup_list, next_member); 256 list_del(&list->next_group); 257 list_del(&list->next_member); 258 kfree(list); 259 } 260 } 261 262 /* must be called with qgroup_lock held */ 263 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 264 { 265 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 266 267 if (!qgroup) 268 return -ENOENT; 269 270 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 271 __del_qgroup_rb(fs_info, qgroup); 272 return 0; 273 } 274 275 /* 276 * Add relation specified by two qgroups. 277 * 278 * Must be called with qgroup_lock held, the ownership of @prealloc is 279 * transferred to this function and caller should not touch it anymore. 280 * 281 * Return: 0 on success 282 * -ENOENT if one of the qgroups is NULL 283 * <0 other errors 284 */ 285 static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, 286 struct btrfs_qgroup *member, 287 struct btrfs_qgroup *parent) 288 { 289 if (!member || !parent) { 290 kfree(prealloc); 291 return -ENOENT; 292 } 293 294 prealloc->group = parent; 295 prealloc->member = member; 296 list_add_tail(&prealloc->next_group, &member->groups); 297 list_add_tail(&prealloc->next_member, &parent->members); 298 299 return 0; 300 } 301 302 /* 303 * Add relation specified by two qgroup ids. 304 * 305 * Must be called with qgroup_lock held. 
306 * 307 * Return: 0 on success 308 * -ENOENT if one of the ids does not exist 309 * <0 other errors 310 */ 311 static int add_relation_rb(struct btrfs_fs_info *fs_info, 312 struct btrfs_qgroup_list *prealloc, 313 u64 memberid, u64 parentid) 314 { 315 struct btrfs_qgroup *member; 316 struct btrfs_qgroup *parent; 317 318 member = find_qgroup_rb(fs_info, memberid); 319 parent = find_qgroup_rb(fs_info, parentid); 320 321 return __add_relation_rb(prealloc, member, parent); 322 } 323 324 /* Must be called with qgroup_lock held */ 325 static int del_relation_rb(struct btrfs_fs_info *fs_info, 326 u64 memberid, u64 parentid) 327 { 328 struct btrfs_qgroup *member; 329 struct btrfs_qgroup *parent; 330 struct btrfs_qgroup_list *list; 331 332 member = find_qgroup_rb(fs_info, memberid); 333 parent = find_qgroup_rb(fs_info, parentid); 334 if (!member || !parent) 335 return -ENOENT; 336 337 list_for_each_entry(list, &member->groups, next_group) { 338 if (list->group == parent) { 339 list_del(&list->next_group); 340 list_del(&list->next_member); 341 kfree(list); 342 return 0; 343 } 344 } 345 return -ENOENT; 346 } 347 348 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 349 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 350 u64 rfer, u64 excl) 351 { 352 struct btrfs_qgroup *qgroup; 353 354 qgroup = find_qgroup_rb(fs_info, qgroupid); 355 if (!qgroup) 356 return -EINVAL; 357 if (qgroup->rfer != rfer || qgroup->excl != excl) 358 return -EINVAL; 359 return 0; 360 } 361 #endif 362 363 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) 364 { 365 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 366 return; 367 fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | 368 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 369 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 370 } 371 372 static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, 373 struct extent_buffer *leaf, int slot, 374 struct btrfs_qgroup_status_item *ptr) 375 { 376 ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); 377 ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); 378 fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); 379 } 380 381 /* 382 * The full config is read in one go, only called from open_ctree() 383 * It doesn't use any locking, as at this point we're still single-threaded 384 */ 385 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 386 { 387 struct btrfs_key key; 388 struct btrfs_key found_key; 389 struct btrfs_root *quota_root = fs_info->quota_root; 390 struct btrfs_path *path = NULL; 391 struct extent_buffer *l; 392 int slot; 393 int ret = 0; 394 u64 flags = 0; 395 u64 rescan_progress = 0; 396 397 if (!fs_info->quota_root) 398 return 0; 399 400 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 401 if (!fs_info->qgroup_ulist) { 402 ret = -ENOMEM; 403 goto out; 404 } 405 406 path = btrfs_alloc_path(); 407 if (!path) { 408 ret = -ENOMEM; 409 goto out; 410 } 411 412 ret = btrfs_sysfs_add_qgroups(fs_info); 413 if (ret < 0) 414 goto out; 415 /* default this to quota off, in case no status key is found */ 416 fs_info->qgroup_flags = 0; 417 418 /* 419 * pass 1: read status, all qgroup infos and limits 420 */ 421 key.objectid = 0; 422 key.type = 0; 423 key.offset = 0; 424 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 425 if (ret) 426 goto out; 427 428 while (1) { 429 struct btrfs_qgroup *qgroup; 430 431 slot = path->slots[0]; 432 l = path->nodes[0]; 433 btrfs_item_key_to_cpu(l, &found_key, slot); 434 435 if (found_key.type == 
BTRFS_QGROUP_STATUS_KEY) { 436 struct btrfs_qgroup_status_item *ptr; 437 438 ptr = btrfs_item_ptr(l, slot, 439 struct btrfs_qgroup_status_item); 440 441 if (btrfs_qgroup_status_version(l, ptr) != 442 BTRFS_QGROUP_STATUS_VERSION) { 443 btrfs_err(fs_info, 444 "old qgroup version, quota disabled"); 445 goto out; 446 } 447 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); 448 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { 449 qgroup_read_enable_gen(fs_info, l, slot, ptr); 450 } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { 451 qgroup_mark_inconsistent(fs_info); 452 btrfs_err(fs_info, 453 "qgroup generation mismatch, marked as inconsistent"); 454 } 455 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 456 goto next1; 457 } 458 459 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 460 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 461 goto next1; 462 463 qgroup = find_qgroup_rb(fs_info, found_key.offset); 464 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 465 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 466 btrfs_err(fs_info, "inconsistent qgroup config"); 467 qgroup_mark_inconsistent(fs_info); 468 } 469 if (!qgroup) { 470 struct btrfs_qgroup *prealloc; 471 struct btrfs_root *tree_root = fs_info->tree_root; 472 473 prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); 474 if (!prealloc) { 475 ret = -ENOMEM; 476 goto out; 477 } 478 qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); 479 /* 480 * If a qgroup exists for a subvolume ID, it is possible 481 * that subvolume has been deleted, in which case 482 * re-using that ID would lead to incorrect accounting. 483 * 484 * Ensure that we skip any such subvol ids. 485 * 486 * We don't need to lock because this is only called 487 * during mount before we start doing things like creating 488 * subvolumes. 489 */ 490 if (is_fstree(qgroup->qgroupid) && 491 qgroup->qgroupid > tree_root->free_objectid) 492 /* 493 * Don't need to check against BTRFS_LAST_FREE_OBJECTID, 494 * as it will get checked on the next call to 495 * btrfs_get_free_objectid. 
496 */ 497 tree_root->free_objectid = qgroup->qgroupid + 1; 498 } 499 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 500 if (ret < 0) 501 goto out; 502 503 switch (found_key.type) { 504 case BTRFS_QGROUP_INFO_KEY: { 505 struct btrfs_qgroup_info_item *ptr; 506 507 ptr = btrfs_item_ptr(l, slot, 508 struct btrfs_qgroup_info_item); 509 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 510 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 511 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 512 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 513 /* generation currently unused */ 514 break; 515 } 516 case BTRFS_QGROUP_LIMIT_KEY: { 517 struct btrfs_qgroup_limit_item *ptr; 518 519 ptr = btrfs_item_ptr(l, slot, 520 struct btrfs_qgroup_limit_item); 521 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 522 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 523 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 524 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 525 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 526 break; 527 } 528 } 529 next1: 530 ret = btrfs_next_item(quota_root, path); 531 if (ret < 0) 532 goto out; 533 if (ret) 534 break; 535 } 536 btrfs_release_path(path); 537 538 /* 539 * pass 2: read all qgroup relations 540 */ 541 key.objectid = 0; 542 key.type = BTRFS_QGROUP_RELATION_KEY; 543 key.offset = 0; 544 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 545 if (ret) 546 goto out; 547 while (1) { 548 struct btrfs_qgroup_list *list = NULL; 549 550 slot = path->slots[0]; 551 l = path->nodes[0]; 552 btrfs_item_key_to_cpu(l, &found_key, slot); 553 554 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 555 goto next2; 556 557 if (found_key.objectid > found_key.offset) { 558 /* parent <- member, not needed to build config */ 559 /* FIXME should we omit the key completely? */ 560 goto next2; 561 } 562 563 list = kzalloc(sizeof(*list), GFP_KERNEL); 564 if (!list) { 565 ret = -ENOMEM; 566 goto out; 567 } 568 ret = add_relation_rb(fs_info, list, found_key.objectid, 569 found_key.offset); 570 list = NULL; 571 if (ret == -ENOENT) { 572 btrfs_warn(fs_info, 573 "orphan qgroup relation 0x%llx->0x%llx", 574 found_key.objectid, found_key.offset); 575 ret = 0; /* ignore the error */ 576 } 577 if (ret) 578 goto out; 579 next2: 580 ret = btrfs_next_item(quota_root, path); 581 if (ret < 0) 582 goto out; 583 if (ret) 584 break; 585 } 586 out: 587 btrfs_free_path(path); 588 fs_info->qgroup_flags |= flags; 589 if (ret >= 0) { 590 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) 591 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 592 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 593 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 594 } else { 595 ulist_free(fs_info->qgroup_ulist); 596 fs_info->qgroup_ulist = NULL; 597 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 598 btrfs_sysfs_del_qgroups(fs_info); 599 } 600 601 return ret < 0 ? ret : 0; 602 } 603 604 /* 605 * Called in close_ctree() when quota is still enabled. This verifies we don't 606 * leak some reserved space. 607 * 608 * Return false if no reserved space is left. 609 * Return true if some reserved space is leaked. 610 */ 611 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 612 { 613 struct rb_node *node; 614 bool ret = false; 615 616 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) 617 return ret; 618 /* 619 * Since we're unmounting, there is no race and no need to grab qgroup 620 * lock. 
And here we don't go post-order to provide a more user 621 * friendly sorted result. 622 */ 623 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 624 struct btrfs_qgroup *qgroup; 625 int i; 626 627 qgroup = rb_entry(node, struct btrfs_qgroup, node); 628 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 629 if (qgroup->rsv.values[i]) { 630 ret = true; 631 btrfs_warn(fs_info, 632 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 633 btrfs_qgroup_level(qgroup->qgroupid), 634 btrfs_qgroup_subvolid(qgroup->qgroupid), 635 i, qgroup->rsv.values[i]); 636 } 637 } 638 } 639 return ret; 640 } 641 642 /* 643 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 644 * first two are in single-threaded paths.And for the third one, we have set 645 * quota_root to be null with qgroup_lock held before, so it is safe to clean 646 * up the in-memory structures without qgroup_lock held. 647 */ 648 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 649 { 650 struct rb_node *n; 651 struct btrfs_qgroup *qgroup; 652 653 while ((n = rb_first(&fs_info->qgroup_tree))) { 654 qgroup = rb_entry(n, struct btrfs_qgroup, node); 655 rb_erase(n, &fs_info->qgroup_tree); 656 __del_qgroup_rb(fs_info, qgroup); 657 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 658 kfree(qgroup); 659 } 660 /* 661 * We call btrfs_free_qgroup_config() when unmounting 662 * filesystem and disabling quota, so we set qgroup_ulist 663 * to be null here to avoid double free. 664 */ 665 ulist_free(fs_info->qgroup_ulist); 666 fs_info->qgroup_ulist = NULL; 667 btrfs_sysfs_del_qgroups(fs_info); 668 } 669 670 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 671 u64 dst) 672 { 673 int ret; 674 struct btrfs_root *quota_root = trans->fs_info->quota_root; 675 struct btrfs_path *path; 676 struct btrfs_key key; 677 678 path = btrfs_alloc_path(); 679 if (!path) 680 return -ENOMEM; 681 682 key.objectid = src; 683 key.type = BTRFS_QGROUP_RELATION_KEY; 684 key.offset = dst; 685 686 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 687 688 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 689 690 btrfs_free_path(path); 691 return ret; 692 } 693 694 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 695 u64 dst) 696 { 697 int ret; 698 struct btrfs_root *quota_root = trans->fs_info->quota_root; 699 struct btrfs_path *path; 700 struct btrfs_key key; 701 702 path = btrfs_alloc_path(); 703 if (!path) 704 return -ENOMEM; 705 706 key.objectid = src; 707 key.type = BTRFS_QGROUP_RELATION_KEY; 708 key.offset = dst; 709 710 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 711 if (ret < 0) 712 goto out; 713 714 if (ret > 0) { 715 ret = -ENOENT; 716 goto out; 717 } 718 719 ret = btrfs_del_item(trans, quota_root, path); 720 out: 721 btrfs_free_path(path); 722 return ret; 723 } 724 725 static int add_qgroup_item(struct btrfs_trans_handle *trans, 726 struct btrfs_root *quota_root, u64 qgroupid) 727 { 728 int ret; 729 struct btrfs_path *path; 730 struct btrfs_qgroup_info_item *qgroup_info; 731 struct btrfs_qgroup_limit_item *qgroup_limit; 732 struct extent_buffer *leaf; 733 struct btrfs_key key; 734 735 if (btrfs_is_testing(quota_root->fs_info)) 736 return 0; 737 738 path = btrfs_alloc_path(); 739 if (!path) 740 return -ENOMEM; 741 742 key.objectid = 0; 743 key.type = BTRFS_QGROUP_INFO_KEY; 744 key.offset = qgroupid; 745 746 /* 747 * Avoid a transaction abort by catching -EEXIST here. 
In that 748 * case, we proceed by re-initializing the existing structure 749 * on disk. 750 */ 751 752 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 753 sizeof(*qgroup_info)); 754 if (ret && ret != -EEXIST) 755 goto out; 756 757 leaf = path->nodes[0]; 758 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 759 struct btrfs_qgroup_info_item); 760 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 761 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 762 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 763 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 764 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 765 766 btrfs_mark_buffer_dirty(trans, leaf); 767 768 btrfs_release_path(path); 769 770 key.type = BTRFS_QGROUP_LIMIT_KEY; 771 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 772 sizeof(*qgroup_limit)); 773 if (ret && ret != -EEXIST) 774 goto out; 775 776 leaf = path->nodes[0]; 777 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 778 struct btrfs_qgroup_limit_item); 779 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 780 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 781 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 782 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 783 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 784 785 btrfs_mark_buffer_dirty(trans, leaf); 786 787 ret = 0; 788 out: 789 btrfs_free_path(path); 790 return ret; 791 } 792 793 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 794 { 795 int ret; 796 struct btrfs_root *quota_root = trans->fs_info->quota_root; 797 struct btrfs_path *path; 798 struct btrfs_key key; 799 800 path = btrfs_alloc_path(); 801 if (!path) 802 return -ENOMEM; 803 804 key.objectid = 0; 805 key.type = BTRFS_QGROUP_INFO_KEY; 806 key.offset = qgroupid; 807 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 808 if (ret < 0) 809 goto out; 810 811 if (ret > 0) { 812 ret = -ENOENT; 813 goto out; 814 } 815 816 ret = btrfs_del_item(trans, quota_root, path); 817 if (ret) 818 goto out; 819 820 btrfs_release_path(path); 821 822 key.type = BTRFS_QGROUP_LIMIT_KEY; 823 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 824 if (ret < 0) 825 goto out; 826 827 if (ret > 0) { 828 ret = -ENOENT; 829 goto out; 830 } 831 832 ret = btrfs_del_item(trans, quota_root, path); 833 834 out: 835 btrfs_free_path(path); 836 return ret; 837 } 838 839 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 840 struct btrfs_qgroup *qgroup) 841 { 842 struct btrfs_root *quota_root = trans->fs_info->quota_root; 843 struct btrfs_path *path; 844 struct btrfs_key key; 845 struct extent_buffer *l; 846 struct btrfs_qgroup_limit_item *qgroup_limit; 847 int ret; 848 int slot; 849 850 key.objectid = 0; 851 key.type = BTRFS_QGROUP_LIMIT_KEY; 852 key.offset = qgroup->qgroupid; 853 854 path = btrfs_alloc_path(); 855 if (!path) 856 return -ENOMEM; 857 858 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 859 if (ret > 0) 860 ret = -ENOENT; 861 862 if (ret) 863 goto out; 864 865 l = path->nodes[0]; 866 slot = path->slots[0]; 867 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 868 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 869 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 870 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 871 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 872 btrfs_set_qgroup_limit_rsv_excl(l, 
qgroup_limit, qgroup->rsv_excl); 873 874 btrfs_mark_buffer_dirty(trans, l); 875 876 out: 877 btrfs_free_path(path); 878 return ret; 879 } 880 881 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 882 struct btrfs_qgroup *qgroup) 883 { 884 struct btrfs_fs_info *fs_info = trans->fs_info; 885 struct btrfs_root *quota_root = fs_info->quota_root; 886 struct btrfs_path *path; 887 struct btrfs_key key; 888 struct extent_buffer *l; 889 struct btrfs_qgroup_info_item *qgroup_info; 890 int ret; 891 int slot; 892 893 if (btrfs_is_testing(fs_info)) 894 return 0; 895 896 key.objectid = 0; 897 key.type = BTRFS_QGROUP_INFO_KEY; 898 key.offset = qgroup->qgroupid; 899 900 path = btrfs_alloc_path(); 901 if (!path) 902 return -ENOMEM; 903 904 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 905 if (ret > 0) 906 ret = -ENOENT; 907 908 if (ret) 909 goto out; 910 911 l = path->nodes[0]; 912 slot = path->slots[0]; 913 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 914 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 915 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 916 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 917 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 918 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 919 920 btrfs_mark_buffer_dirty(trans, l); 921 922 out: 923 btrfs_free_path(path); 924 return ret; 925 } 926 927 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 928 { 929 struct btrfs_fs_info *fs_info = trans->fs_info; 930 struct btrfs_root *quota_root = fs_info->quota_root; 931 struct btrfs_path *path; 932 struct btrfs_key key; 933 struct extent_buffer *l; 934 struct btrfs_qgroup_status_item *ptr; 935 int ret; 936 int slot; 937 938 key.objectid = 0; 939 key.type = BTRFS_QGROUP_STATUS_KEY; 940 key.offset = 0; 941 942 path = btrfs_alloc_path(); 943 if (!path) 944 return -ENOMEM; 945 946 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 947 if (ret > 0) 948 ret = -ENOENT; 949 950 if (ret) 951 goto out; 952 953 l = path->nodes[0]; 954 slot = path->slots[0]; 955 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 956 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & 957 BTRFS_QGROUP_STATUS_FLAGS_MASK); 958 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 959 btrfs_set_qgroup_status_rescan(l, ptr, 960 fs_info->qgroup_rescan_progress.objectid); 961 962 btrfs_mark_buffer_dirty(trans, l); 963 964 out: 965 btrfs_free_path(path); 966 return ret; 967 } 968 969 /* 970 * called with qgroup_lock held 971 */ 972 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 973 struct btrfs_root *root) 974 { 975 struct btrfs_path *path; 976 struct btrfs_key key; 977 struct extent_buffer *leaf = NULL; 978 int ret; 979 int nr = 0; 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return -ENOMEM; 984 985 key.objectid = 0; 986 key.offset = 0; 987 key.type = 0; 988 989 while (1) { 990 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 991 if (ret < 0) 992 goto out; 993 leaf = path->nodes[0]; 994 nr = btrfs_header_nritems(leaf); 995 if (!nr) 996 break; 997 /* 998 * delete the leaf one by one 999 * since the whole tree is going 1000 * to be deleted. 
1001 */ 1002 path->slots[0] = 0; 1003 ret = btrfs_del_items(trans, root, path, 0, nr); 1004 if (ret) 1005 goto out; 1006 1007 btrfs_release_path(path); 1008 } 1009 ret = 0; 1010 out: 1011 btrfs_free_path(path); 1012 return ret; 1013 } 1014 1015 int btrfs_quota_enable(struct btrfs_fs_info *fs_info, 1016 struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) 1017 { 1018 struct btrfs_root *quota_root; 1019 struct btrfs_root *tree_root = fs_info->tree_root; 1020 struct btrfs_path *path = NULL; 1021 struct btrfs_qgroup_status_item *ptr; 1022 struct extent_buffer *leaf; 1023 struct btrfs_key key; 1024 struct btrfs_key found_key; 1025 struct btrfs_qgroup *qgroup = NULL; 1026 struct btrfs_qgroup *prealloc = NULL; 1027 struct btrfs_trans_handle *trans = NULL; 1028 struct ulist *ulist = NULL; 1029 const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); 1030 int ret = 0; 1031 int slot; 1032 1033 /* 1034 * We need to have subvol_sem write locked, to prevent races between 1035 * concurrent tasks trying to enable quotas, because we will unlock 1036 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 1037 * and before setting BTRFS_FS_QUOTA_ENABLED. 1038 */ 1039 lockdep_assert_held_write(&fs_info->subvol_sem); 1040 1041 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 1042 btrfs_err(fs_info, 1043 "qgroups are currently unsupported in extent tree v2"); 1044 return -EINVAL; 1045 } 1046 1047 mutex_lock(&fs_info->qgroup_ioctl_lock); 1048 if (fs_info->quota_root) 1049 goto out; 1050 1051 ulist = ulist_alloc(GFP_KERNEL); 1052 if (!ulist) { 1053 ret = -ENOMEM; 1054 goto out; 1055 } 1056 1057 ret = btrfs_sysfs_add_qgroups(fs_info); 1058 if (ret < 0) 1059 goto out; 1060 1061 /* 1062 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 1063 * avoid lock acquisition inversion problems (reported by lockdep) between 1064 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 1065 * start a transaction. 1066 * After we started the transaction lock qgroup_ioctl_lock again and 1067 * check if someone else created the quota root in the meanwhile. If so, 1068 * just return success and release the transaction handle. 1069 * 1070 * Also we don't need to worry about someone else calling 1071 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 1072 * that function returns 0 (success) when the sysfs entries already exist. 1073 */ 1074 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1075 1076 /* 1077 * 1 for quota root item 1078 * 1 for BTRFS_QGROUP_STATUS item 1079 * 1080 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 1081 * per subvolume. However those are not currently reserved since it 1082 * would be a lot of overkill. 
1083 */ 1084 trans = btrfs_start_transaction(tree_root, 2); 1085 1086 mutex_lock(&fs_info->qgroup_ioctl_lock); 1087 if (IS_ERR(trans)) { 1088 ret = PTR_ERR(trans); 1089 trans = NULL; 1090 goto out; 1091 } 1092 1093 if (fs_info->quota_root) 1094 goto out; 1095 1096 fs_info->qgroup_ulist = ulist; 1097 ulist = NULL; 1098 1099 /* 1100 * initially create the quota tree 1101 */ 1102 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1103 if (IS_ERR(quota_root)) { 1104 ret = PTR_ERR(quota_root); 1105 btrfs_abort_transaction(trans, ret); 1106 goto out; 1107 } 1108 1109 path = btrfs_alloc_path(); 1110 if (!path) { 1111 ret = -ENOMEM; 1112 btrfs_abort_transaction(trans, ret); 1113 goto out_free_root; 1114 } 1115 1116 key.objectid = 0; 1117 key.type = BTRFS_QGROUP_STATUS_KEY; 1118 key.offset = 0; 1119 1120 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1121 sizeof(*ptr)); 1122 if (ret) { 1123 btrfs_abort_transaction(trans, ret); 1124 goto out_free_path; 1125 } 1126 1127 leaf = path->nodes[0]; 1128 ptr = btrfs_item_ptr(leaf, path->slots[0], 1129 struct btrfs_qgroup_status_item); 1130 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1131 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1132 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; 1133 if (simple) { 1134 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; 1135 btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); 1136 } else { 1137 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1138 } 1139 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & 1140 BTRFS_QGROUP_STATUS_FLAGS_MASK); 1141 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1142 1143 btrfs_mark_buffer_dirty(trans, leaf); 1144 1145 key.objectid = 0; 1146 key.type = BTRFS_ROOT_REF_KEY; 1147 key.offset = 0; 1148 1149 btrfs_release_path(path); 1150 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1151 if (ret > 0) 1152 goto out_add_root; 1153 if (ret < 0) { 1154 btrfs_abort_transaction(trans, ret); 1155 goto out_free_path; 1156 } 1157 1158 while (1) { 1159 slot = path->slots[0]; 1160 leaf = path->nodes[0]; 1161 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1162 1163 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1164 1165 /* Release locks on tree_root before we access quota_root */ 1166 btrfs_release_path(path); 1167 1168 /* We should not have a stray @prealloc pointer. */ 1169 ASSERT(prealloc == NULL); 1170 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 1171 if (!prealloc) { 1172 ret = -ENOMEM; 1173 btrfs_abort_transaction(trans, ret); 1174 goto out_free_path; 1175 } 1176 1177 ret = add_qgroup_item(trans, quota_root, 1178 found_key.offset); 1179 if (ret) { 1180 btrfs_abort_transaction(trans, ret); 1181 goto out_free_path; 1182 } 1183 1184 qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); 1185 prealloc = NULL; 1186 if (IS_ERR(qgroup)) { 1187 ret = PTR_ERR(qgroup); 1188 btrfs_abort_transaction(trans, ret); 1189 goto out_free_path; 1190 } 1191 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1192 if (ret < 0) { 1193 btrfs_abort_transaction(trans, ret); 1194 goto out_free_path; 1195 } 1196 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1197 path, 1, 0); 1198 if (ret < 0) { 1199 btrfs_abort_transaction(trans, ret); 1200 goto out_free_path; 1201 } 1202 if (ret > 0) { 1203 /* 1204 * Shouldn't happen, but in case it does we 1205 * don't need to do the btrfs_next_item, just 1206 * continue. 
1207 */ 1208 continue; 1209 } 1210 } 1211 ret = btrfs_next_item(tree_root, path); 1212 if (ret < 0) { 1213 btrfs_abort_transaction(trans, ret); 1214 goto out_free_path; 1215 } 1216 if (ret) 1217 break; 1218 } 1219 1220 out_add_root: 1221 btrfs_release_path(path); 1222 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1223 if (ret) { 1224 btrfs_abort_transaction(trans, ret); 1225 goto out_free_path; 1226 } 1227 1228 ASSERT(prealloc == NULL); 1229 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 1230 if (!prealloc) { 1231 ret = -ENOMEM; 1232 goto out_free_path; 1233 } 1234 qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); 1235 prealloc = NULL; 1236 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1237 if (ret < 0) { 1238 btrfs_abort_transaction(trans, ret); 1239 goto out_free_path; 1240 } 1241 1242 fs_info->qgroup_enable_gen = trans->transid; 1243 1244 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1245 /* 1246 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1247 * a deadlock with tasks concurrently doing other qgroup operations, such 1248 * adding/removing qgroups or adding/deleting qgroup relations for example, 1249 * because all qgroup operations first start or join a transaction and then 1250 * lock the qgroup_ioctl_lock mutex. 1251 * We are safe from a concurrent task trying to enable quotas, by calling 1252 * this function, since we are serialized by fs_info->subvol_sem. 1253 */ 1254 ret = btrfs_commit_transaction(trans); 1255 trans = NULL; 1256 mutex_lock(&fs_info->qgroup_ioctl_lock); 1257 if (ret) 1258 goto out_free_path; 1259 1260 /* 1261 * Set quota enabled flag after committing the transaction, to avoid 1262 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1263 * creation. 1264 */ 1265 spin_lock(&fs_info->qgroup_lock); 1266 fs_info->quota_root = quota_root; 1267 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1268 if (simple) 1269 btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); 1270 spin_unlock(&fs_info->qgroup_lock); 1271 1272 /* Skip rescan for simple qgroups. */ 1273 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 1274 goto out_free_path; 1275 1276 ret = qgroup_rescan_init(fs_info, 0, 1); 1277 if (!ret) { 1278 qgroup_rescan_zero_tracking(fs_info); 1279 fs_info->qgroup_rescan_running = true; 1280 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1281 &fs_info->qgroup_rescan_work); 1282 } else { 1283 /* 1284 * We have set both BTRFS_FS_QUOTA_ENABLED and 1285 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with 1286 * -EINPROGRESS. That can happen because someone started the 1287 * rescan worker by calling quota rescan ioctl before we 1288 * attempted to initialize the rescan worker. Failure due to 1289 * quotas disabled in the meanwhile is not possible, because 1290 * we are holding a write lock on fs_info->subvol_sem, which 1291 * is also acquired when disabling quotas. 1292 * Ignore such error, and any other error would need to undo 1293 * everything we did in the transaction we just committed. 
1294 */ 1295 ASSERT(ret == -EINPROGRESS); 1296 ret = 0; 1297 } 1298 1299 out_free_path: 1300 btrfs_free_path(path); 1301 out_free_root: 1302 if (ret) 1303 btrfs_put_root(quota_root); 1304 out: 1305 if (ret) { 1306 ulist_free(fs_info->qgroup_ulist); 1307 fs_info->qgroup_ulist = NULL; 1308 btrfs_sysfs_del_qgroups(fs_info); 1309 } 1310 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1311 if (ret && trans) 1312 btrfs_end_transaction(trans); 1313 else if (trans) 1314 ret = btrfs_end_transaction(trans); 1315 ulist_free(ulist); 1316 kfree(prealloc); 1317 return ret; 1318 } 1319 1320 /* 1321 * It is possible to have outstanding ordered extents which reserved bytes 1322 * before we disabled. We need to fully flush delalloc, ordered extents, and a 1323 * commit to ensure that we don't leak such reservations, only to have them 1324 * come back if we re-enable. 1325 * 1326 * - enable simple quotas 1327 * - reserve space 1328 * - release it, store rsv_bytes in OE 1329 * - disable quotas 1330 * - enable simple quotas (qgroup rsv are all 0) 1331 * - OE finishes 1332 * - run delayed refs 1333 * - free rsv_bytes, resulting in miscounting or even underflow 1334 */ 1335 static int flush_reservations(struct btrfs_fs_info *fs_info) 1336 { 1337 struct btrfs_trans_handle *trans; 1338 int ret; 1339 1340 ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); 1341 if (ret) 1342 return ret; 1343 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 1344 trans = btrfs_join_transaction(fs_info->tree_root); 1345 if (IS_ERR(trans)) 1346 return PTR_ERR(trans); 1347 ret = btrfs_commit_transaction(trans); 1348 1349 return ret; 1350 } 1351 1352 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1353 { 1354 struct btrfs_root *quota_root = NULL; 1355 struct btrfs_trans_handle *trans = NULL; 1356 int ret = 0; 1357 1358 /* 1359 * We need to have subvol_sem write locked to prevent races with 1360 * snapshot creation. 1361 */ 1362 lockdep_assert_held_write(&fs_info->subvol_sem); 1363 1364 /* 1365 * Relocation will mess with backrefs, so make sure we have the 1366 * cleaner_mutex held to protect us from relocate. 1367 */ 1368 lockdep_assert_held(&fs_info->cleaner_mutex); 1369 1370 mutex_lock(&fs_info->qgroup_ioctl_lock); 1371 if (!fs_info->quota_root) 1372 goto out; 1373 1374 /* 1375 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1376 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1377 * to lock that mutex while holding a transaction handle and the rescan 1378 * worker needs to commit a transaction. 1379 */ 1380 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1381 1382 /* 1383 * Request qgroup rescan worker to complete and wait for it. This wait 1384 * must be done before transaction start for quota disable since it may 1385 * deadlock with transaction by the qgroup rescan worker. 1386 */ 1387 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1388 btrfs_qgroup_wait_for_completion(fs_info, false); 1389 1390 /* 1391 * We have nothing held here and no trans handle, just return the error 1392 * if there is one. 1393 */ 1394 ret = flush_reservations(fs_info); 1395 if (ret) 1396 return ret; 1397 1398 /* 1399 * 1 For the root item 1400 * 1401 * We should also reserve enough items for the quota tree deletion in 1402 * btrfs_clean_quota_tree but this is not done. 1403 * 1404 * Also, we must always start a transaction without holding the mutex 1405 * qgroup_ioctl_lock, see btrfs_quota_enable(). 
1406 */ 1407 trans = btrfs_start_transaction(fs_info->tree_root, 1); 1408 1409 mutex_lock(&fs_info->qgroup_ioctl_lock); 1410 if (IS_ERR(trans)) { 1411 ret = PTR_ERR(trans); 1412 trans = NULL; 1413 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1414 goto out; 1415 } 1416 1417 if (!fs_info->quota_root) 1418 goto out; 1419 1420 spin_lock(&fs_info->qgroup_lock); 1421 quota_root = fs_info->quota_root; 1422 fs_info->quota_root = NULL; 1423 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 1424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; 1425 fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; 1426 spin_unlock(&fs_info->qgroup_lock); 1427 1428 btrfs_free_qgroup_config(fs_info); 1429 1430 ret = btrfs_clean_quota_tree(trans, quota_root); 1431 if (ret) { 1432 btrfs_abort_transaction(trans, ret); 1433 goto out; 1434 } 1435 1436 ret = btrfs_del_root(trans, "a_root->root_key); 1437 if (ret) { 1438 btrfs_abort_transaction(trans, ret); 1439 goto out; 1440 } 1441 1442 spin_lock(&fs_info->trans_lock); 1443 list_del("a_root->dirty_list); 1444 spin_unlock(&fs_info->trans_lock); 1445 1446 btrfs_tree_lock(quota_root->node); 1447 btrfs_clear_buffer_dirty(trans, quota_root->node); 1448 btrfs_tree_unlock(quota_root->node); 1449 btrfs_free_tree_block(trans, btrfs_root_id(quota_root), 1450 quota_root->node, 0, 1); 1451 1452 1453 out: 1454 btrfs_put_root(quota_root); 1455 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1456 if (ret && trans) 1457 btrfs_end_transaction(trans); 1458 else if (trans) 1459 ret = btrfs_commit_transaction(trans); 1460 return ret; 1461 } 1462 1463 static void qgroup_dirty(struct btrfs_fs_info *fs_info, 1464 struct btrfs_qgroup *qgroup) 1465 { 1466 if (list_empty(&qgroup->dirty)) 1467 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1468 } 1469 1470 static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) 1471 { 1472 if (!list_empty(&qgroup->iterator)) 1473 return; 1474 1475 list_add_tail(&qgroup->iterator, head); 1476 } 1477 1478 static void qgroup_iterator_clean(struct list_head *head) 1479 { 1480 while (!list_empty(head)) { 1481 struct btrfs_qgroup *qgroup; 1482 1483 qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); 1484 list_del_init(&qgroup->iterator); 1485 } 1486 } 1487 1488 /* 1489 * The easy accounting, we're updating qgroup relationship whose child qgroup 1490 * only has exclusive extents. 1491 * 1492 * In this case, all exclusive extents will also be exclusive for parent, so 1493 * excl/rfer just get added/removed. 1494 * 1495 * So is qgroup reservation space, which should also be added/removed to 1496 * parent. 1497 * Or when child tries to release reservation space, parent will underflow its 1498 * reservation (for relationship adding case). 1499 * 1500 * Caller should hold fs_info->qgroup_lock. 
1501 */ 1502 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, 1503 struct btrfs_qgroup *src, int sign) 1504 { 1505 struct btrfs_qgroup *qgroup; 1506 struct btrfs_qgroup *cur; 1507 LIST_HEAD(qgroup_list); 1508 u64 num_bytes = src->excl; 1509 int ret = 0; 1510 1511 qgroup = find_qgroup_rb(fs_info, ref_root); 1512 if (!qgroup) 1513 goto out; 1514 1515 qgroup_iterator_add(&qgroup_list, qgroup); 1516 list_for_each_entry(cur, &qgroup_list, iterator) { 1517 struct btrfs_qgroup_list *glist; 1518 1519 qgroup->rfer += sign * num_bytes; 1520 qgroup->rfer_cmpr += sign * num_bytes; 1521 1522 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1523 qgroup->excl += sign * num_bytes; 1524 qgroup->excl_cmpr += sign * num_bytes; 1525 1526 if (sign > 0) 1527 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1528 else 1529 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1530 qgroup_dirty(fs_info, qgroup); 1531 1532 /* Append parent qgroups to @qgroup_list. */ 1533 list_for_each_entry(glist, &qgroup->groups, next_group) 1534 qgroup_iterator_add(&qgroup_list, glist->group); 1535 } 1536 ret = 0; 1537 out: 1538 qgroup_iterator_clean(&qgroup_list); 1539 return ret; 1540 } 1541 1542 1543 /* 1544 * Quick path for updating qgroup with only excl refs. 1545 * 1546 * In that case, just update all parent will be enough. 1547 * Or we needs to do a full rescan. 1548 * Caller should also hold fs_info->qgroup_lock. 1549 * 1550 * Return 0 for quick update, return >0 for need to full rescan 1551 * and mark INCONSISTENT flag. 1552 * Return < 0 for other error. 1553 */ 1554 static int quick_update_accounting(struct btrfs_fs_info *fs_info, 1555 u64 src, u64 dst, int sign) 1556 { 1557 struct btrfs_qgroup *qgroup; 1558 int ret = 1; 1559 1560 qgroup = find_qgroup_rb(fs_info, src); 1561 if (!qgroup) 1562 goto out; 1563 if (qgroup->excl == qgroup->rfer) { 1564 ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); 1565 if (ret < 0) 1566 goto out; 1567 ret = 0; 1568 } 1569 out: 1570 if (ret) 1571 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1572 return ret; 1573 } 1574 1575 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) 1576 { 1577 struct btrfs_fs_info *fs_info = trans->fs_info; 1578 struct btrfs_qgroup *parent; 1579 struct btrfs_qgroup *member; 1580 struct btrfs_qgroup_list *list; 1581 struct btrfs_qgroup_list *prealloc = NULL; 1582 int ret = 0; 1583 1584 /* Check the level of src and dst first */ 1585 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1586 return -EINVAL; 1587 1588 mutex_lock(&fs_info->qgroup_ioctl_lock); 1589 if (!fs_info->quota_root) { 1590 ret = -ENOTCONN; 1591 goto out; 1592 } 1593 member = find_qgroup_rb(fs_info, src); 1594 parent = find_qgroup_rb(fs_info, dst); 1595 if (!member || !parent) { 1596 ret = -EINVAL; 1597 goto out; 1598 } 1599 1600 /* check if such qgroup relation exist firstly */ 1601 list_for_each_entry(list, &member->groups, next_group) { 1602 if (list->group == parent) { 1603 ret = -EEXIST; 1604 goto out; 1605 } 1606 } 1607 1608 prealloc = kzalloc(sizeof(*list), GFP_NOFS); 1609 if (!prealloc) { 1610 ret = -ENOMEM; 1611 goto out; 1612 } 1613 ret = add_qgroup_relation_item(trans, src, dst); 1614 if (ret) 1615 goto out; 1616 1617 ret = add_qgroup_relation_item(trans, dst, src); 1618 if (ret) { 1619 del_qgroup_relation_item(trans, src, dst); 1620 goto out; 1621 } 1622 1623 spin_lock(&fs_info->qgroup_lock); 1624 ret = __add_relation_rb(prealloc, member, parent); 1625 prealloc = NULL; 1626 if (ret < 0) { 1627 
spin_unlock(&fs_info->qgroup_lock); 1628 goto out; 1629 } 1630 ret = quick_update_accounting(fs_info, src, dst, 1); 1631 spin_unlock(&fs_info->qgroup_lock); 1632 out: 1633 kfree(prealloc); 1634 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1635 return ret; 1636 } 1637 1638 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1639 u64 dst) 1640 { 1641 struct btrfs_fs_info *fs_info = trans->fs_info; 1642 struct btrfs_qgroup *parent; 1643 struct btrfs_qgroup *member; 1644 struct btrfs_qgroup_list *list; 1645 bool found = false; 1646 int ret = 0; 1647 int ret2; 1648 1649 if (!fs_info->quota_root) { 1650 ret = -ENOTCONN; 1651 goto out; 1652 } 1653 1654 member = find_qgroup_rb(fs_info, src); 1655 parent = find_qgroup_rb(fs_info, dst); 1656 /* 1657 * The parent/member pair doesn't exist, then try to delete the dead 1658 * relation items only. 1659 */ 1660 if (!member || !parent) 1661 goto delete_item; 1662 1663 /* check if such qgroup relation exist firstly */ 1664 list_for_each_entry(list, &member->groups, next_group) { 1665 if (list->group == parent) { 1666 found = true; 1667 break; 1668 } 1669 } 1670 1671 delete_item: 1672 ret = del_qgroup_relation_item(trans, src, dst); 1673 if (ret < 0 && ret != -ENOENT) 1674 goto out; 1675 ret2 = del_qgroup_relation_item(trans, dst, src); 1676 if (ret2 < 0 && ret2 != -ENOENT) 1677 goto out; 1678 1679 /* At least one deletion succeeded, return 0 */ 1680 if (!ret || !ret2) 1681 ret = 0; 1682 1683 if (found) { 1684 spin_lock(&fs_info->qgroup_lock); 1685 del_relation_rb(fs_info, src, dst); 1686 ret = quick_update_accounting(fs_info, src, dst, -1); 1687 spin_unlock(&fs_info->qgroup_lock); 1688 } 1689 out: 1690 return ret; 1691 } 1692 1693 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1694 u64 dst) 1695 { 1696 struct btrfs_fs_info *fs_info = trans->fs_info; 1697 int ret = 0; 1698 1699 mutex_lock(&fs_info->qgroup_ioctl_lock); 1700 ret = __del_qgroup_relation(trans, src, dst); 1701 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1702 1703 return ret; 1704 } 1705 1706 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1707 { 1708 struct btrfs_fs_info *fs_info = trans->fs_info; 1709 struct btrfs_root *quota_root; 1710 struct btrfs_qgroup *qgroup; 1711 struct btrfs_qgroup *prealloc = NULL; 1712 int ret = 0; 1713 1714 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) 1715 return 0; 1716 1717 mutex_lock(&fs_info->qgroup_ioctl_lock); 1718 if (!fs_info->quota_root) { 1719 ret = -ENOTCONN; 1720 goto out; 1721 } 1722 quota_root = fs_info->quota_root; 1723 qgroup = find_qgroup_rb(fs_info, qgroupid); 1724 if (qgroup) { 1725 ret = -EEXIST; 1726 goto out; 1727 } 1728 1729 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 1730 if (!prealloc) { 1731 ret = -ENOMEM; 1732 goto out; 1733 } 1734 1735 ret = add_qgroup_item(trans, quota_root, qgroupid); 1736 if (ret) 1737 goto out; 1738 1739 spin_lock(&fs_info->qgroup_lock); 1740 qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); 1741 spin_unlock(&fs_info->qgroup_lock); 1742 prealloc = NULL; 1743 1744 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1745 out: 1746 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1747 kfree(prealloc); 1748 return ret; 1749 } 1750 1751 static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) 1752 { 1753 return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || 1754 qgroup->excl > 0 || qgroup->excl_cmpr > 0 || 1755 qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || 1756 qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || 1757 
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); 1758 } 1759 1760 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1761 { 1762 struct btrfs_fs_info *fs_info = trans->fs_info; 1763 struct btrfs_qgroup *qgroup; 1764 struct btrfs_qgroup_list *list; 1765 int ret = 0; 1766 1767 mutex_lock(&fs_info->qgroup_ioctl_lock); 1768 if (!fs_info->quota_root) { 1769 ret = -ENOTCONN; 1770 goto out; 1771 } 1772 1773 qgroup = find_qgroup_rb(fs_info, qgroupid); 1774 if (!qgroup) { 1775 ret = -ENOENT; 1776 goto out; 1777 } 1778 1779 if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { 1780 ret = -EBUSY; 1781 goto out; 1782 } 1783 1784 /* Check if there are no children of this qgroup */ 1785 if (!list_empty(&qgroup->members)) { 1786 ret = -EBUSY; 1787 goto out; 1788 } 1789 1790 ret = del_qgroup_item(trans, qgroupid); 1791 if (ret && ret != -ENOENT) 1792 goto out; 1793 1794 while (!list_empty(&qgroup->groups)) { 1795 list = list_first_entry(&qgroup->groups, 1796 struct btrfs_qgroup_list, next_group); 1797 ret = __del_qgroup_relation(trans, qgroupid, 1798 list->group->qgroupid); 1799 if (ret) 1800 goto out; 1801 } 1802 1803 spin_lock(&fs_info->qgroup_lock); 1804 del_qgroup_rb(fs_info, qgroupid); 1805 spin_unlock(&fs_info->qgroup_lock); 1806 1807 /* 1808 * Remove the qgroup from sysfs now without holding the qgroup_lock 1809 * spinlock, since the sysfs_remove_group() function needs to take 1810 * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 1811 */ 1812 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 1813 kfree(qgroup); 1814 out: 1815 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1816 return ret; 1817 } 1818 1819 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, 1820 struct btrfs_qgroup_limit *limit) 1821 { 1822 struct btrfs_fs_info *fs_info = trans->fs_info; 1823 struct btrfs_qgroup *qgroup; 1824 int ret = 0; 1825 /* Sometimes we would want to clear the limit on this qgroup. 1826 * To meet this requirement, we treat the -1 as a special value 1827 * which tell kernel to clear the limit on this qgroup. 
1828 */ 1829 const u64 CLEAR_VALUE = -1; 1830 1831 mutex_lock(&fs_info->qgroup_ioctl_lock); 1832 if (!fs_info->quota_root) { 1833 ret = -ENOTCONN; 1834 goto out; 1835 } 1836 1837 qgroup = find_qgroup_rb(fs_info, qgroupid); 1838 if (!qgroup) { 1839 ret = -ENOENT; 1840 goto out; 1841 } 1842 1843 spin_lock(&fs_info->qgroup_lock); 1844 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1845 if (limit->max_rfer == CLEAR_VALUE) { 1846 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1847 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1848 qgroup->max_rfer = 0; 1849 } else { 1850 qgroup->max_rfer = limit->max_rfer; 1851 } 1852 } 1853 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1854 if (limit->max_excl == CLEAR_VALUE) { 1855 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1856 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1857 qgroup->max_excl = 0; 1858 } else { 1859 qgroup->max_excl = limit->max_excl; 1860 } 1861 } 1862 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1863 if (limit->rsv_rfer == CLEAR_VALUE) { 1864 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1865 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1866 qgroup->rsv_rfer = 0; 1867 } else { 1868 qgroup->rsv_rfer = limit->rsv_rfer; 1869 } 1870 } 1871 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1872 if (limit->rsv_excl == CLEAR_VALUE) { 1873 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1874 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1875 qgroup->rsv_excl = 0; 1876 } else { 1877 qgroup->rsv_excl = limit->rsv_excl; 1878 } 1879 } 1880 qgroup->lim_flags |= limit->flags; 1881 1882 spin_unlock(&fs_info->qgroup_lock); 1883 1884 ret = update_qgroup_limit_item(trans, qgroup); 1885 if (ret) { 1886 qgroup_mark_inconsistent(fs_info); 1887 btrfs_info(fs_info, "unable to update quota limit for %llu", 1888 qgroupid); 1889 } 1890 1891 out: 1892 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1893 return ret; 1894 } 1895 1896 /* 1897 * Inform qgroup to trace one dirty extent, its info is recorded in @record. 1898 * So qgroup can account it at transaction committing time. 1899 * 1900 * No lock version, caller must acquire delayed ref lock and allocated memory, 1901 * then call btrfs_qgroup_trace_extent_post() after exiting lock context. 1902 * 1903 * Return 0 for success insert 1904 * Return >0 for existing record, caller can free @record safely. 
1905 * Error is not possible 1906 */ 1907 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1908 struct btrfs_delayed_ref_root *delayed_refs, 1909 struct btrfs_qgroup_extent_record *record) 1910 { 1911 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1912 struct rb_node *parent_node = NULL; 1913 struct btrfs_qgroup_extent_record *entry; 1914 u64 bytenr = record->bytenr; 1915 1916 if (!btrfs_qgroup_full_accounting(fs_info)) 1917 return 1; 1918 1919 lockdep_assert_held(&delayed_refs->lock); 1920 trace_btrfs_qgroup_trace_extent(fs_info, record); 1921 1922 while (*p) { 1923 parent_node = *p; 1924 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1925 node); 1926 if (bytenr < entry->bytenr) { 1927 p = &(*p)->rb_left; 1928 } else if (bytenr > entry->bytenr) { 1929 p = &(*p)->rb_right; 1930 } else { 1931 if (record->data_rsv && !entry->data_rsv) { 1932 entry->data_rsv = record->data_rsv; 1933 entry->data_rsv_refroot = 1934 record->data_rsv_refroot; 1935 } 1936 return 1; 1937 } 1938 } 1939 1940 rb_link_node(&record->node, parent_node, p); 1941 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1942 return 0; 1943 } 1944 1945 /* 1946 * Post handler after qgroup_trace_extent_nolock(). 1947 * 1948 * NOTE: Current qgroup does the expensive backref walk at transaction 1949 * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming 1950 * new transaction. 1951 * This is designed to allow btrfs_find_all_roots() to get correct new_roots 1952 * result. 1953 * 1954 * However for old_roots there is no need to do backref walk at that time, 1955 * since we search commit roots to walk backref and result will always be 1956 * correct. 1957 * 1958 * Due to the nature of no lock version, we can't do backref there. 1959 * So we must call btrfs_qgroup_trace_extent_post() after exiting 1960 * spinlock context. 1961 * 1962 * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result 1963 * using current root, then we can move all expensive backref walk out of 1964 * transaction committing, but not now as qgroup accounting will be wrong again. 1965 */ 1966 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1967 struct btrfs_qgroup_extent_record *qrecord) 1968 { 1969 struct btrfs_backref_walk_ctx ctx = { 0 }; 1970 int ret; 1971 1972 if (!btrfs_qgroup_full_accounting(trans->fs_info)) 1973 return 0; 1974 /* 1975 * We are always called in a context where we are already holding a 1976 * transaction handle. Often we are called when adding a data delayed 1977 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1978 * in which case we will be holding a write lock on extent buffer from a 1979 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1980 * acquire fs_info->commit_root_sem, because that is a higher level lock 1981 * that must be acquired before locking any extent buffers. 1982 * 1983 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1984 * but we can't pass it a non-NULL transaction handle, because otherwise 1985 * it would not use commit roots and would lock extent buffers, causing 1986 * a deadlock if it ends up trying to read lock the same extent buffer 1987 * that was previously write locked at btrfs_truncate_inode_items(). 1988 * 1989 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1990 * explicitly tell it to not acquire the commit_root_sem - if we are 1991 * holding a transaction handle we don't need its protection. 
1992 */ 1993 ASSERT(trans != NULL); 1994 1995 if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 1996 return 0; 1997 1998 ctx.bytenr = qrecord->bytenr; 1999 ctx.fs_info = trans->fs_info; 2000 2001 ret = btrfs_find_all_roots(&ctx, true); 2002 if (ret < 0) { 2003 qgroup_mark_inconsistent(trans->fs_info); 2004 btrfs_warn(trans->fs_info, 2005 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 2006 ret); 2007 return 0; 2008 } 2009 2010 /* 2011 * Here we don't need to get the lock of 2012 * trans->transaction->delayed_refs, since inserted qrecord won't 2013 * be deleted, only qrecord->node may be modified (new qrecord insert) 2014 * 2015 * So modifying qrecord->old_roots is safe here 2016 */ 2017 qrecord->old_roots = ctx.roots; 2018 return 0; 2019 } 2020 2021 /* 2022 * Inform qgroup to trace one dirty extent, specified by @bytenr and 2023 * @num_bytes. 2024 * So qgroup can account it at commit trans time. 2025 * 2026 * Better encapsulated version, with memory allocation and backref walk for 2027 * commit roots. 2028 * So this can sleep. 2029 * 2030 * Return 0 if the operation is done. 2031 * Return <0 for error, like memory allocation failure or invalid parameter 2032 * (NULL trans) 2033 */ 2034 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2035 u64 num_bytes) 2036 { 2037 struct btrfs_fs_info *fs_info = trans->fs_info; 2038 struct btrfs_qgroup_extent_record *record; 2039 struct btrfs_delayed_ref_root *delayed_refs; 2040 int ret; 2041 2042 if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) 2043 return 0; 2044 record = kzalloc(sizeof(*record), GFP_NOFS); 2045 if (!record) 2046 return -ENOMEM; 2047 2048 delayed_refs = &trans->transaction->delayed_refs; 2049 record->bytenr = bytenr; 2050 record->num_bytes = num_bytes; 2051 record->old_roots = NULL; 2052 2053 spin_lock(&delayed_refs->lock); 2054 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 2055 spin_unlock(&delayed_refs->lock); 2056 if (ret > 0) { 2057 kfree(record); 2058 return 0; 2059 } 2060 return btrfs_qgroup_trace_extent_post(trans, record); 2061 } 2062 2063 /* 2064 * Inform qgroup to trace all leaf items of data 2065 * 2066 * Return 0 for success 2067 * Return <0 for error(ENOMEM) 2068 */ 2069 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 2070 struct extent_buffer *eb) 2071 { 2072 struct btrfs_fs_info *fs_info = trans->fs_info; 2073 int nr = btrfs_header_nritems(eb); 2074 int i, extent_type, ret; 2075 struct btrfs_key key; 2076 struct btrfs_file_extent_item *fi; 2077 u64 bytenr, num_bytes; 2078 2079 /* We can be called directly from walk_up_proc() */ 2080 if (!btrfs_qgroup_full_accounting(fs_info)) 2081 return 0; 2082 2083 for (i = 0; i < nr; i++) { 2084 btrfs_item_key_to_cpu(eb, &key, i); 2085 2086 if (key.type != BTRFS_EXTENT_DATA_KEY) 2087 continue; 2088 2089 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2090 /* filter out non qgroup-accountable extents */ 2091 extent_type = btrfs_file_extent_type(eb, fi); 2092 2093 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 2094 continue; 2095 2096 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 2097 if (!bytenr) 2098 continue; 2099 2100 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 2101 2102 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 2103 if (ret) 2104 return ret; 2105 } 2106 cond_resched(); 2107 return 0; 2108 } 2109 2110 /* 2111 * Walk up the tree from the bottom, freeing leaves and any interior 2112 * nodes 
which have had all slots visited. If a node (leaf or 2113 * interior) is freed, the node above it will have it's slot 2114 * incremented. The root node will never be freed. 2115 * 2116 * At the end of this function, we should have a path which has all 2117 * slots incremented to the next position for a search. If we need to 2118 * read a new node it will be NULL and the node above it will have the 2119 * correct slot selected for a later read. 2120 * 2121 * If we increment the root nodes slot counter past the number of 2122 * elements, 1 is returned to signal completion of the search. 2123 */ 2124 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 2125 { 2126 int level = 0; 2127 int nr, slot; 2128 struct extent_buffer *eb; 2129 2130 if (root_level == 0) 2131 return 1; 2132 2133 while (level <= root_level) { 2134 eb = path->nodes[level]; 2135 nr = btrfs_header_nritems(eb); 2136 path->slots[level]++; 2137 slot = path->slots[level]; 2138 if (slot >= nr || level == 0) { 2139 /* 2140 * Don't free the root - we will detect this 2141 * condition after our loop and return a 2142 * positive value for caller to stop walking the tree. 2143 */ 2144 if (level != root_level) { 2145 btrfs_tree_unlock_rw(eb, path->locks[level]); 2146 path->locks[level] = 0; 2147 2148 free_extent_buffer(eb); 2149 path->nodes[level] = NULL; 2150 path->slots[level] = 0; 2151 } 2152 } else { 2153 /* 2154 * We have a valid slot to walk back down 2155 * from. Stop here so caller can process these 2156 * new nodes. 2157 */ 2158 break; 2159 } 2160 2161 level++; 2162 } 2163 2164 eb = path->nodes[root_level]; 2165 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2166 return 1; 2167 2168 return 0; 2169 } 2170 2171 /* 2172 * Helper function to trace a subtree tree block swap. 2173 * 2174 * The swap will happen in highest tree block, but there may be a lot of 2175 * tree blocks involved. 2176 * 2177 * For example: 2178 * OO = Old tree blocks 2179 * NN = New tree blocks allocated during balance 2180 * 2181 * File tree (257) Reloc tree for 257 2182 * L2 OO NN 2183 * / \ / \ 2184 * L1 OO OO (a) OO NN (a) 2185 * / \ / \ / \ / \ 2186 * L0 OO OO OO OO OO OO NN NN 2187 * (b) (c) (b) (c) 2188 * 2189 * When calling qgroup_trace_extent_swap(), we will pass: 2190 * @src_eb = OO(a) 2191 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 2192 * @dst_level = 0 2193 * @root_level = 1 2194 * 2195 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 2196 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 2197 * 2198 * The main work of qgroup_trace_extent_swap() can be split into 3 parts: 2199 * 2200 * 1) Tree search from @src_eb 2201 * It should acts as a simplified btrfs_search_slot(). 2202 * The key for search can be extracted from @dst_path->nodes[dst_level] 2203 * (first key). 2204 * 2205 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty 2206 * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. 2207 * They should be marked during previous (@dst_level = 1) iteration. 2208 * 2209 * 3) Mark file extents in leaves dirty 2210 * We don't have good way to pick out new file extents only. 2211 * So we still follow the old method by scanning all file extents in 2212 * the leave. 2213 * 2214 * This function can free us from keeping two paths, thus later we only need 2215 * to care about how to iterate all new tree blocks in reloc tree. 
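 *
 * Following the example above with @dst_level = 0: the search copies
 * @dst_path's slots into @src_path level by level, reads OO(c) through its
 * parent OO(a), checks that the keys of OO(c) and NN(c) match, and finally
 * feeds the start addresses of both tree blocks (with nodesize as the
 * length) to btrfs_qgroup_trace_extent().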
2216 */ 2217 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, 2218 struct extent_buffer *src_eb, 2219 struct btrfs_path *dst_path, 2220 int dst_level, int root_level, 2221 bool trace_leaf) 2222 { 2223 struct btrfs_key key; 2224 struct btrfs_path *src_path; 2225 struct btrfs_fs_info *fs_info = trans->fs_info; 2226 u32 nodesize = fs_info->nodesize; 2227 int cur_level = root_level; 2228 int ret; 2229 2230 BUG_ON(dst_level > root_level); 2231 /* Level mismatch */ 2232 if (btrfs_header_level(src_eb) != root_level) 2233 return -EINVAL; 2234 2235 src_path = btrfs_alloc_path(); 2236 if (!src_path) { 2237 ret = -ENOMEM; 2238 goto out; 2239 } 2240 2241 if (dst_level) 2242 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2243 else 2244 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2245 2246 /* For src_path */ 2247 atomic_inc(&src_eb->refs); 2248 src_path->nodes[root_level] = src_eb; 2249 src_path->slots[root_level] = dst_path->slots[root_level]; 2250 src_path->locks[root_level] = 0; 2251 2252 /* A simplified version of btrfs_search_slot() */ 2253 while (cur_level >= dst_level) { 2254 struct btrfs_key src_key; 2255 struct btrfs_key dst_key; 2256 2257 if (src_path->nodes[cur_level] == NULL) { 2258 struct extent_buffer *eb; 2259 int parent_slot; 2260 2261 eb = src_path->nodes[cur_level + 1]; 2262 parent_slot = src_path->slots[cur_level + 1]; 2263 2264 eb = btrfs_read_node_slot(eb, parent_slot); 2265 if (IS_ERR(eb)) { 2266 ret = PTR_ERR(eb); 2267 goto out; 2268 } 2269 2270 src_path->nodes[cur_level] = eb; 2271 2272 btrfs_tree_read_lock(eb); 2273 src_path->locks[cur_level] = BTRFS_READ_LOCK; 2274 } 2275 2276 src_path->slots[cur_level] = dst_path->slots[cur_level]; 2277 if (cur_level) { 2278 btrfs_node_key_to_cpu(dst_path->nodes[cur_level], 2279 &dst_key, dst_path->slots[cur_level]); 2280 btrfs_node_key_to_cpu(src_path->nodes[cur_level], 2281 &src_key, src_path->slots[cur_level]); 2282 } else { 2283 btrfs_item_key_to_cpu(dst_path->nodes[cur_level], 2284 &dst_key, dst_path->slots[cur_level]); 2285 btrfs_item_key_to_cpu(src_path->nodes[cur_level], 2286 &src_key, src_path->slots[cur_level]); 2287 } 2288 /* Content mismatch, something went wrong */ 2289 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { 2290 ret = -ENOENT; 2291 goto out; 2292 } 2293 cur_level--; 2294 } 2295 2296 /* 2297 * Now both @dst_path and @src_path have been populated, record the tree 2298 * blocks for qgroup accounting. 2299 */ 2300 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2301 nodesize); 2302 if (ret < 0) 2303 goto out; 2304 ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, 2305 nodesize); 2306 if (ret < 0) 2307 goto out; 2308 2309 /* Record leaf file extents */ 2310 if (dst_level == 0 && trace_leaf) { 2311 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2312 if (ret < 0) 2313 goto out; 2314 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2315 } 2316 out: 2317 btrfs_free_path(src_path); 2318 return ret; 2319 } 2320 2321 /* 2322 * Helper function to do recursive generation-aware depth-first search, to 2323 * locate all new tree blocks in a subtree of reloc tree. 2324 * 2325 * E.g. 
(OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2326 * reloc tree 2327 * L2 NN (a) 2328 * / \ 2329 * L1 OO NN (b) 2330 * / \ / \ 2331 * L0 OO OO OO NN 2332 * (c) (d) 2333 * If we pass: 2334 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2335 * @cur_level = 1 2336 * @root_level = 1 2337 * 2338 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace 2339 * the above tree blocks along with their counterparts in the file tree. 2340 * During the search, old tree blocks like OO(c) will be skipped, as the tree 2341 * block swap won't affect OO(c). 2342 */ 2343 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans, 2344 struct extent_buffer *src_eb, 2345 struct btrfs_path *dst_path, 2346 int cur_level, int root_level, 2347 u64 last_snapshot, bool trace_leaf) 2348 { 2349 struct btrfs_fs_info *fs_info = trans->fs_info; 2350 struct extent_buffer *eb; 2351 bool need_cleanup = false; 2352 int ret = 0; 2353 int i; 2354 2355 /* Level sanity check */ 2356 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2357 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2358 root_level < cur_level) { 2359 btrfs_err_rl(fs_info, 2360 "%s: bad levels, cur_level=%d root_level=%d", 2361 __func__, cur_level, root_level); 2362 return -EUCLEAN; 2363 } 2364 2365 /* Read the tree block if needed */ 2366 if (dst_path->nodes[cur_level] == NULL) { 2367 int parent_slot; 2368 u64 child_gen; 2369 2370 /* 2371 * dst_path->nodes[root_level] must be initialized before 2372 * calling this function. 2373 */ 2374 if (cur_level == root_level) { 2375 btrfs_err_rl(fs_info, 2376 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2377 __func__, root_level, root_level, cur_level); 2378 return -EUCLEAN; 2379 } 2380 2381 /* 2382 * We need to get child blockptr/gen from parent before we can 2383 * read it.
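 * A child whose generation is older than @last_snapshot was created before
 * the last snapshot and is therefore still shared with the file tree; such
 * blocks are not part of the swap and are skipped below.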
2384 */ 2385 eb = dst_path->nodes[cur_level + 1]; 2386 parent_slot = dst_path->slots[cur_level + 1]; 2387 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2388 2389 /* This node is old, no need to trace */ 2390 if (child_gen < last_snapshot) 2391 goto out; 2392 2393 eb = btrfs_read_node_slot(eb, parent_slot); 2394 if (IS_ERR(eb)) { 2395 ret = PTR_ERR(eb); 2396 goto out; 2397 } 2398 2399 dst_path->nodes[cur_level] = eb; 2400 dst_path->slots[cur_level] = 0; 2401 2402 btrfs_tree_read_lock(eb); 2403 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2404 need_cleanup = true; 2405 } 2406 2407 /* Now record this tree block and its counter part for qgroups */ 2408 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2409 root_level, trace_leaf); 2410 if (ret < 0) 2411 goto cleanup; 2412 2413 eb = dst_path->nodes[cur_level]; 2414 2415 if (cur_level > 0) { 2416 /* Iterate all child tree blocks */ 2417 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2418 /* Skip old tree blocks as they won't be swapped */ 2419 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2420 continue; 2421 dst_path->slots[cur_level] = i; 2422 2423 /* Recursive call (at most 7 times) */ 2424 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2425 dst_path, cur_level - 1, root_level, 2426 last_snapshot, trace_leaf); 2427 if (ret < 0) 2428 goto cleanup; 2429 } 2430 } 2431 2432 cleanup: 2433 if (need_cleanup) { 2434 /* Clean up */ 2435 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2436 dst_path->locks[cur_level]); 2437 free_extent_buffer(dst_path->nodes[cur_level]); 2438 dst_path->nodes[cur_level] = NULL; 2439 dst_path->slots[cur_level] = 0; 2440 dst_path->locks[cur_level] = 0; 2441 } 2442 out: 2443 return ret; 2444 } 2445 2446 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2447 struct extent_buffer *src_eb, 2448 struct extent_buffer *dst_eb, 2449 u64 last_snapshot, bool trace_leaf) 2450 { 2451 struct btrfs_fs_info *fs_info = trans->fs_info; 2452 struct btrfs_path *dst_path = NULL; 2453 int level; 2454 int ret; 2455 2456 if (!btrfs_qgroup_full_accounting(fs_info)) 2457 return 0; 2458 2459 /* Wrong parameter order */ 2460 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2461 btrfs_err_rl(fs_info, 2462 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2463 btrfs_header_generation(src_eb), 2464 btrfs_header_generation(dst_eb)); 2465 return -EUCLEAN; 2466 } 2467 2468 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2469 ret = -EIO; 2470 goto out; 2471 } 2472 2473 level = btrfs_header_level(dst_eb); 2474 dst_path = btrfs_alloc_path(); 2475 if (!dst_path) { 2476 ret = -ENOMEM; 2477 goto out; 2478 } 2479 /* For dst_path */ 2480 atomic_inc(&dst_eb->refs); 2481 dst_path->nodes[level] = dst_eb; 2482 dst_path->slots[level] = 0; 2483 dst_path->locks[level] = 0; 2484 2485 /* Do the generation aware breadth-first search */ 2486 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2487 level, last_snapshot, trace_leaf); 2488 if (ret < 0) 2489 goto out; 2490 ret = 0; 2491 2492 out: 2493 btrfs_free_path(dst_path); 2494 if (ret < 0) 2495 qgroup_mark_inconsistent(fs_info); 2496 return ret; 2497 } 2498 2499 /* 2500 * Inform qgroup to trace a whole subtree, including all its child tree 2501 * blocks and data. 2502 * The root tree block is specified by @root_eb. 2503 * 2504 * Normally used by relocation(tree block swap) and subvolume deletion. 
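 *
 * A sketch of a call site, purely illustrative (the identifiers @next,
 * @path and @level come from a hypothetical walk-down loop, similar to the
 * real caller in extent-tree.c):
 *
 *	ret = btrfs_qgroup_trace_subtree(trans, next,
 *			btrfs_node_ptr_generation(path->nodes[level],
 *						  path->slots[level]),
 *			level - 1);
 *	if (ret)
 *		return ret;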
2505 * 2506 * Return 0 for success 2507 * Return <0 for error(ENOMEM or tree search error) 2508 */ 2509 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2510 struct extent_buffer *root_eb, 2511 u64 root_gen, int root_level) 2512 { 2513 struct btrfs_fs_info *fs_info = trans->fs_info; 2514 int ret = 0; 2515 int level; 2516 u8 drop_subptree_thres; 2517 struct extent_buffer *eb = root_eb; 2518 struct btrfs_path *path = NULL; 2519 2520 ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); 2521 ASSERT(root_eb != NULL); 2522 2523 if (!btrfs_qgroup_full_accounting(fs_info)) 2524 return 0; 2525 2526 spin_lock(&fs_info->qgroup_lock); 2527 drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; 2528 spin_unlock(&fs_info->qgroup_lock); 2529 2530 /* 2531 * This function only gets called for snapshot drop, if we hit a high 2532 * node here, it means we are going to change ownership for quite a lot 2533 * of extents, which will greatly slow down btrfs_commit_transaction(). 2534 * 2535 * So here if we find a high tree here, we just skip the accounting and 2536 * mark qgroup inconsistent. 2537 */ 2538 if (root_level >= drop_subptree_thres) { 2539 qgroup_mark_inconsistent(fs_info); 2540 return 0; 2541 } 2542 2543 if (!extent_buffer_uptodate(root_eb)) { 2544 struct btrfs_tree_parent_check check = { 2545 .has_first_key = false, 2546 .transid = root_gen, 2547 .level = root_level 2548 }; 2549 2550 ret = btrfs_read_extent_buffer(root_eb, &check); 2551 if (ret) 2552 goto out; 2553 } 2554 2555 if (root_level == 0) { 2556 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2557 goto out; 2558 } 2559 2560 path = btrfs_alloc_path(); 2561 if (!path) 2562 return -ENOMEM; 2563 2564 /* 2565 * Walk down the tree. Missing extent blocks are filled in as 2566 * we go. Metadata is accounted every time we read a new 2567 * extent block. 2568 * 2569 * When we reach a leaf, we account for file extent items in it, 2570 * walk back up the tree (adjusting slot pointers as we go) 2571 * and restart the search process. 2572 */ 2573 atomic_inc(&root_eb->refs); /* For path */ 2574 path->nodes[root_level] = root_eb; 2575 path->slots[root_level] = 0; 2576 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2577 walk_down: 2578 level = root_level; 2579 while (level >= 0) { 2580 if (path->nodes[level] == NULL) { 2581 int parent_slot; 2582 u64 child_bytenr; 2583 2584 /* 2585 * We need to get child blockptr from parent before we 2586 * can read it. 
2587 */ 2588 eb = path->nodes[level + 1]; 2589 parent_slot = path->slots[level + 1]; 2590 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2591 2592 eb = btrfs_read_node_slot(eb, parent_slot); 2593 if (IS_ERR(eb)) { 2594 ret = PTR_ERR(eb); 2595 goto out; 2596 } 2597 2598 path->nodes[level] = eb; 2599 path->slots[level] = 0; 2600 2601 btrfs_tree_read_lock(eb); 2602 path->locks[level] = BTRFS_READ_LOCK; 2603 2604 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2605 fs_info->nodesize); 2606 if (ret) 2607 goto out; 2608 } 2609 2610 if (level == 0) { 2611 ret = btrfs_qgroup_trace_leaf_items(trans, 2612 path->nodes[level]); 2613 if (ret) 2614 goto out; 2615 2616 /* Nonzero return here means we completed our search */ 2617 ret = adjust_slots_upwards(path, root_level); 2618 if (ret) 2619 break; 2620 2621 /* Restart search with new slots */ 2622 goto walk_down; 2623 } 2624 2625 level--; 2626 } 2627 2628 ret = 0; 2629 out: 2630 btrfs_free_path(path); 2631 2632 return ret; 2633 } 2634 2635 static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) 2636 { 2637 if (!list_empty(&qgroup->nested_iterator)) 2638 return; 2639 2640 list_add_tail(&qgroup->nested_iterator, head); 2641 } 2642 2643 static void qgroup_iterator_nested_clean(struct list_head *head) 2644 { 2645 while (!list_empty(head)) { 2646 struct btrfs_qgroup *qgroup; 2647 2648 qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); 2649 list_del_init(&qgroup->nested_iterator); 2650 } 2651 } 2652 2653 #define UPDATE_NEW 0 2654 #define UPDATE_OLD 1 2655 /* 2656 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2657 */ 2658 static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2659 struct ulist *roots, struct list_head *qgroups, 2660 u64 seq, int update_old) 2661 { 2662 struct ulist_node *unode; 2663 struct ulist_iterator uiter; 2664 struct btrfs_qgroup *qg; 2665 2666 if (!roots) 2667 return; 2668 ULIST_ITER_INIT(&uiter); 2669 while ((unode = ulist_next(roots, &uiter))) { 2670 LIST_HEAD(tmp); 2671 2672 qg = find_qgroup_rb(fs_info, unode->val); 2673 if (!qg) 2674 continue; 2675 2676 qgroup_iterator_nested_add(qgroups, qg); 2677 qgroup_iterator_add(&tmp, qg); 2678 list_for_each_entry(qg, &tmp, iterator) { 2679 struct btrfs_qgroup_list *glist; 2680 2681 if (update_old) 2682 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2683 else 2684 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2685 2686 list_for_each_entry(glist, &qg->groups, next_group) { 2687 qgroup_iterator_nested_add(qgroups, glist->group); 2688 qgroup_iterator_add(&tmp, glist->group); 2689 } 2690 } 2691 qgroup_iterator_clean(&tmp); 2692 } 2693 } 2694 2695 /* 2696 * Update qgroup rfer/excl counters. 2697 * Rfer update is easy, codes can explain themselves. 2698 * 2699 * Excl update is tricky, the update is split into 2 parts. 2700 * Part 1: Possible exclusive <-> sharing detect: 2701 * | A | !A | 2702 * ------------------------------------- 2703 * B | * | - | 2704 * ------------------------------------- 2705 * !B | + | ** | 2706 * ------------------------------------- 2707 * 2708 * Conditions: 2709 * A: cur_old_roots < nr_old_roots (not exclusive before) 2710 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2711 * B: cur_new_roots < nr_new_roots (not exclusive now) 2712 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2713 * 2714 * Results: 2715 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2716 * *: Definitely not changed. **: Possible unchanged. 
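 *
 * A worked example: an extent of num_bytes is shared by subvolumes A and B,
 * and B drops its reference, so nr_old_roots = 2 and nr_new_roots = 1.
 * For A we get cur_old_count = 1 and cur_new_count = 1: rfer is unchanged,
 * and because cur_old_count < nr_old_roots while cur_new_count ==
 * nr_new_roots the extent went shared -> exclusive, so A's excl grows by
 * num_bytes. For B we get cur_old_count = 1 and cur_new_count = 0: B's rfer
 * shrinks by num_bytes and its excl is untouched, as the extent was never
 * exclusive to B.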
2717 * 2718 * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 2719 * 2720 * To make the logic clear, we first use condition A and B to split 2721 * combination into 4 results. 2722 * 2723 * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 2724 * only on variant maybe 0. 2725 * 2726 * Lastly, check result **, since there are 2 variants maybe 0, split them 2727 * again(2x2). 2728 * But this time we don't need to consider other things, the codes and logic 2729 * is easy to understand now. 2730 */ 2731 static void qgroup_update_counters(struct btrfs_fs_info *fs_info, 2732 struct list_head *qgroups, u64 nr_old_roots, 2733 u64 nr_new_roots, u64 num_bytes, u64 seq) 2734 { 2735 struct btrfs_qgroup *qg; 2736 2737 list_for_each_entry(qg, qgroups, nested_iterator) { 2738 u64 cur_new_count, cur_old_count; 2739 bool dirty = false; 2740 2741 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2742 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2743 2744 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2745 cur_new_count); 2746 2747 /* Rfer update part */ 2748 if (cur_old_count == 0 && cur_new_count > 0) { 2749 qg->rfer += num_bytes; 2750 qg->rfer_cmpr += num_bytes; 2751 dirty = true; 2752 } 2753 if (cur_old_count > 0 && cur_new_count == 0) { 2754 qg->rfer -= num_bytes; 2755 qg->rfer_cmpr -= num_bytes; 2756 dirty = true; 2757 } 2758 2759 /* Excl update part */ 2760 /* Exclusive/none -> shared case */ 2761 if (cur_old_count == nr_old_roots && 2762 cur_new_count < nr_new_roots) { 2763 /* Exclusive -> shared */ 2764 if (cur_old_count != 0) { 2765 qg->excl -= num_bytes; 2766 qg->excl_cmpr -= num_bytes; 2767 dirty = true; 2768 } 2769 } 2770 2771 /* Shared -> exclusive/none case */ 2772 if (cur_old_count < nr_old_roots && 2773 cur_new_count == nr_new_roots) { 2774 /* Shared->exclusive */ 2775 if (cur_new_count != 0) { 2776 qg->excl += num_bytes; 2777 qg->excl_cmpr += num_bytes; 2778 dirty = true; 2779 } 2780 } 2781 2782 /* Exclusive/none -> exclusive/none case */ 2783 if (cur_old_count == nr_old_roots && 2784 cur_new_count == nr_new_roots) { 2785 if (cur_old_count == 0) { 2786 /* None -> exclusive/none */ 2787 2788 if (cur_new_count != 0) { 2789 /* None -> exclusive */ 2790 qg->excl += num_bytes; 2791 qg->excl_cmpr += num_bytes; 2792 dirty = true; 2793 } 2794 /* None -> none, nothing changed */ 2795 } else { 2796 /* Exclusive -> exclusive/none */ 2797 2798 if (cur_new_count == 0) { 2799 /* Exclusive -> none */ 2800 qg->excl -= num_bytes; 2801 qg->excl_cmpr -= num_bytes; 2802 dirty = true; 2803 } 2804 /* Exclusive -> exclusive, nothing changed */ 2805 } 2806 } 2807 2808 if (dirty) 2809 qgroup_dirty(fs_info, qg); 2810 } 2811 } 2812 2813 /* 2814 * Check if the @roots potentially is a list of fs tree roots 2815 * 2816 * Return 0 for definitely not a fs/subvol tree roots ulist 2817 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2818 * one as well) 2819 */ 2820 static int maybe_fs_roots(struct ulist *roots) 2821 { 2822 struct ulist_node *unode; 2823 struct ulist_iterator uiter; 2824 2825 /* Empty one, still possible for fs roots */ 2826 if (!roots || roots->nnodes == 0) 2827 return 1; 2828 2829 ULIST_ITER_INIT(&uiter); 2830 unode = ulist_next(roots, &uiter); 2831 if (!unode) 2832 return 1; 2833 2834 /* 2835 * If it contains fs tree roots, then it must belong to fs/subvol 2836 * trees. 2837 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
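 * Either way the list is homogeneous, so checking the first entry is enough
 * to classify all of @roots.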
2838 */ 2839 return is_fstree(unode->val); 2840 } 2841 2842 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2843 u64 num_bytes, struct ulist *old_roots, 2844 struct ulist *new_roots) 2845 { 2846 struct btrfs_fs_info *fs_info = trans->fs_info; 2847 LIST_HEAD(qgroups); 2848 u64 seq; 2849 u64 nr_new_roots = 0; 2850 u64 nr_old_roots = 0; 2851 int ret = 0; 2852 2853 /* 2854 * If quotas get disabled meanwhile, the resources need to be freed and 2855 * we can't just exit here. 2856 */ 2857 if (!btrfs_qgroup_full_accounting(fs_info) || 2858 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2859 goto out_free; 2860 2861 if (new_roots) { 2862 if (!maybe_fs_roots(new_roots)) 2863 goto out_free; 2864 nr_new_roots = new_roots->nnodes; 2865 } 2866 if (old_roots) { 2867 if (!maybe_fs_roots(old_roots)) 2868 goto out_free; 2869 nr_old_roots = old_roots->nnodes; 2870 } 2871 2872 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2873 if (nr_old_roots == 0 && nr_new_roots == 0) 2874 goto out_free; 2875 2876 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2877 num_bytes, nr_old_roots, nr_new_roots); 2878 2879 mutex_lock(&fs_info->qgroup_rescan_lock); 2880 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2881 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2882 mutex_unlock(&fs_info->qgroup_rescan_lock); 2883 ret = 0; 2884 goto out_free; 2885 } 2886 } 2887 mutex_unlock(&fs_info->qgroup_rescan_lock); 2888 2889 spin_lock(&fs_info->qgroup_lock); 2890 seq = fs_info->qgroup_seq; 2891 2892 /* Update old refcnts using old_roots */ 2893 qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); 2894 2895 /* Update new refcnts using new_roots */ 2896 qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); 2897 2898 qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, 2899 num_bytes, seq); 2900 2901 /* 2902 * We're done using the iterator, release all its qgroups while holding 2903 * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() 2904 * and trigger use-after-free accesses to qgroups. 
2905 */ 2906 qgroup_iterator_nested_clean(&qgroups); 2907 2908 /* 2909 * Bump qgroup_seq to avoid seq overlap 2910 */ 2911 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2912 spin_unlock(&fs_info->qgroup_lock); 2913 out_free: 2914 ulist_free(old_roots); 2915 ulist_free(new_roots); 2916 return ret; 2917 } 2918 2919 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2920 { 2921 struct btrfs_fs_info *fs_info = trans->fs_info; 2922 struct btrfs_qgroup_extent_record *record; 2923 struct btrfs_delayed_ref_root *delayed_refs; 2924 struct ulist *new_roots = NULL; 2925 struct rb_node *node; 2926 u64 num_dirty_extents = 0; 2927 u64 qgroup_to_skip; 2928 int ret = 0; 2929 2930 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 2931 return 0; 2932 2933 delayed_refs = &trans->transaction->delayed_refs; 2934 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2935 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2936 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2937 node); 2938 2939 num_dirty_extents++; 2940 trace_btrfs_qgroup_account_extents(fs_info, record); 2941 2942 if (!ret && !(fs_info->qgroup_flags & 2943 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { 2944 struct btrfs_backref_walk_ctx ctx = { 0 }; 2945 2946 ctx.bytenr = record->bytenr; 2947 ctx.fs_info = fs_info; 2948 2949 /* 2950 * Old roots should be searched when inserting qgroup 2951 * extent record. 2952 * 2953 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, 2954 * we may have some record inserted during 2955 * NO_ACCOUNTING (thus no old_roots populated), but 2956 * later we start rescan, which clears NO_ACCOUNTING, 2957 * leaving some inserted records without old_roots 2958 * populated. 2959 * 2960 * Those cases are rare and should not cause too much 2961 * time spent during commit_transaction(). 2962 */ 2963 if (!record->old_roots) { 2964 /* Search commit root to find old_roots */ 2965 ret = btrfs_find_all_roots(&ctx, false); 2966 if (ret < 0) 2967 goto cleanup; 2968 record->old_roots = ctx.roots; 2969 ctx.roots = NULL; 2970 } 2971 2972 /* 2973 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2974 * which doesn't lock tree or delayed_refs and search 2975 * current root. It's safe inside commit_transaction(). 2976 */ 2977 ctx.trans = trans; 2978 ctx.time_seq = BTRFS_SEQ_LAST; 2979 ret = btrfs_find_all_roots(&ctx, false); 2980 if (ret < 0) 2981 goto cleanup; 2982 new_roots = ctx.roots; 2983 if (qgroup_to_skip) { 2984 ulist_del(new_roots, qgroup_to_skip, 0); 2985 ulist_del(record->old_roots, qgroup_to_skip, 2986 0); 2987 } 2988 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2989 record->num_bytes, 2990 record->old_roots, 2991 new_roots); 2992 record->old_roots = NULL; 2993 new_roots = NULL; 2994 } 2995 /* Free the reserved data space */ 2996 btrfs_qgroup_free_refroot(fs_info, 2997 record->data_rsv_refroot, 2998 record->data_rsv, 2999 BTRFS_QGROUP_RSV_DATA); 3000 cleanup: 3001 ulist_free(record->old_roots); 3002 ulist_free(new_roots); 3003 new_roots = NULL; 3004 rb_erase(node, &delayed_refs->dirty_extent_root); 3005 kfree(record); 3006 3007 } 3008 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 3009 num_dirty_extents); 3010 return ret; 3011 } 3012 3013 /* 3014 * Writes all changed qgroups to disk. 3015 * Called by the transaction commit path and the qgroup assign ioctl. 
3016 */ 3017 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 3018 { 3019 struct btrfs_fs_info *fs_info = trans->fs_info; 3020 int ret = 0; 3021 3022 /* 3023 * In case we are called from the qgroup assign ioctl, assert that we 3024 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota 3025 * disable operation (ioctl) and access a freed quota root. 3026 */ 3027 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) 3028 lockdep_assert_held(&fs_info->qgroup_ioctl_lock); 3029 3030 if (!fs_info->quota_root) 3031 return ret; 3032 3033 spin_lock(&fs_info->qgroup_lock); 3034 while (!list_empty(&fs_info->dirty_qgroups)) { 3035 struct btrfs_qgroup *qgroup; 3036 qgroup = list_first_entry(&fs_info->dirty_qgroups, 3037 struct btrfs_qgroup, dirty); 3038 list_del_init(&qgroup->dirty); 3039 spin_unlock(&fs_info->qgroup_lock); 3040 ret = update_qgroup_info_item(trans, qgroup); 3041 if (ret) 3042 qgroup_mark_inconsistent(fs_info); 3043 ret = update_qgroup_limit_item(trans, qgroup); 3044 if (ret) 3045 qgroup_mark_inconsistent(fs_info); 3046 spin_lock(&fs_info->qgroup_lock); 3047 } 3048 if (btrfs_qgroup_enabled(fs_info)) 3049 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 3050 else 3051 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 3052 spin_unlock(&fs_info->qgroup_lock); 3053 3054 ret = update_qgroup_status_item(trans); 3055 if (ret) 3056 qgroup_mark_inconsistent(fs_info); 3057 3058 return ret; 3059 } 3060 3061 int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, 3062 struct btrfs_qgroup_inherit *inherit, 3063 size_t size) 3064 { 3065 if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) 3066 return -EOPNOTSUPP; 3067 if (size < sizeof(*inherit) || size > PAGE_SIZE) 3068 return -EINVAL; 3069 3070 /* 3071 * In the past we allowed btrfs_qgroup_inherit to specify to copy 3072 * rfer/excl numbers directly from other qgroups. This behavior has 3073 * been disabled in userspace for a very long time, but here we should 3074 * also disable it in kernel, as this behavior is known to mark qgroup 3075 * inconsistent, and a rescan would wipe out the changes anyway. 3076 * 3077 * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. 3078 */ 3079 if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) 3080 return -EINVAL; 3081 3082 if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) 3083 return -EINVAL; 3084 3085 /* 3086 * Skip the inherit source qgroups check if qgroup is not enabled. 3087 * Qgroup can still be later enabled causing problems, but in that case 3088 * btrfs_qgroup_inherit() would just ignore those invalid ones. 3089 */ 3090 if (!btrfs_qgroup_enabled(fs_info)) 3091 return 0; 3092 3093 /* 3094 * Now check all the remaining qgroups, they should all: 3095 * 3096 * - Exist 3097 * - Be higher level qgroups. 
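 *
 * For reference, a minimal valid @inherit (illustrative): to make the new
 * subvolume a member of qgroup 1/100, userspace passes num_qgroups = 1,
 * qgroups[0] = (1ULL << 48) | 100, num_ref_copies = num_excl_copies = 0 and
 * size == struct_size(inherit, qgroups, 1).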
3098 */ 3099 for (int i = 0; i < inherit->num_qgroups; i++) { 3100 struct btrfs_qgroup *qgroup; 3101 u64 qgroupid = inherit->qgroups[i]; 3102 3103 if (btrfs_qgroup_level(qgroupid) == 0) 3104 return -EINVAL; 3105 3106 spin_lock(&fs_info->qgroup_lock); 3107 qgroup = find_qgroup_rb(fs_info, qgroupid); 3108 if (!qgroup) { 3109 spin_unlock(&fs_info->qgroup_lock); 3110 return -ENOENT; 3111 } 3112 spin_unlock(&fs_info->qgroup_lock); 3113 } 3114 return 0; 3115 } 3116 3117 static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, 3118 u64 inode_rootid, 3119 struct btrfs_qgroup_inherit **inherit) 3120 { 3121 int i = 0; 3122 u64 num_qgroups = 0; 3123 struct btrfs_qgroup *inode_qg; 3124 struct btrfs_qgroup_list *qg_list; 3125 struct btrfs_qgroup_inherit *res; 3126 size_t struct_sz; 3127 u64 *qgids; 3128 3129 if (*inherit) 3130 return -EEXIST; 3131 3132 inode_qg = find_qgroup_rb(fs_info, inode_rootid); 3133 if (!inode_qg) 3134 return -ENOENT; 3135 3136 num_qgroups = list_count_nodes(&inode_qg->groups); 3137 3138 if (!num_qgroups) 3139 return 0; 3140 3141 struct_sz = struct_size(res, qgroups, num_qgroups); 3142 if (struct_sz == SIZE_MAX) 3143 return -ERANGE; 3144 3145 res = kzalloc(struct_sz, GFP_NOFS); 3146 if (!res) 3147 return -ENOMEM; 3148 res->num_qgroups = num_qgroups; 3149 qgids = res->qgroups; 3150 3151 list_for_each_entry(qg_list, &inode_qg->groups, next_group) 3152 qgids[i++] = qg_list->group->qgroupid; 3153 3154 *inherit = res; 3155 return 0; 3156 } 3157 3158 /* 3159 * Check if we can skip rescan when inheriting qgroups. If @src has a single 3160 * @parent, and that @parent is owning all its bytes exclusively, we can skip 3161 * the full rescan, by just adding nodesize to the @parent's excl/rfer. 3162 * 3163 * Return <0 for fatal errors (like srcid/parentid has no qgroup). 3164 * Return 0 if a quick inherit is done. 3165 * Return >0 if a quick inherit is not possible, and a full rescan is needed. 3166 */ 3167 static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, 3168 u64 srcid, u64 parentid) 3169 { 3170 struct btrfs_qgroup *src; 3171 struct btrfs_qgroup *parent; 3172 struct btrfs_qgroup_list *list; 3173 int nr_parents = 0; 3174 3175 src = find_qgroup_rb(fs_info, srcid); 3176 if (!src) 3177 return -ENOENT; 3178 parent = find_qgroup_rb(fs_info, parentid); 3179 if (!parent) 3180 return -ENOENT; 3181 3182 /* 3183 * Source has no parent qgroup, but our new qgroup would have one. 3184 * Qgroup numbers would become inconsistent. 3185 */ 3186 if (list_empty(&src->groups)) 3187 return 1; 3188 3189 list_for_each_entry(list, &src->groups, next_group) { 3190 /* The parent is not the same, quick update is not possible. */ 3191 if (list->group->qgroupid != parentid) 3192 return 1; 3193 nr_parents++; 3194 /* 3195 * More than one parent qgroup, we can't be sure about accounting 3196 * consistency. 3197 */ 3198 if (nr_parents > 1) 3199 return 1; 3200 } 3201 3202 /* 3203 * The parent is not exclusively owning all its bytes. We're not sure 3204 * if the source has any bytes not fully owned by the parent. 3205 */ 3206 if (parent->excl != parent->rfer) 3207 return 1; 3208 3209 parent->excl += fs_info->nodesize; 3210 parent->rfer += fs_info->nodesize; 3211 return 0; 3212 } 3213 3214 /* 3215 * Copy the accounting information between qgroups. This is necessary 3216 * when a snapshot or a subvolume is created. Throwing an error will 3217 * cause a transaction abort so we take extra care here to only error 3218 * when a readonly fs is a reasonable outcome. 
3219 */ 3220 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 3221 u64 objectid, u64 inode_rootid, 3222 struct btrfs_qgroup_inherit *inherit) 3223 { 3224 int ret = 0; 3225 int i; 3226 u64 *i_qgroups; 3227 bool committing = false; 3228 struct btrfs_fs_info *fs_info = trans->fs_info; 3229 struct btrfs_root *quota_root; 3230 struct btrfs_qgroup *srcgroup; 3231 struct btrfs_qgroup *dstgroup; 3232 struct btrfs_qgroup *prealloc; 3233 struct btrfs_qgroup_list **qlist_prealloc = NULL; 3234 bool free_inherit = false; 3235 bool need_rescan = false; 3236 u32 level_size = 0; 3237 u64 nums; 3238 3239 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 3240 if (!prealloc) 3241 return -ENOMEM; 3242 3243 /* 3244 * There are only two callers of this function. 3245 * 3246 * One in create_subvol() in the ioctl context, which needs to hold 3247 * the qgroup_ioctl_lock. 3248 * 3249 * The other one in create_pending_snapshot() where no other qgroup 3250 * code can modify the fs as they all need to either start a new trans 3251 * or hold a trans handler, thus we don't need to hold 3252 * qgroup_ioctl_lock. 3253 * This would avoid long and complex lock chain and make lockdep happy. 3254 */ 3255 spin_lock(&fs_info->trans_lock); 3256 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 3257 committing = true; 3258 spin_unlock(&fs_info->trans_lock); 3259 3260 if (!committing) 3261 mutex_lock(&fs_info->qgroup_ioctl_lock); 3262 if (!btrfs_qgroup_enabled(fs_info)) 3263 goto out; 3264 3265 quota_root = fs_info->quota_root; 3266 if (!quota_root) { 3267 ret = -EINVAL; 3268 goto out; 3269 } 3270 3271 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { 3272 ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); 3273 if (ret) 3274 goto out; 3275 free_inherit = true; 3276 } 3277 3278 if (inherit) { 3279 i_qgroups = (u64 *)(inherit + 1); 3280 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 3281 2 * inherit->num_excl_copies; 3282 for (i = 0; i < nums; ++i) { 3283 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 3284 3285 /* 3286 * Zero out invalid groups so we can ignore 3287 * them later. 
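 * (qgroupid >> 48 is the qgroup level; @objectid is the new subvolume's id
 * and thus level 0, so only strictly higher level qgroups survive the check
 * below.)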
3288 */ 3289 if (!srcgroup || 3290 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 3291 *i_qgroups = 0ULL; 3292 3293 ++i_qgroups; 3294 } 3295 } 3296 3297 /* 3298 * create a tracking group for the subvol itself 3299 */ 3300 ret = add_qgroup_item(trans, quota_root, objectid); 3301 if (ret) 3302 goto out; 3303 3304 /* 3305 * add qgroup to all inherited groups 3306 */ 3307 if (inherit) { 3308 i_qgroups = (u64 *)(inherit + 1); 3309 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 3310 if (*i_qgroups == 0) 3311 continue; 3312 ret = add_qgroup_relation_item(trans, objectid, 3313 *i_qgroups); 3314 if (ret && ret != -EEXIST) 3315 goto out; 3316 ret = add_qgroup_relation_item(trans, *i_qgroups, 3317 objectid); 3318 if (ret && ret != -EEXIST) 3319 goto out; 3320 } 3321 ret = 0; 3322 3323 qlist_prealloc = kcalloc(inherit->num_qgroups, 3324 sizeof(struct btrfs_qgroup_list *), 3325 GFP_NOFS); 3326 if (!qlist_prealloc) { 3327 ret = -ENOMEM; 3328 goto out; 3329 } 3330 for (int i = 0; i < inherit->num_qgroups; i++) { 3331 qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), 3332 GFP_NOFS); 3333 if (!qlist_prealloc[i]) { 3334 ret = -ENOMEM; 3335 goto out; 3336 } 3337 } 3338 } 3339 3340 spin_lock(&fs_info->qgroup_lock); 3341 3342 dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); 3343 prealloc = NULL; 3344 3345 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 3346 dstgroup->lim_flags = inherit->lim.flags; 3347 dstgroup->max_rfer = inherit->lim.max_rfer; 3348 dstgroup->max_excl = inherit->lim.max_excl; 3349 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 3350 dstgroup->rsv_excl = inherit->lim.rsv_excl; 3351 3352 qgroup_dirty(fs_info, dstgroup); 3353 } 3354 3355 if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { 3356 srcgroup = find_qgroup_rb(fs_info, srcid); 3357 if (!srcgroup) 3358 goto unlock; 3359 3360 /* 3361 * We call inherit after we clone the root in order to make sure 3362 * our counts don't go crazy, so at this point the only 3363 * difference between the two roots should be the root node. 3364 */ 3365 level_size = fs_info->nodesize; 3366 dstgroup->rfer = srcgroup->rfer; 3367 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 3368 dstgroup->excl = level_size; 3369 dstgroup->excl_cmpr = level_size; 3370 srcgroup->excl = level_size; 3371 srcgroup->excl_cmpr = level_size; 3372 3373 /* inherit the limit info */ 3374 dstgroup->lim_flags = srcgroup->lim_flags; 3375 dstgroup->max_rfer = srcgroup->max_rfer; 3376 dstgroup->max_excl = srcgroup->max_excl; 3377 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 3378 dstgroup->rsv_excl = srcgroup->rsv_excl; 3379 3380 qgroup_dirty(fs_info, dstgroup); 3381 qgroup_dirty(fs_info, srcgroup); 3382 3383 /* 3384 * If the source qgroup has parent but the new one doesn't, 3385 * we need a full rescan. 3386 */ 3387 if (!inherit && !list_empty(&srcgroup->groups)) 3388 need_rescan = true; 3389 } 3390 3391 if (!inherit) 3392 goto unlock; 3393 3394 i_qgroups = (u64 *)(inherit + 1); 3395 for (i = 0; i < inherit->num_qgroups; ++i) { 3396 if (*i_qgroups) { 3397 ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, 3398 *i_qgroups); 3399 qlist_prealloc[i] = NULL; 3400 if (ret) 3401 goto unlock; 3402 } 3403 if (srcid) { 3404 /* Check if we can do a quick inherit. 
*/ 3405 ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); 3406 if (ret < 0) 3407 goto unlock; 3408 if (ret > 0) 3409 need_rescan = true; 3410 ret = 0; 3411 } 3412 ++i_qgroups; 3413 } 3414 3415 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 3416 struct btrfs_qgroup *src; 3417 struct btrfs_qgroup *dst; 3418 3419 if (!i_qgroups[0] || !i_qgroups[1]) 3420 continue; 3421 3422 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3423 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3424 3425 if (!src || !dst) { 3426 ret = -EINVAL; 3427 goto unlock; 3428 } 3429 3430 dst->rfer = src->rfer - level_size; 3431 dst->rfer_cmpr = src->rfer_cmpr - level_size; 3432 3433 /* Manually tweaking numbers certainly needs a rescan */ 3434 need_rescan = true; 3435 } 3436 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 3437 struct btrfs_qgroup *src; 3438 struct btrfs_qgroup *dst; 3439 3440 if (!i_qgroups[0] || !i_qgroups[1]) 3441 continue; 3442 3443 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3444 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3445 3446 if (!src || !dst) { 3447 ret = -EINVAL; 3448 goto unlock; 3449 } 3450 3451 dst->excl = src->excl + level_size; 3452 dst->excl_cmpr = src->excl_cmpr + level_size; 3453 need_rescan = true; 3454 } 3455 3456 unlock: 3457 spin_unlock(&fs_info->qgroup_lock); 3458 if (!ret) 3459 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 3460 out: 3461 if (!committing) 3462 mutex_unlock(&fs_info->qgroup_ioctl_lock); 3463 if (need_rescan) 3464 qgroup_mark_inconsistent(fs_info); 3465 if (qlist_prealloc) { 3466 for (int i = 0; i < inherit->num_qgroups; i++) 3467 kfree(qlist_prealloc[i]); 3468 kfree(qlist_prealloc); 3469 } 3470 if (free_inherit) 3471 kfree(inherit); 3472 kfree(prealloc); 3473 return ret; 3474 } 3475 3476 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3477 { 3478 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3479 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3480 return false; 3481 3482 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3483 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3484 return false; 3485 3486 return true; 3487 } 3488 3489 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3490 enum btrfs_qgroup_rsv_type type) 3491 { 3492 struct btrfs_qgroup *qgroup; 3493 struct btrfs_fs_info *fs_info = root->fs_info; 3494 u64 ref_root = btrfs_root_id(root); 3495 int ret = 0; 3496 LIST_HEAD(qgroup_list); 3497 3498 if (!is_fstree(ref_root)) 3499 return 0; 3500 3501 if (num_bytes == 0) 3502 return 0; 3503 3504 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3505 capable(CAP_SYS_RESOURCE)) 3506 enforce = false; 3507 3508 spin_lock(&fs_info->qgroup_lock); 3509 if (!fs_info->quota_root) 3510 goto out; 3511 3512 qgroup = find_qgroup_rb(fs_info, ref_root); 3513 if (!qgroup) 3514 goto out; 3515 3516 qgroup_iterator_add(&qgroup_list, qgroup); 3517 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3518 struct btrfs_qgroup_list *glist; 3519 3520 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { 3521 ret = -EDQUOT; 3522 goto out; 3523 } 3524 3525 list_for_each_entry(glist, &qgroup->groups, next_group) 3526 qgroup_iterator_add(&qgroup_list, glist->group); 3527 } 3528 3529 ret = 0; 3530 /* 3531 * no limits exceeded, now record the reservation into all qgroups 3532 */ 3533 list_for_each_entry(qgroup, &qgroup_list, iterator) 3534 qgroup_rsv_add(fs_info, qgroup, num_bytes, type); 3535 3536 out: 3537 
qgroup_iterator_clean(&qgroup_list); 3538 spin_unlock(&fs_info->qgroup_lock); 3539 return ret; 3540 } 3541 3542 /* 3543 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3544 * qgroup). 3545 * 3546 * Will handle all higher level qgroup too. 3547 * 3548 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3549 * This special case is only used for META_PERTRANS type. 3550 */ 3551 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3552 u64 ref_root, u64 num_bytes, 3553 enum btrfs_qgroup_rsv_type type) 3554 { 3555 struct btrfs_qgroup *qgroup; 3556 LIST_HEAD(qgroup_list); 3557 3558 if (!is_fstree(ref_root)) 3559 return; 3560 3561 if (num_bytes == 0) 3562 return; 3563 3564 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3565 WARN(1, "%s: Invalid type to free", __func__); 3566 return; 3567 } 3568 spin_lock(&fs_info->qgroup_lock); 3569 3570 if (!fs_info->quota_root) 3571 goto out; 3572 3573 qgroup = find_qgroup_rb(fs_info, ref_root); 3574 if (!qgroup) 3575 goto out; 3576 3577 if (num_bytes == (u64)-1) 3578 /* 3579 * We're freeing all pertrans rsv, get reserved value from 3580 * level 0 qgroup as real num_bytes to free. 3581 */ 3582 num_bytes = qgroup->rsv.values[type]; 3583 3584 qgroup_iterator_add(&qgroup_list, qgroup); 3585 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3586 struct btrfs_qgroup_list *glist; 3587 3588 qgroup_rsv_release(fs_info, qgroup, num_bytes, type); 3589 list_for_each_entry(glist, &qgroup->groups, next_group) { 3590 qgroup_iterator_add(&qgroup_list, glist->group); 3591 } 3592 } 3593 out: 3594 qgroup_iterator_clean(&qgroup_list); 3595 spin_unlock(&fs_info->qgroup_lock); 3596 } 3597 3598 /* 3599 * Check if the leaf is the last leaf. Which means all node pointers 3600 * are at their last position. 3601 */ 3602 static bool is_last_leaf(struct btrfs_path *path) 3603 { 3604 int i; 3605 3606 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3607 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3608 return false; 3609 } 3610 return true; 3611 } 3612 3613 /* 3614 * returns < 0 on error, 0 when more leafs are to be scanned. 3615 * returns 1 when done. 3616 */ 3617 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3618 struct btrfs_path *path) 3619 { 3620 struct btrfs_fs_info *fs_info = trans->fs_info; 3621 struct btrfs_root *extent_root; 3622 struct btrfs_key found; 3623 struct extent_buffer *scratch_leaf = NULL; 3624 u64 num_bytes; 3625 bool done; 3626 int slot; 3627 int ret; 3628 3629 if (!btrfs_qgroup_full_accounting(fs_info)) 3630 return 1; 3631 3632 mutex_lock(&fs_info->qgroup_rescan_lock); 3633 extent_root = btrfs_extent_root(fs_info, 3634 fs_info->qgroup_rescan_progress.objectid); 3635 ret = btrfs_search_slot_for_read(extent_root, 3636 &fs_info->qgroup_rescan_progress, 3637 path, 1, 0); 3638 3639 btrfs_debug(fs_info, 3640 "current progress key (%llu %u %llu), search_slot ret %d", 3641 fs_info->qgroup_rescan_progress.objectid, 3642 fs_info->qgroup_rescan_progress.type, 3643 fs_info->qgroup_rescan_progress.offset, ret); 3644 3645 if (ret) { 3646 /* 3647 * The rescan is about to end, we will not be scanning any 3648 * further blocks. We cannot unset the RESCAN flag here, because 3649 * we want to commit the transaction if everything went well. 3650 * To make the live accounting work in this phase, we set our 3651 * scan progress pointer such that every real extent objectid 3652 * will be smaller. 
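 * (btrfs_qgroup_account_extent() skips extents whose bytenr is at or above
 * the progress pointer, so with the pointer at (u64)-1 nothing is skipped
 * anymore.)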
3653 */ 3654 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3655 btrfs_release_path(path); 3656 mutex_unlock(&fs_info->qgroup_rescan_lock); 3657 return ret; 3658 } 3659 done = is_last_leaf(path); 3660 3661 btrfs_item_key_to_cpu(path->nodes[0], &found, 3662 btrfs_header_nritems(path->nodes[0]) - 1); 3663 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3664 3665 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3666 if (!scratch_leaf) { 3667 ret = -ENOMEM; 3668 mutex_unlock(&fs_info->qgroup_rescan_lock); 3669 goto out; 3670 } 3671 slot = path->slots[0]; 3672 btrfs_release_path(path); 3673 mutex_unlock(&fs_info->qgroup_rescan_lock); 3674 3675 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3676 struct btrfs_backref_walk_ctx ctx = { 0 }; 3677 3678 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3679 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3680 found.type != BTRFS_METADATA_ITEM_KEY) 3681 continue; 3682 if (found.type == BTRFS_METADATA_ITEM_KEY) 3683 num_bytes = fs_info->nodesize; 3684 else 3685 num_bytes = found.offset; 3686 3687 ctx.bytenr = found.objectid; 3688 ctx.fs_info = fs_info; 3689 3690 ret = btrfs_find_all_roots(&ctx, false); 3691 if (ret < 0) 3692 goto out; 3693 /* For rescan, just pass old_roots as NULL */ 3694 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3695 num_bytes, NULL, ctx.roots); 3696 if (ret < 0) 3697 goto out; 3698 } 3699 out: 3700 if (scratch_leaf) 3701 free_extent_buffer(scratch_leaf); 3702 3703 if (done && !ret) { 3704 ret = 1; 3705 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3706 } 3707 return ret; 3708 } 3709 3710 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3711 { 3712 if (btrfs_fs_closing(fs_info)) 3713 return true; 3714 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 3715 return true; 3716 if (!btrfs_qgroup_enabled(fs_info)) 3717 return true; 3718 if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3719 return true; 3720 return false; 3721 } 3722 3723 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3724 { 3725 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3726 qgroup_rescan_work); 3727 struct btrfs_path *path; 3728 struct btrfs_trans_handle *trans = NULL; 3729 int ret = 0; 3730 bool stopped = false; 3731 bool did_leaf_rescans = false; 3732 3733 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 3734 return; 3735 3736 path = btrfs_alloc_path(); 3737 if (!path) { 3738 ret = -ENOMEM; 3739 goto out; 3740 } 3741 /* 3742 * Rescan should only search for commit root, and any later difference 3743 * should be recorded by qgroup 3744 */ 3745 path->search_commit_root = 1; 3746 path->skip_locking = 1; 3747 3748 while (!ret && !(stopped = rescan_should_stop(fs_info))) { 3749 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3750 if (IS_ERR(trans)) { 3751 ret = PTR_ERR(trans); 3752 break; 3753 } 3754 3755 ret = qgroup_rescan_leaf(trans, path); 3756 did_leaf_rescans = true; 3757 3758 if (ret > 0) 3759 btrfs_commit_transaction(trans); 3760 else 3761 btrfs_end_transaction(trans); 3762 } 3763 3764 out: 3765 btrfs_free_path(path); 3766 3767 mutex_lock(&fs_info->qgroup_rescan_lock); 3768 if (ret > 0 && 3769 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3770 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3771 } else if (ret < 0 || stopped) { 3772 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3773 } 3774 mutex_unlock(&fs_info->qgroup_rescan_lock); 3775 3776 /* 3777 
* Only update status, since the previous part has already updated the 3778 * qgroup info, and only if we did any actual work. This also prevents 3779 * race with a concurrent quota disable, which has already set 3780 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at 3781 * btrfs_quota_disable(). 3782 */ 3783 if (did_leaf_rescans) { 3784 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3785 if (IS_ERR(trans)) { 3786 ret = PTR_ERR(trans); 3787 trans = NULL; 3788 btrfs_err(fs_info, 3789 "fail to start transaction for status update: %d", 3790 ret); 3791 } 3792 } else { 3793 trans = NULL; 3794 } 3795 3796 mutex_lock(&fs_info->qgroup_rescan_lock); 3797 if (!stopped || 3798 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3799 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3800 if (trans) { 3801 int ret2 = update_qgroup_status_item(trans); 3802 3803 if (ret2 < 0) { 3804 ret = ret2; 3805 btrfs_err(fs_info, "fail to update qgroup status: %d", ret); 3806 } 3807 } 3808 fs_info->qgroup_rescan_running = false; 3809 fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3810 complete_all(&fs_info->qgroup_rescan_completion); 3811 mutex_unlock(&fs_info->qgroup_rescan_lock); 3812 3813 if (!trans) 3814 return; 3815 3816 btrfs_end_transaction(trans); 3817 3818 if (stopped) { 3819 btrfs_info(fs_info, "qgroup scan paused"); 3820 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { 3821 btrfs_info(fs_info, "qgroup scan cancelled"); 3822 } else if (ret >= 0) { 3823 btrfs_info(fs_info, "qgroup scan completed%s", 3824 ret > 0 ? " (inconsistency flag cleared)" : ""); 3825 } else { 3826 btrfs_err(fs_info, "qgroup scan failed with %d", ret); 3827 } 3828 } 3829 3830 /* 3831 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3832 * memory required for the rescan context. 
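 *
 * @init_flags == 0 means we are resuming a rescan at mount time, so the
 * RESCAN status flag must already be set; a non-zero value starts a fresh
 * rescan and sets the flag itself.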
3833 */ 3834 static int 3835 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3836 int init_flags) 3837 { 3838 int ret = 0; 3839 3840 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3841 btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); 3842 return -EINVAL; 3843 } 3844 3845 if (!init_flags) { 3846 /* we're resuming qgroup rescan at mount time */ 3847 if (!(fs_info->qgroup_flags & 3848 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3849 btrfs_debug(fs_info, 3850 "qgroup rescan init failed, qgroup rescan is not queued"); 3851 ret = -EINVAL; 3852 } else if (!(fs_info->qgroup_flags & 3853 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3854 btrfs_debug(fs_info, 3855 "qgroup rescan init failed, qgroup is not enabled"); 3856 ret = -ENOTCONN; 3857 } 3858 3859 if (ret) 3860 return ret; 3861 } 3862 3863 mutex_lock(&fs_info->qgroup_rescan_lock); 3864 3865 if (init_flags) { 3866 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3867 ret = -EINPROGRESS; 3868 } else if (!(fs_info->qgroup_flags & 3869 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3870 btrfs_debug(fs_info, 3871 "qgroup rescan init failed, qgroup is not enabled"); 3872 ret = -ENOTCONN; 3873 } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { 3874 /* Quota disable is in progress */ 3875 ret = -EBUSY; 3876 } 3877 3878 if (ret) { 3879 mutex_unlock(&fs_info->qgroup_rescan_lock); 3880 return ret; 3881 } 3882 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3883 } 3884 3885 memset(&fs_info->qgroup_rescan_progress, 0, 3886 sizeof(fs_info->qgroup_rescan_progress)); 3887 fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 3888 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 3889 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3890 init_completion(&fs_info->qgroup_rescan_completion); 3891 mutex_unlock(&fs_info->qgroup_rescan_lock); 3892 3893 btrfs_init_work(&fs_info->qgroup_rescan_work, 3894 btrfs_qgroup_rescan_worker, NULL); 3895 return 0; 3896 } 3897 3898 static void 3899 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3900 { 3901 struct rb_node *n; 3902 struct btrfs_qgroup *qgroup; 3903 3904 spin_lock(&fs_info->qgroup_lock); 3905 /* clear all current qgroup tracking information */ 3906 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3907 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3908 qgroup->rfer = 0; 3909 qgroup->rfer_cmpr = 0; 3910 qgroup->excl = 0; 3911 qgroup->excl_cmpr = 0; 3912 qgroup_dirty(fs_info, qgroup); 3913 } 3914 spin_unlock(&fs_info->qgroup_lock); 3915 } 3916 3917 int 3918 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3919 { 3920 int ret = 0; 3921 struct btrfs_trans_handle *trans; 3922 3923 ret = qgroup_rescan_init(fs_info, 0, 1); 3924 if (ret) 3925 return ret; 3926 3927 /* 3928 * We have set the rescan_progress to 0, which means no more 3929 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3930 * However, btrfs_qgroup_account_ref may be right after its call 3931 * to btrfs_find_all_roots, in which case it would still do the 3932 * accounting. 3933 * To solve this, we're committing the transaction, which will 3934 * ensure we run all delayed refs and only after that, we are 3935 * going to clear all tracking information for a clean start. 
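 *
 * Note that btrfs_attach_transaction_barrier() returning -ENOENT only means
 * there is no transaction currently running, so there is nothing to commit
 * and we can proceed straight to zeroing the tracking information.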
	 */

	trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
	if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		return PTR_ERR(trans);
	} else if (trans != ERR_PTR(-ENOENT)) {
		ret = btrfs_commit_transaction(trans);
		if (ret) {
			fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
			return ret;
		}
	}

	qgroup_rescan_zero_tracking(fs_info);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	fs_info->qgroup_rescan_running = true;
	btrfs_queue_work(fs_info->qgroup_rescan_workers,
			 &fs_info->qgroup_rescan_work);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	return 0;
}

int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible)
{
	int running;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	running = fs_info->qgroup_rescan_running;
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!running)
		return 0;

	if (interruptible)
		ret = wait_for_completion_interruptible(
					&fs_info->qgroup_rescan_completion);
	else
		wait_for_completion(&fs_info->qgroup_rescan_completion);

	return ret;
}

/*
 * This is only called from open_ctree() where we're still single threaded,
 * thus locking is omitted here.
 */
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		mutex_lock(&fs_info->qgroup_rescan_lock);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
		mutex_unlock(&fs_info->qgroup_rescan_lock);
	}
}

#define rbtree_iterate_from_safe(node, next, start)			\
	for (node = start; node && ({ next = rb_next(node); 1; }); node = next)

static int qgroup_unreserve_range(struct btrfs_inode *inode,
				  struct extent_changeset *reserved, u64 start,
				  u64 len)
{
	struct rb_node *node;
	struct rb_node *next;
	struct ulist_node *entry;
	int ret = 0;

	node = reserved->range_changed.root.rb_node;
	if (!node)
		return 0;
	while (node) {
		entry = rb_entry(node, struct ulist_node, rb_node);
		if (entry->val < start)
			node = node->rb_right;
		else
			node = node->rb_left;
	}

	if (entry->val > start && rb_prev(&entry->rb_node))
		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
				 rb_node);

	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
		u64 entry_start;
		u64 entry_end;
		u64 entry_len;
		int clear_ret;

		entry = rb_entry(node, struct ulist_node, rb_node);
		entry_start = entry->val;
		entry_end = entry->aux;
		entry_len = entry_end - entry_start + 1;

		if (entry_start >= start + len)
			break;
		if (entry_start + entry_len <= start)
			continue;
		/*
		 * Now the entry is in [start, start + len), revert the
		 * EXTENT_QGROUP_RESERVED bit.
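		 *
		 * Keep going even if clearing one range fails: the first
		 * error is kept in @ret, but we still try to unreserve the
		 * remaining ranges so that bytes_changed stays as accurate
		 * as possible.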
		 */
		clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
					      entry_end, EXTENT_QGROUP_RESERVED);
		if (!ret && clear_ret < 0)
			ret = clear_ret;

		ulist_del(&reserved->range_changed, entry->val, entry->aux);
		if (likely(reserved->bytes_changed >= entry_len)) {
			reserved->bytes_changed -= entry_len;
		} else {
			WARN_ON(1);
			reserved->bytes_changed = 0;
		}
	}

	return ret;
}

/*
 * Try to free some space for qgroup.
 *
 * For qgroup, there are only 3 ways to free qgroup space:
 * - Flush nodatacow write
 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
 *   In theory, we should only flush nodatacow inodes, but it's not yet
 *   possible, so we need to flush the whole root.
 *
 * - Wait for ordered extents
 *   When ordered extents are finished, their reserved metadata is finally
 *   converted to pertrans status, which can be freed by a later transaction
 *   commit.
 *
 * - Commit transaction
 *   This would free the meta_pertrans space.
 *   In theory this shouldn't provide much space, but by the time we get here
 *   any extra qgroup space helps.
 */
static int try_flush_qgroup(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* Can't hold an open transaction or we run the risk of deadlocking. */
	ASSERT(current->journal_info == NULL);
	if (WARN_ON(current->journal_info))
		return 0;

	/*
	 * We don't want to run flush again and again, so if there is a running
	 * one, we won't try to start a new flush, but exit directly.
	 */
	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
		wait_event(root->qgroup_flush_wait,
			   !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
		return 0;
	}

	ret = btrfs_start_delalloc_snapshot(root, true);
	if (ret < 0)
		goto out;
	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

	trans = btrfs_attach_transaction_barrier(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		if (ret == -ENOENT)
			ret = 0;
		goto out;
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
	wake_up(&root->qgroup_flush_wait);
	return ret;
}

static int qgroup_reserve_data(struct btrfs_inode *inode,
			       struct extent_changeset **reserved_ret, u64 start,
			       u64 len)
{
	struct btrfs_root *root = inode->root;
	struct extent_changeset *reserved;
	bool new_reserved = false;
	u64 orig_reserved;
	u64 to_reserve;
	int ret;

	if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)) || len == 0)
		return 0;

	/* @reserved parameter is mandatory for qgroup */
	if (WARN_ON(!reserved_ret))
		return -EINVAL;
	if (!*reserved_ret) {
		new_reserved = true;
		*reserved_ret = extent_changeset_alloc();
		if (!*reserved_ret)
			return -ENOMEM;
	}
	reserved = *reserved_ret;
	/* Record already reserved space */
	orig_reserved = reserved->bytes_changed;
	ret = set_record_extent_bits(&inode->io_tree, start,
				     start + len - 1, EXTENT_QGROUP_RESERVED, reserved);

	/* Newly reserved space */
	to_reserve = reserved->bytes_changed - orig_reserved;
	trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
					to_reserve, QGROUP_RESERVE);
	if (ret < 0)
		goto out;
	ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
	if (ret < 0)
		goto cleanup;

	return ret;

cleanup:
	qgroup_unreserve_range(inode, reserved, start, len);
out:
	if (new_reserved) {
		extent_changeset_free(reserved);
		*reserved_ret = NULL;
	}
	return ret;
}

/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do nothing
 * if the range is already reserved.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing and
 * transaction commit. So the caller should not hold any dirty page locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
			      struct extent_changeset **reserved_ret, u64 start,
			      u64 len)
{
	int ret;

	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
	if (ret <= 0 && ret != -EDQUOT)
		return ret;

	ret = try_flush_qgroup(inode->root);
	if (ret < 0)
		return ret;
	return qgroup_reserve_data(inode, reserved_ret, start, len);
}

/* Free ranges specified by @reserved, normally in the error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
				     struct extent_changeset *reserved,
				     u64 start, u64 len, u64 *freed_ret)
{
	struct btrfs_root *root = inode->root;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct extent_changeset changeset;
	u64 freed = 0;
	int ret;

	extent_changeset_init(&changeset);
	len = round_up(start + len, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
		u64 range_start = unode->val;
		/* unode->aux is the inclusive end */
		u64 range_len = unode->aux - range_start + 1;
		u64 free_start;
		u64 free_len;

		extent_changeset_release(&changeset);

		/* Only free ranges within [start, start + len) */
		if (range_start >= start + len ||
		    range_start + range_len <= start)
			continue;
		free_start = max(range_start, start);
		free_len = min(start + len, range_start + range_len) -
			   free_start;
		/*
		 * TODO: Also modify reserved->ranges_reserved to reflect
		 * the modification.
		 *
		 * However, as long as we free qgroup reserved space according
		 * to EXTENT_QGROUP_RESERVED, we won't double free.
		 * So there is no need to rush.
		 */
		ret = clear_record_extent_bits(&inode->io_tree, free_start,
					       free_start + free_len - 1,
					       EXTENT_QGROUP_RESERVED, &changeset);
		if (ret < 0)
			goto out;
		freed += changeset.bytes_changed;
	}
	btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
				  BTRFS_QGROUP_RSV_DATA);
	if (freed_ret)
		*freed_ret = freed;
	ret = 0;
out:
	extent_changeset_release(&changeset);
	return ret;
}

static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			u64 *released, int free)
{
	struct extent_changeset changeset;
	int trace_op = QGROUP_RELEASE;
	int ret;

	if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
		extent_changeset_init(&changeset);
		return clear_record_extent_bits(&inode->io_tree, start,
						start + len - 1,
						EXTENT_QGROUP_RESERVED, &changeset);
	}

	/* In the release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len, released);
	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
				       EXTENT_QGROUP_RESERVED, &changeset);
	if (ret < 0)
		goto out;

	if (free)
		trace_op = QGROUP_FREE;
	trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
					changeset.bytes_changed, trace_op);
	if (free)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	if (released)
		*released = changeset.bytes_changed;
out:
	extent_changeset_release(&changeset);
	return ret;
}

/*
 * Free a reserved space range from io_tree and related qgroups.
 *
 * Should be called when a range of pages gets invalidated before reaching
 * disk, or for the error cleanup case.
 * If @reserved is given, only the reserved range in [@start, @start + @len)
 * will be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
			   struct extent_changeset *reserved,
			   u64 start, u64 len, u64 *freed)
{
	return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}

/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages gets written to disk and the
 * corresponding FILE_EXTENT item is inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework only updates qgroup numbers at
 * commit_transaction() time, its reserved space shouldn't be freed from
 * related qgroups.
 *
 * But we should release the range from the io_tree, to allow further writes
 * to be COWed.
 *
 * NOTE: This function may sleep for memory allocation.
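 *
 * In short (as implemented by __btrfs_qgroup_release_data() above):
 * btrfs_qgroup_free_data() clears EXTENT_QGROUP_RESERVED *and* returns the
 * bytes to the qgroup counters, while btrfs_qgroup_release_data() only clears
 * the bit and reports how many bytes were released via @released, leaving the
 * qgroup accounting to the transaction commit.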
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}

static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return;
	if (num_bytes == 0)
		return;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		root->qgroup_meta_rsv_prealloc += num_bytes;
	else
		root->qgroup_meta_rsv_pertrans += num_bytes;
	spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			     enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return 0;
	if (num_bytes == 0)
		return 0;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
				  num_bytes);
		root->qgroup_meta_rsv_prealloc -= num_bytes;
	} else {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
				  num_bytes);
		root->qgroup_meta_rsv_pertrans -= num_bytes;
	}
	spin_unlock(&root->qgroup_meta_rsv_lock);
	return num_bytes;
}

int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret < 0)
		return ret;
	/*
	 * Record what we have reserved into the root.
	 *
	 * This is to avoid underflow across a quota disable -> enable cycle.
	 * In that case we may try to free space we haven't reserved (because
	 * quota was disabled at reservation time), so record what we reserved
	 * into the root and ensure a later release won't underflow this
	 * number.
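	 *
	 * For example (illustrative numbers only): if 256K of metadata was
	 * reserved while quota was disabled and quota is enabled afterwards,
	 * a later __btrfs_qgroup_free_meta() of those 256K would otherwise
	 * underflow the qgroup counters; sub_root_meta_rsv() caps the freed
	 * amount at what was recorded here.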
	 */
	add_root_meta_rsv(root, num_bytes, type);
	return ret;
}

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}

/*
 * Per-transaction meta reservations should all be freed at transaction commit
 * time.
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;

	/* TODO: Update the trace point to handle such a free */
	trace_qgroup_meta_free_all_pertrans(root);
	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;

	/*
	 * A reservation for META_PREALLOC can happen before quota is enabled,
	 * which can lead to underflow.
	 * Here we ensure we only free what we really have reserved.
	 */
	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}

static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (num_bytes == 0)
		return;
	if (!fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		if (!sb_rdonly(fs_info->sb))
			qgroup_rsv_add(fs_info, qgroup, num_bytes,
				       BTRFS_QGROUP_RSV_META_PERTRANS);

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
 *
 * This is called when a preallocated meta reservation needs to be used,
 * normally right after a btrfs_join_transaction() call.
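 *
 * A simplified sketch of the life cycle this fits into (illustration only,
 * not an exhaustive description of every caller):
 *
 *	btrfs_qgroup_reserve_meta(root, nbytes,
 *				  BTRFS_QGROUP_RSV_META_PREALLOC, true);
 *	...
 *	trans = btrfs_join_transaction(root);
 *	btrfs_qgroup_convert_reserved_meta(root, nbytes);
 *	...
 *	(at commit time the pertrans bytes are dropped again via
 *	 btrfs_qgroup_free_meta_all_pertrans())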
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !is_fstree(btrfs_root_id(root)))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
	if (!sb_rdonly(fs_info->sb))
		add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}

/*
 * Check for leaked qgroup reserved space, normally at inode destruction time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
				       EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				   btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);

	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}

/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing a transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}

/*
 * Add a record of the swapped subtree roots into @subvol_root.
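 *
 * This is the core of the delayed subtree tracing used during relocation:
 * instead of tracing the whole subtree when relocation swaps a subvolume tree
 * block with a reloc tree block, we only record the swapped roots here and do
 * the real tracing later, at COW time, in
 * btrfs_qgroup_trace_subtree_after_cow().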
4591 * 4592 * @subvol_root: tree root of the subvolume tree get swapped 4593 * @bg: block group under balance 4594 * @subvol_parent/slot: pointer to the subtree root in subvolume tree 4595 * @reloc_parent/slot: pointer to the subtree root in reloc tree 4596 * BOTH POINTERS ARE BEFORE TREE SWAP 4597 * @last_snapshot: last snapshot generation of the subvolume tree 4598 */ 4599 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, 4600 struct btrfs_root *subvol_root, 4601 struct btrfs_block_group *bg, 4602 struct extent_buffer *subvol_parent, int subvol_slot, 4603 struct extent_buffer *reloc_parent, int reloc_slot, 4604 u64 last_snapshot) 4605 { 4606 struct btrfs_fs_info *fs_info = subvol_root->fs_info; 4607 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; 4608 struct btrfs_qgroup_swapped_block *block; 4609 struct rb_node **cur; 4610 struct rb_node *parent = NULL; 4611 int level = btrfs_header_level(subvol_parent) - 1; 4612 int ret = 0; 4613 4614 if (!btrfs_qgroup_full_accounting(fs_info)) 4615 return 0; 4616 4617 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > 4618 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { 4619 btrfs_err_rl(fs_info, 4620 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", 4621 __func__, 4622 btrfs_node_ptr_generation(subvol_parent, subvol_slot), 4623 btrfs_node_ptr_generation(reloc_parent, reloc_slot)); 4624 return -EUCLEAN; 4625 } 4626 4627 block = kmalloc(sizeof(*block), GFP_NOFS); 4628 if (!block) { 4629 ret = -ENOMEM; 4630 goto out; 4631 } 4632 4633 /* 4634 * @reloc_parent/slot is still before swap, while @block is going to 4635 * record the bytenr after swap, so we do the swap here. 4636 */ 4637 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); 4638 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, 4639 reloc_slot); 4640 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); 4641 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, 4642 subvol_slot); 4643 block->last_snapshot = last_snapshot; 4644 block->level = level; 4645 4646 /* 4647 * If we have bg == NULL, we're called from btrfs_recover_relocation(), 4648 * no one else can modify tree blocks thus we qgroup will not change 4649 * no matter the value of trace_leaf. 4650 */ 4651 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) 4652 block->trace_leaf = true; 4653 else 4654 block->trace_leaf = false; 4655 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); 4656 4657 /* Insert @block into @blocks */ 4658 spin_lock(&blocks->lock); 4659 cur = &blocks->blocks[level].rb_node; 4660 while (*cur) { 4661 struct btrfs_qgroup_swapped_block *entry; 4662 4663 parent = *cur; 4664 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, 4665 node); 4666 4667 if (entry->subvol_bytenr < block->subvol_bytenr) { 4668 cur = &(*cur)->rb_left; 4669 } else if (entry->subvol_bytenr > block->subvol_bytenr) { 4670 cur = &(*cur)->rb_right; 4671 } else { 4672 if (entry->subvol_generation != 4673 block->subvol_generation || 4674 entry->reloc_bytenr != block->reloc_bytenr || 4675 entry->reloc_generation != 4676 block->reloc_generation) { 4677 /* 4678 * Duplicated but mismatch entry found. 4679 * Shouldn't happen. 4680 * 4681 * Marking qgroup inconsistent should be enough 4682 * for end users. 
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}

/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	check.level = block->level;
	check.transid = block->reloc_generation;
	check.has_first_key = true;
	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
					block->last_snapshot, block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		qgroup_mark_inconsistent(fs_info);
	}
	return ret;
}

void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	struct btrfs_qgroup_extent_record *next;
	struct rb_root *root;

	root = &trans->delayed_refs.dirty_extent_root;
	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
	*root = RB_ROOT;
}

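/*
 * The two helpers below implement reservation release and extent accounting
 * for simple quotas (squota).  Purely as an illustration of how the fields of
 * struct btrfs_squota_delta are consumed by btrfs_record_squota_delta()
 * below (the numbers are made up, this is not an additional API):
 *
 *	struct btrfs_squota_delta delta = {
 *		.root		= btrfs_root_id(root),
 *		.num_bytes	= SZ_16K,
 *		.generation	= trans->transid,
 *		.is_inc		= true,
 *	};
 *	ret = btrfs_record_squota_delta(fs_info, &delta);
 *
 * With .is_inc true the bytes are added to both rfer and excl of the owning
 * qgroup and all of its ancestors; with .is_inc false they are subtracted.
 */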
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
{
	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
		return;

	if (!is_fstree(root))
		return;

	btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
}

int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
			      struct btrfs_squota_delta *delta)
{
	int ret;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *qg;
	LIST_HEAD(qgroup_list);
	u64 root = delta->root;
	u64 num_bytes = delta->num_bytes;
	const int sign = (delta->is_inc ? 1 : -1);

	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
		return 0;

	if (!is_fstree(root))
		return 0;

	/* If the extent predates enabling quotas, don't count it. */
	if (delta->generation < fs_info->qgroup_enable_gen)
		return 0;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, root);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	ret = 0;
	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qg, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qg->excl += num_bytes * sign;
		qg->rfer += num_bytes * sign;
		qgroup_dirty(fs_info, qg);

		list_for_each_entry(glist, &qg->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
	qgroup_iterator_clean(&qgroup_list);

out:
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}