// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_rtgroup.h"

struct xfs_extent_busy_tree {
	spinlock_t		eb_lock;
	struct rb_root		eb_tree;
	unsigned int		eb_gen;
	wait_queue_head_t	eb_wait;
};

static void
xfs_extent_busy_insert_list(
	struct xfs_group	*xg,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags,
	struct list_head	*busy_list)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kzalloc(sizeof(struct xfs_extent_busy),
			GFP_KERNEL | __GFP_NOFAIL);
	new->group = xfs_group_hold(xg);
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(xg, bno, len);

	spin_lock(&eb->eb_lock);
	rbp = &eb->eb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &eb->eb_tree);

	/* always process discard lists in fifo order */
	list_add_tail(&new->list, busy_list);
	spin_unlock(&eb->eb_lock);
}

void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	struct xfs_group	*xg,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
	struct xfs_group	*xg,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	struct list_head	*busy_list)
{
	xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED,
			busy_list);
}
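
/*
 * Illustrative sketch, not part of the original file: how a caller that has
 * just freed an extent inside a transaction would typically record it as
 * busy.  The wrapper name xfs_example_mark_freed_busy() is hypothetical and
 * only demonstrates the calling convention of xfs_extent_busy_insert().
 */
static inline void
xfs_example_mark_freed_busy(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	/*
	 * Attach the extent to the transaction's busy list; it stays in the
	 * per-group rbtree until the transaction commits to disk and the
	 * list is cleared.
	 */
	xfs_extent_busy_insert(tp, pag_group(pag), bno, len, 0);
}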

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken and released internally, so
 * the caller must not already hold it when calling xfs_extent_busy_search().
 * This function returns 0 for no overlapping busy extent, -1 for an
 * overlapping but not exact busy extent, and 1 for an exact match.  This is
 * done so that a non-zero return indicates an overlap that will require a
 * synchronous transaction, but it can still be used to distinguish between a
 * partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_group	*xg,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	/* find closest start bno overlap */
	spin_lock(&eb->eb_lock);
	rbp = eb->eb_tree.rb_node;
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&eb->eb_lock);
	return match;
}
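
/*
 * Illustrative sketch, not part of the original file: one way a hypothetical
 * caller could act on the tristate return value described above.  The helper
 * name is made up for illustration.
 */
static inline bool
xfs_example_range_is_busy(
	struct xfs_group	*xg,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	switch (xfs_extent_busy_search(xg, bno, len)) {
	case 0:
		/* no overlap at all, safe to reuse straight away */
		return false;
	case 1:
		/* the exact range is busy */
		return true;
	default:
		/* partial overlap with one or more busy extents */
		return true;
	}
}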

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_group	*xg,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
		__releases(&eb->eb_lock)
		__acquires(&eb->eb_lock)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&eb->eb_lock);
		delay(1);
		spin_lock(&eb->eb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &eb->eb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *             +---------+
		 *             fbno   fend
		 *
		 * Case 7:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(xg, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&eb->eb_lock);
	xfs_log_force(xg->xg_mount, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(xg, fbno, flen);
	spin_lock(&eb->eb_lock);
	return false;
}

/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_group	*xg,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	struct rb_node		*rbp;

	ASSERT(flen > 0);
	spin_lock(&eb->eb_lock);
restart:
	rbp = eb->eb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen,
				userdata))
			goto restart;
	}
	spin_unlock(&eb->eb_lock);
}
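
/*
 * Illustrative sketch, not part of the original file: the overlap test that
 * both xfs_extent_busy_reuse() and xfs_extent_busy_trim() use to decide how
 * to walk the rbtree.  The helper name is hypothetical; the logic mirrors
 * the in-tree checks above.
 */
static inline bool
xfs_example_busy_overlaps(
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen)
{
	/* the free range ends at or before the busy extent starts */
	if (fbno + flen <= busyp->bno)
		return false;
	/* the free range starts at or after the busy extent ends */
	if (fbno >= busyp->bno + busyp->length)
		return false;
	return true;
}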

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than minlen,
 * no suitable extent could be found, and the higher level code needs to
 * force out the log and retry the allocation.
 *
 * Return the current busy generation for the group if the extent is busy.
 * This value can be used to wait for at least one of the currently busy
 * extents to be cleared.  Note that the busy list is not guaranteed to be
 * empty after the generation is bumped and waiters are woken.  The state of
 * a specific extent must always be confirmed with another call to
 * xfs_extent_busy_trim() before it can be used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_group	*xg,
	xfs_extlen_t		minlen,
	xfs_extlen_t		maxlen,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&eb->eb_lock);
	fbno = *bno;
	flen = *len;
	rbp = eb->eb_tree.rb_node;
	while (rbp && flen >= minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "fbno" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
			if (bbno - fbno >= maxlen) {
				/* left candidate fits perfect */
				fend = bbno;
			} else if (fend - bend >= maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = eb->eb_gen;
		ret = true;
	}
	spin_unlock(&eb->eb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as the failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	flen = 0;
	goto out;
}
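
/*
 * Illustrative sketch, not part of the original file: how a hypothetical
 * caller consumes xfs_extent_busy_trim().  The helper name is made up; the
 * real consumers are the allocator search routines.
 */
static inline bool
xfs_example_trim_candidate(
	struct xfs_group	*xg,
	xfs_extlen_t		minlen,
	xfs_extlen_t		maxlen,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	/*
	 * A false return means the candidate was not touched and is fully
	 * usable.  A true return means it was shortened (possibly to zero
	 * length) and *busy_gen is valid for xfs_extent_busy_flush().
	 */
	if (!xfs_extent_busy_trim(xg, minlen, maxlen, bno, len, busy_gen))
		return true;

	/* usable only if what is left still satisfies minlen */
	return *len >= minlen;
}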

static bool
xfs_extent_busy_clear_one(
	struct xfs_extent_busy	*busyp,
	bool			do_discard)
{
	struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents;

	if (busyp->length) {
		if (do_discard &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
			return false;
		}
		trace_xfs_extent_busy_clear(busyp->group, busyp->bno,
				busyp->length);
		rb_erase(&busyp->rb_node, &eb->eb_tree);
	}

	list_del_init(&busyp->list);
	xfs_group_put(busyp->group);
	kfree(busyp);
	return true;
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *next;

	busyp = list_first_entry_or_null(list, typeof(*busyp), list);
	if (!busyp)
		return;

	do {
		struct xfs_group	*xg = xfs_group_hold(busyp->group);
		struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
		bool			wakeup = false;

		spin_lock(&eb->eb_lock);
		do {
			next = list_next_entry(busyp, list);
			if (xfs_extent_busy_clear_one(busyp, do_discard))
				wakeup = true;
			busyp = next;
		} while (!list_entry_is_head(busyp, list, list) &&
			 busyp->group == xg);

		if (wakeup) {
			eb->eb_gen++;
			wake_up_all(&eb->eb_wait);
		}
		spin_unlock(&eb->eb_lock);
		xfs_group_put(xg);
	} while (!list_entry_is_head(busyp, list, list));
}
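
/*
 * Illustrative sketch, not part of the original file: the two-phase clearing
 * protocol implied by xfs_extent_busy_clear_one().  The helper name and the
 * synchronous back-to-back calls are assumptions for illustration; in
 * practice the second pass runs from the discard completion path once the
 * device has finished discarding the extents.
 */
static inline void
xfs_example_clear_committed_extents(
	struct list_head	*busy_list,
	bool			discard_supported)
{
	/*
	 * Pass 1: free extents that do not need a discard.  Extents that do
	 * are flagged XFS_EXTENT_BUSY_DISCARDED and remain on @busy_list and
	 * in their group's rbtree.
	 */
	xfs_extent_busy_clear(busy_list, discard_supported);

	/*
	 * Pass 2: drop whatever is left; with do_discard false every
	 * remaining entry is removed from the rbtree and freed, and any
	 * waiters are woken.
	 */
	xfs_extent_busy_clear(busy_list, false);
}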

/*
 * Flush out all busy extents for this group.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve.  If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free
 * extents in the transaction busy list and the log force won't resolve that
 * situation.  In this case, we must return -EAGAIN to avoid a deadlock by
 * informing the caller it needs to commit the busy extents it holds before
 * retrying the extent free operation.
 */
int
xfs_extent_busy_flush(
	struct xfs_trans	*tp,
	struct xfs_group	*xg,
	unsigned		busy_gen,
	uint32_t		alloc_flags)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	DEFINE_WAIT		(wait);
	int			error;

	error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
	if (error)
		return error;

	/* Avoid deadlocks on uncommitted busy extents. */
	if (!list_empty(&tp->t_busy)) {
		if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
			return 0;

		if (busy_gen != READ_ONCE(eb->eb_gen))
			return 0;

		if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
			return -EAGAIN;
	}

	/* Wait for committed busy extents to resolve. */
	do {
		prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(eb->eb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&eb->eb_wait, &wait);
	return 0;
}

static void
xfs_extent_busy_wait_group(
	struct xfs_group	*xg)
{
	DEFINE_WAIT		(wait);
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;

	do {
		prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
		if (RB_EMPTY_ROOT(&eb->eb_tree))
			break;
		schedule();
	} while (1);
	finish_wait(&eb->eb_wait, &wait);
}

void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag = NULL;
	struct xfs_rtgroup	*rtg = NULL;

	while ((pag = xfs_perag_next(mp, pag)))
		xfs_extent_busy_wait_group(pag_group(pag));

	if (xfs_has_rtgroups(mp))
		while ((rtg = xfs_rtgroup_next(mp, rtg)))
			xfs_extent_busy_wait_group(rtg_group(rtg));
}

/*
 * Callback for list_sort to sort busy extents by the group they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	const struct list_head	*l1,
	const struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32			diff;

	diff = b1->group->xg_gno - b2->group->xg_gno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}

/* Are there any busy extents in this group? */
bool
xfs_extent_busy_list_empty(
	struct xfs_group	*xg,
	unsigned		*busy_gen)
{
	struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
	bool			res;

	spin_lock(&eb->eb_lock);
	res = RB_EMPTY_ROOT(&eb->eb_tree);
	*busy_gen = READ_ONCE(eb->eb_gen);
	spin_unlock(&eb->eb_lock);
	return res;
}

struct xfs_extent_busy_tree *
xfs_extent_busy_alloc(void)
{
	struct xfs_extent_busy_tree *eb;

	eb = kzalloc(sizeof(*eb), GFP_KERNEL);
	if (!eb)
		return NULL;
	spin_lock_init(&eb->eb_lock);
	init_waitqueue_head(&eb->eb_wait);
	eb->eb_tree = RB_ROOT;
	return eb;
}
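
/*
 * Illustrative sketch, not part of the original file: pairing a busy
 * generation snapshot with xfs_extent_busy_flush().  The helper name is
 * hypothetical; real callers typically capture busy_gen from
 * xfs_extent_busy_trim() or xfs_extent_busy_list_empty() and only flush
 * once they have run out of usable free space.
 */
static inline int
xfs_example_wait_for_busy_progress(
	struct xfs_trans	*tp,
	struct xfs_group	*xg,
	uint32_t		alloc_flags)
{
	unsigned		busy_gen;

	/* nothing to wait for if the tree is already empty */
	if (xfs_extent_busy_list_empty(xg, &busy_gen))
		return 0;

	/*
	 * Force the log and sleep until eb_gen moves past the snapshot,
	 * honouring the TRYFLUSH/FREEING deadlock-avoidance rules above.
	 */
	return xfs_extent_busy_flush(tp, xg, busy_gen, alloc_flags);
}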