// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

/*
 * Insert a busy extent into the per-AG rbtree (keyed by start block; busy
 * ranges must never overlap) and append it to the given busy list, which is
 * always processed in FIFO order.
 */
static void
xfs_extent_busy_insert_list(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags,
        struct list_head        *busy_list)
{
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

        new = kzalloc(sizeof(struct xfs_extent_busy),
                        GFP_KERNEL | __GFP_NOFAIL);
        new->agno = pag->pag_agno;
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* trace before insert to be able to see failed inserts */
        trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);

        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);

        /* always process discard lists in fifo order */
        list_add_tail(&new->list, busy_list);
        spin_unlock(&pag->pagb_lock);
}

void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        struct list_head        *busy_list)
{
        xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
                        busy_list);
}
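
/*
 * Illustrative sketch, not part of this file: a hypothetical wrapper showing
 * how a caller that frees an extent inside a transaction would mark the
 * blocks busy, so the space cannot be reallocated and overwritten before the
 * free is committed to the log.  Only xfs_extent_busy_insert() is real; the
 * wrapper name is made up.
 */
static inline void
xfs_example_mark_freed_extent_busy(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        /* track [bno, bno + len) on the transaction's busy list */
        xfs_extent_busy_insert(tp, pag, bno, len, 0);
}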

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken internally, so callers must
 * not already hold it.  This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
 * match.  This is done so that a non-zero return indicates an overlap that
 * will require a synchronous transaction, but it can still be used to
 * distinguish between a partial and an exact match.
 */
int
xfs_extent_busy_search(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        /* find closest start bno overlap */
        spin_lock(&pag->pagb_lock);
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&pag->pagb_lock);
        return match;
}
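
/*
 * Illustrative sketch, not part of this file: one way a caller might act on
 * the tri-state return value of xfs_extent_busy_search().  The wrapper name
 * is hypothetical; the 0/-1/1 semantics are those documented above.
 */
static inline bool
xfs_example_extent_overlaps_busy(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        int     busy = xfs_extent_busy_search(mp, pag, bno, len);

        /*
         * 0: no overlap; 1: exact match; -1: partial overlap.  Any non-zero
         * value means the overlap must be resolved (e.g. by a synchronous
         * transaction) before the range can be reused.
         */
        return busy != 0;
}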

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t           flen,
        bool                    userdata) __releases(&pag->pagb_lock)
                                          __acquires(&pag->pagb_lock)
{
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&pag->pagb_lock);
                delay(1);
                spin_lock(&pag->pagb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *              bbno           bend
                 *              +BBBBBBBBBBBBBBBBB+
                 *              +---------+
                 *              fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
                busyp->bno = fend;
                busyp->length = bend - fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&pag->pagb_lock);
        xfs_log_force(mp, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
        spin_lock(&pag->pagb_lock);
        return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct rb_node          *rbp;

        ASSERT(flen > 0);
        spin_lock(&pag->pagb_lock);
restart:
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&pag->pagb_lock);
}
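
/*
 * Illustrative sketch, not part of this file: a caller that is about to hand
 * out blocks which may still sit on the busy tree (for example when reusing
 * metadata blocks straight away) is expected to run them through
 * xfs_extent_busy_reuse() first, so overlapping busy extents get trimmed or
 * the log gets forced.  The wrapper name and the single-block length are
 * hypothetical.
 */
static inline void
xfs_example_prepare_block_for_reuse(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        bool                    userdata)
{
        /* one block; reuse for user data forces the log if the block is busy */
        xfs_extent_busy_reuse(mp, pag, bno, 1, userdata);
}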

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy.  This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared.  Note that the busy list is not guaranteed to be empty when
 * waiters on the gen are woken.  The state of a specific extent must always
 * be confirmed with another call to xfs_extent_busy_trim() before it can be
 * used.
 */
bool
xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        unsigned                *busy_gen)
{
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;
        bool                    ret = false;

        ASSERT(*len > 0);

        spin_lock(&args->pag->pagb_lock);
        fbno = *bno;
        flen = *len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, therefore we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation; if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
                        if (bbno - fbno >= args->maxlen) {
                                /* left candidate fits perfectly */
                                fend = bbno;
                        } else if (fend - bend >= args->maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= args->minlen) {
                                /* left candidate fits minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
out:

        if (fbno != *bno || flen != *len) {
                trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
                                          fbno, flen);
                *bno = fbno;
                *len = flen;
                *busy_gen = args->pag->pagb_gen;
                ret = true;
        }
        spin_unlock(&args->pag->pagb_lock);
        return ret;
fail:
        /*
         * Return a zero extent length as failure indication.  All callers
         * re-check if the trimmed extent satisfies the minlen requirement.
         */
        flen = 0;
        goto out;
}
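
/*
 * Illustrative sketch, not part of this file: the busy_gen reported by
 * xfs_extent_busy_trim() is intended to be passed to xfs_extent_busy_flush()
 * when the trimmed extent is too small, so the caller waits for at least one
 * busy extent to clear before retrying the allocation.  Everything apart
 * from the two xfs_extent_busy_*() calls and the struct xfs_alloc_arg fields
 * is hypothetical.
 */
static inline int
xfs_example_trim_or_wait(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        uint32_t                alloc_flags)
{
        unsigned                busy_gen;

        if (xfs_extent_busy_trim(args, bno, len, &busy_gen) &&
            *len < args->minlen)
                return xfs_extent_busy_flush(args->tp, args->pag, busy_gen,
                                alloc_flags);
        return 0;
}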

STATIC void
xfs_extent_busy_clear_one(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp)
{
        if (busyp->length) {
                trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                                                busyp->length);
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
        }

        list_del_init(&busyp->list);
        kfree(busyp);
}

static void
xfs_extent_busy_put_pag(
        struct xfs_perag        *pag,
        bool                    wakeup)
                __releases(pag->pagb_lock)
{
        if (wakeup) {
                pag->pagb_gen++;
                wake_up_all(&pag->pagb_wait);
        }

        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct xfs_mount        *mp,
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;
        bool                    wakeup = false;

        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
                        if (pag)
                                xfs_extent_busy_put_pag(pag, wakeup);
                        agno = busyp->agno;
                        pag = xfs_perag_get(mp, agno);
                        spin_lock(&pag->pagb_lock);
                        wakeup = false;
                }

                if (do_discard && busyp->length &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                } else {
                        xfs_extent_busy_clear_one(mp, pag, busyp);
                        wakeup = true;
                }
        }

        if (pag)
                xfs_extent_busy_put_pag(pag, wakeup);
}
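
/*
 * Illustrative sketch, not part of this file: once the log has committed the
 * transactions that own a busy list, the completion path is expected to
 * clear the list roughly like this, optionally leaving extents marked
 * XFS_EXTENT_BUSY_DISCARDED behind for a later discard pass.  The function
 * name and the busy_list argument are hypothetical.
 */
static inline void
xfs_example_busy_list_commit_done(
        struct xfs_mount        *mp,
        struct list_head        *busy_list,
        bool                    online_discard)
{
        /* drop, or hand off to discard, every extent pinned by this commit */
        xfs_extent_busy_clear(mp, busy_list, online_discard);
}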

/*
 * Flush out all busy extents for this AG.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve.  If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free
 * extents in the transaction busy list and the log force won't resolve that
 * situation.  In this case, we must return -EAGAIN to avoid a deadlock by
 * informing the caller it needs to commit the busy extents it holds before
 * retrying the extent free operation.
 */
int
xfs_extent_busy_flush(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        unsigned                busy_gen,
        uint32_t                alloc_flags)
{
        DEFINE_WAIT             (wait);
        int                     error;

        error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
        if (error)
                return error;

        /* Avoid deadlocks on uncommitted busy extents. */
        if (!list_empty(&tp->t_busy)) {
                if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
                        return 0;

                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        return 0;

                if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
                        return -EAGAIN;
        }

        /* Wait for committed busy extents to resolve. */
        do {
                prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        break;
                schedule();
        } while (1);

        finish_wait(&pag->pagb_wait, &wait);
        return 0;
}

void
xfs_extent_busy_wait_all(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        DEFINE_WAIT             (wait);
        xfs_agnumber_t          agno;

        for_each_perag(mp, agno, pag) {
                do {
                        prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                        if (RB_EMPTY_ROOT(&pag->pagb_tree))
                                break;
                        schedule();
                } while (1);
                finish_wait(&pag->pagb_wait, &wait);
        }
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        const struct list_head  *l1,
        const struct list_head  *l2)
{
        struct xfs_extent_busy  *b1 =
                container_of(l1, struct xfs_extent_busy, list);
        struct xfs_extent_busy  *b2 =
                container_of(l2, struct xfs_extent_busy, list);
        s32 diff;

        diff = b1->agno - b2->agno;
        if (!diff)
                diff = b1->bno - b2->bno;
        return diff;
}

/* Are there any busy extents in this AG? */
bool
xfs_extent_busy_list_empty(
        struct xfs_perag        *pag)
{
        bool                    res;

        spin_lock(&pag->pagb_lock);
        res = RB_EMPTY_ROOT(&pag->pagb_tree);
        spin_unlock(&pag->pagb_lock);
        return res;
}
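
/*
 * Illustrative sketch, not part of this file: a busy list spanning several
 * AGs can be ordered with list_sort() and the comparator above before it is
 * walked per AG (for instance by the online discard code).  The wrapper name
 * is hypothetical and it assumes <linux/list_sort.h> has been pulled in;
 * list_sort() and xfs_extent_busy_ag_cmp() are real.
 */
static inline void
xfs_example_sort_busy_list(
        struct list_head        *busy_list)
{
        /* group entries by AG, then by block number, for per-AG processing */
        list_sort(NULL, busy_list, xfs_extent_busy_ag_cmp);
}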