1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_inode.h" 18 #include "xfs_inode_item.h" 19 #include "xfs_trace.h" 20 #include "xfs_icache.h" 21 #include "xfs_log.h" 22 #include "xfs_log_priv.h" 23 #include "xfs_rmap.h" 24 #include "xfs_refcount.h" 25 #include "xfs_bmap.h" 26 #include "xfs_alloc.h" 27 #include "xfs_buf.h" 28 #include "xfs_da_format.h" 29 #include "xfs_da_btree.h" 30 #include "xfs_attr.h" 31 #include "xfs_trans_priv.h" 32 #include "xfs_exchmaps.h" 33 34 static struct kmem_cache *xfs_defer_pending_cache; 35 36 /* 37 * Deferred Operations in XFS 38 * 39 * Due to the way locking rules work in XFS, certain transactions (block 40 * mapping and unmapping, typically) have permanent reservations so that 41 * we can roll the transaction to adhere to AG locking order rules and 42 * to unlock buffers between metadata updates. Prior to rmap/reflink, 43 * the mapping code had a mechanism to perform these deferrals for 44 * extents that were going to be freed; this code makes that facility 45 * more generic. 46 * 47 * When adding the reverse mapping and reflink features, it became 48 * necessary to perform complex remapping multi-transactions to comply 49 * with AG locking order rules, and to be able to spread a single 50 * refcount update operation (an operation on an n-block extent can 51 * update as many as n records!) among multiple transactions. 
XFS can 52 * roll a transaction to facilitate this, but using this facility 53 * requires us to log "intent" items in case log recovery needs to 54 * redo the operation, and to log "done" items to indicate that redo 55 * is not necessary. 56 * 57 * Deferred work is tracked in xfs_defer_pending items. Each pending 58 * item tracks one type of deferred work. Incoming work items (which 59 * have not yet had an intent logged) are attached to a pending item 60 * on the dop_intake list, where they wait for the caller to finish 61 * the deferred operations. 62 * 63 * Finishing a set of deferred operations is an involved process. To 64 * start, we define "rolling a deferred-op transaction" as follows: 65 * 66 * > For each xfs_defer_pending item on the dop_intake list, 67 * - Sort the work items in AG order. XFS locking 68 * order rules require us to lock buffers in AG order. 69 * - Create a log intent item for that type. 70 * - Attach it to the pending item. 71 * - Move the pending item from the dop_intake list to the 72 * dop_pending list. 73 * > Roll the transaction. 74 * 75 * NOTE: To avoid exceeding the transaction reservation, we limit the 76 * number of items that we attach to a given xfs_defer_pending. 77 * 78 * The actual finishing process looks like this: 79 * 80 * > For each xfs_defer_pending in the dop_pending list, 81 * - Roll the deferred-op transaction as above. 82 * - Create a log done item for that type, and attach it to the 83 * log intent item. 84 * - For each work item attached to the log intent item, 85 * * Perform the described action. 86 * * Attach the work item to the log done item. 87 * * If the result of doing the work was -EAGAIN, ->finish work 88 * wants a new transaction. See the "Requesting a Fresh 89 * Transaction while Finishing Deferred Work" section below for 90 * details. 
91 * 92 * The key here is that we must log an intent item for all pending 93 * work items every time we roll the transaction, and that we must log 94 * a done item as soon as the work is completed. With this mechanism 95 * we can perform complex remapping operations, chaining intent items 96 * as needed. 97 * 98 * Requesting a Fresh Transaction while Finishing Deferred Work 99 * 100 * If ->finish_item decides that it needs a fresh transaction to 101 * finish the work, it must ask its caller (xfs_defer_finish) for a 102 * continuation. The most likely cause of this circumstance are the 103 * refcount adjust functions deciding that they've logged enough items 104 * to be at risk of exceeding the transaction reservation. 105 * 106 * To get a fresh transaction, we want to log the existing log done 107 * item to prevent the log intent item from replaying, immediately log 108 * a new log intent item with the unfinished work items, roll the 109 * transaction, and re-call ->finish_item wherever it left off. The 110 * log done item and the new log intent item must be in the same 111 * transaction or atomicity cannot be guaranteed; defer_finish ensures 112 * that this happens. 113 * 114 * This requires some coordination between ->finish_item and 115 * defer_finish. Upon deciding to request a new transaction, 116 * ->finish_item should update the current work item to reflect the 117 * unfinished work. Next, it should reset the log done item's list 118 * count to the number of items finished, and return -EAGAIN. 119 * defer_finish sees the -EAGAIN, logs the new log intent item 120 * with the remaining work items, and leaves the xfs_defer_pending 121 * item at the head of the dop_work queue. Then it rolls the 122 * transaction and picks up processing where it left off. It is 123 * required that ->finish_item must be careful to leave enough 124 * transaction reservation to fit the new log intent item. 
125 * 126 * This is an example of remapping the extent (E, E+B) into file X at 127 * offset A and dealing with the extent (C, C+B) already being mapped 128 * there: 129 * +-------------------------------------------------+ 130 * | Unmap file X startblock C offset A length B | t0 131 * | Intent to reduce refcount for extent (C, B) | 132 * | Intent to remove rmap (X, C, A, B) | 133 * | Intent to free extent (D, 1) (bmbt block) | 134 * | Intent to map (X, A, B) at startblock E | 135 * +-------------------------------------------------+ 136 * | Map file X startblock E offset A length B | t1 137 * | Done mapping (X, E, A, B) | 138 * | Intent to increase refcount for extent (E, B) | 139 * | Intent to add rmap (X, E, A, B) | 140 * +-------------------------------------------------+ 141 * | Reduce refcount for extent (C, B) | t2 142 * | Done reducing refcount for extent (C, 9) | 143 * | Intent to reduce refcount for extent (C+9, B-9) | 144 * | (ran out of space after 9 refcount updates) | 145 * +-------------------------------------------------+ 146 * | Reduce refcount for extent (C+9, B-9) | t3 147 * | Done reducing refcount for extent (C+9, B-9) | 148 * | Increase refcount for extent (E, B) | 149 * | Done increasing refcount for extent (E, B) | 150 * | Intent to free extent (C, B) | 151 * | Intent to free extent (F, 1) (refcountbt block) | 152 * | Intent to remove rmap (F, 1, REFC) | 153 * +-------------------------------------------------+ 154 * | Remove rmap (X, C, A, B) | t4 155 * | Done removing rmap (X, C, A, B) | 156 * | Add rmap (X, E, A, B) | 157 * | Done adding rmap (X, E, A, B) | 158 * | Remove rmap (F, 1, REFC) | 159 * | Done removing rmap (F, 1, REFC) | 160 * +-------------------------------------------------+ 161 * | Free extent (C, B) | t5 162 * | Done freeing extent (C, B) | 163 * | Free extent (D, 1) | 164 * | Done freeing extent (D, 1) | 165 * | Free extent (F, 1) | 166 * | Done freeing extent (F, 1) | 167 * 
+-------------------------------------------------+ 168 * 169 * If we should crash before t2 commits, log recovery replays 170 * the following intent items: 171 * 172 * - Intent to reduce refcount for extent (C, B) 173 * - Intent to remove rmap (X, C, A, B) 174 * - Intent to free extent (D, 1) (bmbt block) 175 * - Intent to increase refcount for extent (E, B) 176 * - Intent to add rmap (X, E, A, B) 177 * 178 * In the process of recovering, it should also generate and take care 179 * of these intent items: 180 * 181 * - Intent to free extent (C, B) 182 * - Intent to free extent (F, 1) (refcountbt block) 183 * - Intent to remove rmap (F, 1, REFC) 184 * 185 * Note that the continuation requested between t2 and t3 is likely to 186 * reoccur. 187 */ 188 STATIC struct xfs_log_item * 189 xfs_defer_barrier_create_intent( 190 struct xfs_trans *tp, 191 struct list_head *items, 192 unsigned int count, 193 bool sort) 194 { 195 return NULL; 196 } 197 198 STATIC void 199 xfs_defer_barrier_abort_intent( 200 struct xfs_log_item *intent) 201 { 202 /* empty */ 203 } 204 205 STATIC struct xfs_log_item * 206 xfs_defer_barrier_create_done( 207 struct xfs_trans *tp, 208 struct xfs_log_item *intent, 209 unsigned int count) 210 { 211 return NULL; 212 } 213 214 STATIC int 215 xfs_defer_barrier_finish_item( 216 struct xfs_trans *tp, 217 struct xfs_log_item *done, 218 struct list_head *item, 219 struct xfs_btree_cur **state) 220 { 221 ASSERT(0); 222 return -EFSCORRUPTED; 223 } 224 225 STATIC void 226 xfs_defer_barrier_cancel_item( 227 struct list_head *item) 228 { 229 ASSERT(0); 230 } 231 232 static const struct xfs_defer_op_type xfs_barrier_defer_type = { 233 .max_items = 1, 234 .create_intent = xfs_defer_barrier_create_intent, 235 .abort_intent = xfs_defer_barrier_abort_intent, 236 .create_done = xfs_defer_barrier_create_done, 237 .finish_item = xfs_defer_barrier_finish_item, 238 .cancel_item = xfs_defer_barrier_cancel_item, 239 }; 240 241 /* Create a log intent done item for a log intent 
item. */ 242 static inline void 243 xfs_defer_create_done( 244 struct xfs_trans *tp, 245 struct xfs_defer_pending *dfp) 246 { 247 struct xfs_log_item *lip; 248 249 /* If there is no log intent item, there can be no log done item. */ 250 if (!dfp->dfp_intent) 251 return; 252 253 /* 254 * Mark the transaction dirty, even on error. This ensures the 255 * transaction is aborted, which: 256 * 257 * 1.) releases the log intent item and frees the log done item 258 * 2.) shuts down the filesystem 259 */ 260 tp->t_flags |= XFS_TRANS_DIRTY; 261 lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); 262 if (!lip) 263 return; 264 265 tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; 266 xfs_trans_add_item(tp, lip); 267 set_bit(XFS_LI_DIRTY, &lip->li_flags); 268 dfp->dfp_done = lip; 269 } 270 271 /* 272 * Ensure there's a log intent item associated with this deferred work item if 273 * the operation must be restarted on crash. Returns 1 if there's a log item; 274 * 0 if there isn't; or a negative errno. 275 */ 276 static int 277 xfs_defer_create_intent( 278 struct xfs_trans *tp, 279 struct xfs_defer_pending *dfp, 280 bool sort) 281 { 282 struct xfs_log_item *lip; 283 284 if (dfp->dfp_intent) 285 return 1; 286 287 lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, 288 sort); 289 if (!lip) 290 return 0; 291 if (IS_ERR(lip)) 292 return PTR_ERR(lip); 293 294 tp->t_flags |= XFS_TRANS_DIRTY; 295 xfs_trans_add_item(tp, lip); 296 set_bit(XFS_LI_DIRTY, &lip->li_flags); 297 dfp->dfp_intent = lip; 298 return 1; 299 } 300 301 /* 302 * For each pending item in the intake list, log its intent item and the 303 * associated extents, then add the entire intake list to the end of 304 * the pending list. 305 * 306 * Returns 1 if at least one log item was associated with the deferred work; 307 * 0 if there are no log items; or a negative errno. 
308 */ 309 static int 310 xfs_defer_create_intents( 311 struct xfs_trans *tp) 312 { 313 struct xfs_defer_pending *dfp; 314 int ret = 0; 315 316 list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { 317 int ret2; 318 319 trace_xfs_defer_create_intent(tp->t_mountp, dfp); 320 ret2 = xfs_defer_create_intent(tp, dfp, true); 321 if (ret2 < 0) 322 return ret2; 323 ret |= ret2; 324 } 325 return ret; 326 } 327 328 static inline void 329 xfs_defer_pending_abort( 330 struct xfs_mount *mp, 331 struct xfs_defer_pending *dfp) 332 { 333 trace_xfs_defer_pending_abort(mp, dfp); 334 335 if (dfp->dfp_intent && !dfp->dfp_done) { 336 dfp->dfp_ops->abort_intent(dfp->dfp_intent); 337 dfp->dfp_intent = NULL; 338 } 339 } 340 341 static inline void 342 xfs_defer_pending_cancel_work( 343 struct xfs_mount *mp, 344 struct xfs_defer_pending *dfp) 345 { 346 struct list_head *pwi; 347 struct list_head *n; 348 349 trace_xfs_defer_cancel_list(mp, dfp); 350 351 list_del(&dfp->dfp_list); 352 list_for_each_safe(pwi, n, &dfp->dfp_work) { 353 list_del(pwi); 354 dfp->dfp_count--; 355 trace_xfs_defer_cancel_item(mp, dfp, pwi); 356 dfp->dfp_ops->cancel_item(pwi); 357 } 358 ASSERT(dfp->dfp_count == 0); 359 kmem_cache_free(xfs_defer_pending_cache, dfp); 360 } 361 362 STATIC void 363 xfs_defer_pending_abort_list( 364 struct xfs_mount *mp, 365 struct list_head *dop_list) 366 { 367 struct xfs_defer_pending *dfp; 368 369 /* Abort intent items that don't have a done item. */ 370 list_for_each_entry(dfp, dop_list, dfp_list) 371 xfs_defer_pending_abort(mp, dfp); 372 } 373 374 /* Abort all the intents that were committed. */ 375 STATIC void 376 xfs_defer_trans_abort( 377 struct xfs_trans *tp, 378 struct list_head *dop_pending) 379 { 380 trace_xfs_defer_trans_abort(tp, _RET_IP_); 381 xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); 382 } 383 384 /* 385 * Capture resources that the caller said not to release ("held") when the 386 * transaction commits. Caller is responsible for zero-initializing @dres. 
387 */ 388 static int 389 xfs_defer_save_resources( 390 struct xfs_defer_resources *dres, 391 struct xfs_trans *tp) 392 { 393 struct xfs_buf_log_item *bli; 394 struct xfs_inode_log_item *ili; 395 struct xfs_log_item *lip; 396 397 BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); 398 399 list_for_each_entry(lip, &tp->t_items, li_trans) { 400 switch (lip->li_type) { 401 case XFS_LI_BUF: 402 bli = container_of(lip, struct xfs_buf_log_item, 403 bli_item); 404 if (bli->bli_flags & XFS_BLI_HOLD) { 405 if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { 406 ASSERT(0); 407 return -EFSCORRUPTED; 408 } 409 if (bli->bli_flags & XFS_BLI_ORDERED) 410 dres->dr_ordered |= 411 (1U << dres->dr_bufs); 412 else 413 xfs_trans_dirty_buf(tp, bli->bli_buf); 414 dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; 415 } 416 break; 417 case XFS_LI_INODE: 418 ili = container_of(lip, struct xfs_inode_log_item, 419 ili_item); 420 if (ili->ili_lock_flags == 0) { 421 if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { 422 ASSERT(0); 423 return -EFSCORRUPTED; 424 } 425 xfs_trans_log_inode(tp, ili->ili_inode, 426 XFS_ILOG_CORE); 427 dres->dr_ip[dres->dr_inos++] = ili->ili_inode; 428 } 429 break; 430 default: 431 break; 432 } 433 } 434 435 return 0; 436 } 437 438 /* Attach the held resources to the transaction. */ 439 static void 440 xfs_defer_restore_resources( 441 struct xfs_trans *tp, 442 struct xfs_defer_resources *dres) 443 { 444 unsigned short i; 445 446 /* Rejoin the joined inodes. */ 447 for (i = 0; i < dres->dr_inos; i++) 448 xfs_trans_ijoin(tp, dres->dr_ip[i], 0); 449 450 /* Rejoin the buffers and dirty them so the log moves forward. */ 451 for (i = 0; i < dres->dr_bufs; i++) { 452 xfs_trans_bjoin(tp, dres->dr_bp[i]); 453 if (dres->dr_ordered & (1U << i)) 454 xfs_trans_ordered_buf(tp, dres->dr_bp[i]); 455 xfs_trans_bhold(tp, dres->dr_bp[i]); 456 } 457 } 458 459 /* Roll a transaction so we can do some deferred op processing. 
*/ 460 STATIC int 461 xfs_defer_trans_roll( 462 struct xfs_trans **tpp) 463 { 464 struct xfs_defer_resources dres = { }; 465 int error; 466 467 error = xfs_defer_save_resources(&dres, *tpp); 468 if (error) 469 return error; 470 471 trace_xfs_defer_trans_roll(*tpp, _RET_IP_); 472 473 /* 474 * Roll the transaction. Rolling always given a new transaction (even 475 * if committing the old one fails!) to hand back to the caller, so we 476 * join the held resources to the new transaction so that we always 477 * return with the held resources joined to @tpp, no matter what 478 * happened. 479 */ 480 error = xfs_trans_roll(tpp); 481 482 xfs_defer_restore_resources(*tpp, &dres); 483 484 if (error) 485 trace_xfs_defer_trans_roll_error(*tpp, error); 486 return error; 487 } 488 489 /* 490 * Free up any items left in the list. 491 */ 492 static void 493 xfs_defer_cancel_list( 494 struct xfs_mount *mp, 495 struct list_head *dop_list) 496 { 497 struct xfs_defer_pending *dfp; 498 struct xfs_defer_pending *pli; 499 500 /* 501 * Free the pending items. Caller should already have arranged 502 * for the intent items to be released. 503 */ 504 list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) 505 xfs_defer_pending_cancel_work(mp, dfp); 506 } 507 508 static inline void 509 xfs_defer_relog_intent( 510 struct xfs_trans *tp, 511 struct xfs_defer_pending *dfp) 512 { 513 struct xfs_log_item *lip; 514 515 xfs_defer_create_done(tp, dfp); 516 517 lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); 518 if (lip) { 519 xfs_trans_add_item(tp, lip); 520 set_bit(XFS_LI_DIRTY, &lip->li_flags); 521 } 522 dfp->dfp_done = NULL; 523 dfp->dfp_intent = lip; 524 } 525 526 /* 527 * Prevent a log intent item from pinning the tail of the log by logging a 528 * done item to release the intent item; and then log a new intent item. 529 * The caller should provide a fresh transaction and roll it after we're done. 
530 */ 531 static void 532 xfs_defer_relog( 533 struct xfs_trans **tpp, 534 struct list_head *dfops) 535 { 536 struct xlog *log = (*tpp)->t_mountp->m_log; 537 struct xfs_defer_pending *dfp; 538 xfs_lsn_t threshold_lsn = NULLCOMMITLSN; 539 540 541 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 542 543 list_for_each_entry(dfp, dfops, dfp_list) { 544 /* 545 * If the log intent item for this deferred op is not a part of 546 * the current log checkpoint, relog the intent item to keep 547 * the log tail moving forward. We're ok with this being racy 548 * because an incorrect decision means we'll be a little slower 549 * at pushing the tail. 550 */ 551 if (dfp->dfp_intent == NULL || 552 xfs_log_item_in_current_chkpt(dfp->dfp_intent)) 553 continue; 554 555 /* 556 * Figure out where we need the tail to be in order to maintain 557 * the minimum required free space in the log. Only sample 558 * the log threshold once per call. 559 */ 560 if (threshold_lsn == NULLCOMMITLSN) { 561 threshold_lsn = xfs_ail_get_push_target(log->l_ailp); 562 if (threshold_lsn == NULLCOMMITLSN) 563 break; 564 } 565 if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) 566 continue; 567 568 trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); 569 XFS_STATS_INC((*tpp)->t_mountp, defer_relog); 570 571 xfs_defer_relog_intent(*tpp, dfp); 572 } 573 } 574 575 /* 576 * Log an intent-done item for the first pending intent, and finish the work 577 * items. 
578 */ 579 int 580 xfs_defer_finish_one( 581 struct xfs_trans *tp, 582 struct xfs_defer_pending *dfp) 583 { 584 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 585 struct xfs_btree_cur *state = NULL; 586 struct list_head *li, *n; 587 int error; 588 589 trace_xfs_defer_pending_finish(tp->t_mountp, dfp); 590 591 xfs_defer_create_done(tp, dfp); 592 list_for_each_safe(li, n, &dfp->dfp_work) { 593 list_del(li); 594 dfp->dfp_count--; 595 trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); 596 error = ops->finish_item(tp, dfp->dfp_done, li, &state); 597 if (error == -EAGAIN) { 598 int ret; 599 600 /* 601 * Caller wants a fresh transaction; put the work item 602 * back on the list and log a new log intent item to 603 * replace the old one. See "Requesting a Fresh 604 * Transaction while Finishing Deferred Work" above. 605 */ 606 list_add(li, &dfp->dfp_work); 607 dfp->dfp_count++; 608 dfp->dfp_done = NULL; 609 dfp->dfp_intent = NULL; 610 ret = xfs_defer_create_intent(tp, dfp, false); 611 if (ret < 0) 612 error = ret; 613 } 614 615 if (error) 616 goto out; 617 } 618 619 /* Done with the dfp, free it. */ 620 list_del(&dfp->dfp_list); 621 kmem_cache_free(xfs_defer_pending_cache, dfp); 622 out: 623 if (ops->finish_cleanup) 624 ops->finish_cleanup(tp, state, error); 625 return error; 626 } 627 628 /* Move all paused deferred work from @tp to @paused_list. */ 629 static void 630 xfs_defer_isolate_paused( 631 struct xfs_trans *tp, 632 struct list_head *paused_list) 633 { 634 struct xfs_defer_pending *dfp; 635 struct xfs_defer_pending *pli; 636 637 list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { 638 if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) 639 continue; 640 641 list_move_tail(&dfp->dfp_list, paused_list); 642 trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); 643 } 644 } 645 646 /* 647 * Finish all the pending work. 
This involves logging intent items for 648 * any work items that wandered in since the last transaction roll (if 649 * one has even happened), rolling the transaction, and finishing the 650 * work items in the first item on the logged-and-pending list. 651 * 652 * If an inode is provided, relog it to the new transaction. 653 */ 654 int 655 xfs_defer_finish_noroll( 656 struct xfs_trans **tp) 657 { 658 struct xfs_defer_pending *dfp = NULL; 659 int error = 0; 660 LIST_HEAD(dop_pending); 661 LIST_HEAD(dop_paused); 662 663 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 664 665 trace_xfs_defer_finish(*tp, _RET_IP_); 666 667 /* Until we run out of pending work to finish... */ 668 while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { 669 /* 670 * Deferred items that are created in the process of finishing 671 * other deferred work items should be queued at the head of 672 * the pending list, which puts them ahead of the deferred work 673 * that was created by the caller. This keeps the number of 674 * pending work items to a minimum, which decreases the amount 675 * of time that any one intent item can stick around in memory, 676 * pinning the log tail. 677 */ 678 int has_intents = xfs_defer_create_intents(*tp); 679 680 xfs_defer_isolate_paused(*tp, &dop_paused); 681 682 list_splice_init(&(*tp)->t_dfops, &dop_pending); 683 684 if (has_intents < 0) { 685 error = has_intents; 686 goto out_shutdown; 687 } 688 if (has_intents || dfp) { 689 error = xfs_defer_trans_roll(tp); 690 if (error) 691 goto out_shutdown; 692 693 /* Relog intent items to keep the log moving. 
*/ 694 xfs_defer_relog(tp, &dop_pending); 695 xfs_defer_relog(tp, &dop_paused); 696 697 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 698 error = xfs_defer_trans_roll(tp); 699 if (error) 700 goto out_shutdown; 701 } 702 } 703 704 dfp = list_first_entry_or_null(&dop_pending, 705 struct xfs_defer_pending, dfp_list); 706 if (!dfp) 707 break; 708 error = xfs_defer_finish_one(*tp, dfp); 709 if (error && error != -EAGAIN) 710 goto out_shutdown; 711 } 712 713 /* Requeue the paused items in the outgoing transaction. */ 714 list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); 715 716 trace_xfs_defer_finish_done(*tp, _RET_IP_); 717 return 0; 718 719 out_shutdown: 720 list_splice_tail_init(&dop_paused, &dop_pending); 721 xfs_defer_trans_abort(*tp, &dop_pending); 722 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); 723 trace_xfs_defer_finish_error(*tp, error); 724 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); 725 xfs_defer_cancel(*tp); 726 return error; 727 } 728 729 int 730 xfs_defer_finish( 731 struct xfs_trans **tp) 732 { 733 #ifdef DEBUG 734 struct xfs_defer_pending *dfp; 735 #endif 736 int error; 737 738 /* 739 * Finish and roll the transaction once more to avoid returning to the 740 * caller with a dirty transaction. 741 */ 742 error = xfs_defer_finish_noroll(tp); 743 if (error) 744 return error; 745 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 746 error = xfs_defer_trans_roll(tp); 747 if (error) { 748 xfs_force_shutdown((*tp)->t_mountp, 749 SHUTDOWN_CORRUPT_INCORE); 750 return error; 751 } 752 } 753 754 /* Reset LOWMODE now that we've finished all the dfops. 
*/ 755 #ifdef DEBUG 756 list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) 757 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 758 #endif 759 (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; 760 return 0; 761 } 762 763 void 764 xfs_defer_cancel( 765 struct xfs_trans *tp) 766 { 767 struct xfs_mount *mp = tp->t_mountp; 768 769 trace_xfs_defer_cancel(tp, _RET_IP_); 770 xfs_defer_trans_abort(tp, &tp->t_dfops); 771 xfs_defer_cancel_list(mp, &tp->t_dfops); 772 } 773 774 /* 775 * Return the last pending work item attached to this transaction if it matches 776 * the deferred op type. 777 */ 778 static inline struct xfs_defer_pending * 779 xfs_defer_find_last( 780 struct xfs_trans *tp, 781 const struct xfs_defer_op_type *ops) 782 { 783 struct xfs_defer_pending *dfp = NULL; 784 785 /* No dfops at all? */ 786 if (list_empty(&tp->t_dfops)) 787 return NULL; 788 789 dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, 790 dfp_list); 791 792 /* Wrong type? */ 793 if (dfp->dfp_ops != ops) 794 return NULL; 795 return dfp; 796 } 797 798 /* 799 * Decide if we can add a deferred work item to the last dfops item attached 800 * to the transaction. 801 */ 802 static inline bool 803 xfs_defer_can_append( 804 struct xfs_defer_pending *dfp, 805 const struct xfs_defer_op_type *ops) 806 { 807 /* Already logged? */ 808 if (dfp->dfp_intent) 809 return false; 810 811 /* Paused items cannot absorb more work */ 812 if (dfp->dfp_flags & XFS_DEFER_PAUSED) 813 return NULL; 814 815 /* Already full? */ 816 if (ops->max_items && dfp->dfp_count >= ops->max_items) 817 return false; 818 819 return true; 820 } 821 822 /* Create a new pending item at the end of the transaction list. 
*/ 823 static inline struct xfs_defer_pending * 824 xfs_defer_alloc( 825 struct list_head *dfops, 826 const struct xfs_defer_op_type *ops) 827 { 828 struct xfs_defer_pending *dfp; 829 830 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 831 GFP_KERNEL | __GFP_NOFAIL); 832 dfp->dfp_ops = ops; 833 INIT_LIST_HEAD(&dfp->dfp_work); 834 list_add_tail(&dfp->dfp_list, dfops); 835 836 return dfp; 837 } 838 839 /* Add an item for later deferred processing. */ 840 struct xfs_defer_pending * 841 xfs_defer_add( 842 struct xfs_trans *tp, 843 struct list_head *li, 844 const struct xfs_defer_op_type *ops) 845 { 846 struct xfs_defer_pending *dfp = NULL; 847 848 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 849 850 dfp = xfs_defer_find_last(tp, ops); 851 if (!dfp || !xfs_defer_can_append(dfp, ops)) 852 dfp = xfs_defer_alloc(&tp->t_dfops, ops); 853 854 xfs_defer_add_item(dfp, li); 855 trace_xfs_defer_add_item(tp->t_mountp, dfp, li); 856 return dfp; 857 } 858 859 /* 860 * Add a defer ops barrier to force two otherwise adjacent deferred work items 861 * to be tracked separately and have separate log items. 862 */ 863 void 864 xfs_defer_add_barrier( 865 struct xfs_trans *tp) 866 { 867 struct xfs_defer_pending *dfp; 868 869 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 870 871 /* If the last defer op added was a barrier, we're done. */ 872 dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); 873 if (dfp) 874 return; 875 876 xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); 877 878 trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); 879 } 880 881 /* 882 * Create a pending deferred work item to replay the recovered intent item 883 * and add it to the list. 
884 */ 885 void 886 xfs_defer_start_recovery( 887 struct xfs_log_item *lip, 888 struct list_head *r_dfops, 889 const struct xfs_defer_op_type *ops) 890 { 891 struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); 892 893 dfp->dfp_intent = lip; 894 } 895 896 /* 897 * Cancel a deferred work item created to recover a log intent item. @dfp 898 * will be freed after this function returns. 899 */ 900 void 901 xfs_defer_cancel_recovery( 902 struct xfs_mount *mp, 903 struct xfs_defer_pending *dfp) 904 { 905 xfs_defer_pending_abort(mp, dfp); 906 xfs_defer_pending_cancel_work(mp, dfp); 907 } 908 909 /* Replay the deferred work item created from a recovered log intent item. */ 910 int 911 xfs_defer_finish_recovery( 912 struct xfs_mount *mp, 913 struct xfs_defer_pending *dfp, 914 struct list_head *capture_list) 915 { 916 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 917 int error; 918 919 /* dfp is freed by recover_work and must not be accessed afterwards */ 920 error = ops->recover_work(dfp, capture_list); 921 if (error) 922 trace_xlog_intent_recovery_failed(mp, ops, error); 923 return error; 924 } 925 926 /* 927 * Move deferred ops from one transaction to another and reset the source to 928 * initial state. This is primarily used to carry state forward across 929 * transaction rolls with pending dfops. 930 */ 931 void 932 xfs_defer_move( 933 struct xfs_trans *dtp, 934 struct xfs_trans *stp) 935 { 936 list_splice_init(&stp->t_dfops, &dtp->t_dfops); 937 938 /* 939 * Low free space mode was historically controlled by a dfops field. 940 * This meant that low mode state potentially carried across multiple 941 * transaction rolls. Transfer low mode on a dfops move to preserve 942 * that behavior. 943 */ 944 dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); 945 stp->t_flags &= ~XFS_TRANS_LOWMODE; 946 } 947 948 /* 949 * Prepare a chain of fresh deferred ops work items to be completed later. 
Log 950 * recovery requires the ability to put off until later the actual finishing 951 * work so that it can process unfinished items recovered from the log in 952 * correct order. 953 * 954 * Create and log intent items for all the work that we're capturing so that we 955 * can be assured that the items will get replayed if the system goes down 956 * before log recovery gets a chance to finish the work it put off. The entire 957 * deferred ops state is transferred to the capture structure and the 958 * transaction is then ready for the caller to commit it. If there are no 959 * intent items to capture, this function returns NULL. 960 * 961 * If capture_ip is not NULL, the capture structure will obtain an extra 962 * reference to the inode. 963 */ 964 static struct xfs_defer_capture * 965 xfs_defer_ops_capture( 966 struct xfs_trans *tp) 967 { 968 struct xfs_defer_capture *dfc; 969 unsigned short i; 970 int error; 971 972 if (list_empty(&tp->t_dfops)) 973 return NULL; 974 975 error = xfs_defer_create_intents(tp); 976 if (error < 0) 977 return ERR_PTR(error); 978 979 /* Create an object to capture the defer ops. */ 980 dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); 981 INIT_LIST_HEAD(&dfc->dfc_list); 982 INIT_LIST_HEAD(&dfc->dfc_dfops); 983 984 /* Move the dfops chain and transaction state to the capture struct. */ 985 list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); 986 dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; 987 tp->t_flags &= ~XFS_TRANS_LOWMODE; 988 989 /* Capture the remaining block reservations along with the dfops. */ 990 dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; 991 dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; 992 993 /* Preserve the log reservation size. */ 994 dfc->dfc_logres = tp->t_log_res; 995 996 error = xfs_defer_save_resources(&dfc->dfc_held, tp); 997 if (error) { 998 /* 999 * Resource capture should never fail, but if it does, we 1000 * still have to shut down the log and release things 1001 * properly. 
1002 */ 1003 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); 1004 } 1005 1006 /* 1007 * Grab extra references to the inodes and buffers because callers are 1008 * expected to release their held references after we commit the 1009 * transaction. 1010 */ 1011 for (i = 0; i < dfc->dfc_held.dr_inos; i++) { 1012 xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); 1013 ihold(VFS_I(dfc->dfc_held.dr_ip[i])); 1014 } 1015 1016 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1017 xfs_buf_hold(dfc->dfc_held.dr_bp[i]); 1018 1019 return dfc; 1020 } 1021 1022 /* Release all resources that we used to capture deferred ops. */ 1023 void 1024 xfs_defer_ops_capture_abort( 1025 struct xfs_mount *mp, 1026 struct xfs_defer_capture *dfc) 1027 { 1028 unsigned short i; 1029 1030 xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); 1031 xfs_defer_cancel_list(mp, &dfc->dfc_dfops); 1032 1033 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1034 xfs_buf_relse(dfc->dfc_held.dr_bp[i]); 1035 1036 for (i = 0; i < dfc->dfc_held.dr_inos; i++) 1037 xfs_irele(dfc->dfc_held.dr_ip[i]); 1038 1039 kfree(dfc); 1040 } 1041 1042 /* 1043 * Capture any deferred ops and commit the transaction. This is the last step 1044 * needed to finish a log intent item that we recovered from the log. If any 1045 * of the deferred ops operate on an inode, the caller must pass in that inode 1046 * so that the reference can be transferred to the capture structure. The 1047 * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling 1048 * xfs_defer_ops_continue. 1049 */ 1050 int 1051 xfs_defer_ops_capture_and_commit( 1052 struct xfs_trans *tp, 1053 struct list_head *capture_list) 1054 { 1055 struct xfs_mount *mp = tp->t_mountp; 1056 struct xfs_defer_capture *dfc; 1057 int error; 1058 1059 /* If we don't capture anything, commit transaction and exit. 
*/ 1060 dfc = xfs_defer_ops_capture(tp); 1061 if (IS_ERR(dfc)) { 1062 xfs_trans_cancel(tp); 1063 return PTR_ERR(dfc); 1064 } 1065 if (!dfc) 1066 return xfs_trans_commit(tp); 1067 1068 /* Commit the transaction and add the capture structure to the list. */ 1069 error = xfs_trans_commit(tp); 1070 if (error) { 1071 xfs_defer_ops_capture_abort(mp, dfc); 1072 return error; 1073 } 1074 1075 list_add_tail(&dfc->dfc_list, capture_list); 1076 return 0; 1077 } 1078 1079 /* 1080 * Attach a chain of captured deferred ops to a new transaction and free the 1081 * capture structure. If an inode was captured, it will be passed back to the 1082 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. 1083 * The caller now owns the inode reference. 1084 */ 1085 void 1086 xfs_defer_ops_continue( 1087 struct xfs_defer_capture *dfc, 1088 struct xfs_trans *tp, 1089 struct xfs_defer_resources *dres) 1090 { 1091 unsigned int i; 1092 1093 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1094 ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); 1095 1096 /* Lock the captured resources to the new transaction. */ 1097 if (dfc->dfc_held.dr_inos > 2) { 1098 xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); 1099 xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, 1100 XFS_ILOCK_EXCL); 1101 } else if (dfc->dfc_held.dr_inos == 2) 1102 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, 1103 dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); 1104 else if (dfc->dfc_held.dr_inos == 1) 1105 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); 1106 1107 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1108 xfs_buf_lock(dfc->dfc_held.dr_bp[i]); 1109 1110 /* Join the captured resources to the new transaction. */ 1111 xfs_defer_restore_resources(tp, &dfc->dfc_held); 1112 memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); 1113 dres->dr_bufs = 0; 1114 1115 /* Move captured dfops chain and state to the transaction. 
*/ 1116 list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); 1117 tp->t_flags |= dfc->dfc_tpflags; 1118 1119 kfree(dfc); 1120 } 1121 1122 /* Release the resources captured and continued during recovery. */ 1123 void 1124 xfs_defer_resources_rele( 1125 struct xfs_defer_resources *dres) 1126 { 1127 unsigned short i; 1128 1129 for (i = 0; i < dres->dr_inos; i++) { 1130 xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); 1131 xfs_irele(dres->dr_ip[i]); 1132 dres->dr_ip[i] = NULL; 1133 } 1134 1135 for (i = 0; i < dres->dr_bufs; i++) { 1136 xfs_buf_relse(dres->dr_bp[i]); 1137 dres->dr_bp[i] = NULL; 1138 } 1139 1140 dres->dr_inos = 0; 1141 dres->dr_bufs = 0; 1142 dres->dr_ordered = 0; 1143 } 1144 1145 static inline int __init 1146 xfs_defer_init_cache(void) 1147 { 1148 xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", 1149 sizeof(struct xfs_defer_pending), 1150 0, 0, NULL); 1151 1152 return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; 1153 } 1154 1155 static inline void 1156 xfs_defer_destroy_cache(void) 1157 { 1158 kmem_cache_destroy(xfs_defer_pending_cache); 1159 xfs_defer_pending_cache = NULL; 1160 } 1161 1162 /* Set up caches for deferred work items. */ 1163 int __init 1164 xfs_defer_init_item_caches(void) 1165 { 1166 int error; 1167 1168 error = xfs_defer_init_cache(); 1169 if (error) 1170 return error; 1171 error = xfs_rmap_intent_init_cache(); 1172 if (error) 1173 goto err; 1174 error = xfs_refcount_intent_init_cache(); 1175 if (error) 1176 goto err; 1177 error = xfs_bmap_intent_init_cache(); 1178 if (error) 1179 goto err; 1180 error = xfs_extfree_intent_init_cache(); 1181 if (error) 1182 goto err; 1183 error = xfs_attr_intent_init_cache(); 1184 if (error) 1185 goto err; 1186 error = xfs_exchmaps_intent_init_cache(); 1187 if (error) 1188 goto err; 1189 1190 return 0; 1191 err: 1192 xfs_defer_destroy_item_caches(); 1193 return error; 1194 } 1195 1196 /* Destroy all the deferred work item caches, if they've been allocated. 
*/ 1197 void 1198 xfs_defer_destroy_item_caches(void) 1199 { 1200 xfs_exchmaps_intent_destroy_cache(); 1201 xfs_attr_intent_destroy_cache(); 1202 xfs_extfree_intent_destroy_cache(); 1203 xfs_bmap_intent_destroy_cache(); 1204 xfs_refcount_intent_destroy_cache(); 1205 xfs_rmap_intent_destroy_cache(); 1206 xfs_defer_destroy_cache(); 1207 } 1208 1209 /* 1210 * Mark a deferred work item so that it will be requeued indefinitely without 1211 * being finished. Caller must ensure there are no data dependencies on this 1212 * work item in the meantime. 1213 */ 1214 void 1215 xfs_defer_item_pause( 1216 struct xfs_trans *tp, 1217 struct xfs_defer_pending *dfp) 1218 { 1219 ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); 1220 1221 dfp->dfp_flags |= XFS_DEFER_PAUSED; 1222 1223 trace_xfs_defer_item_pause(tp->t_mountp, dfp); 1224 } 1225 1226 /* 1227 * Release a paused deferred work item so that it will be finished during the 1228 * next transaction roll. 1229 */ 1230 void 1231 xfs_defer_item_unpause( 1232 struct xfs_trans *tp, 1233 struct xfs_defer_pending *dfp) 1234 { 1235 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 1236 1237 dfp->dfp_flags &= ~XFS_DEFER_PAUSED; 1238 1239 trace_xfs_defer_item_unpause(tp->t_mountp, dfp); 1240 } 1241