1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_inode.h" 18 #include "xfs_inode_item.h" 19 #include "xfs_trace.h" 20 #include "xfs_icache.h" 21 #include "xfs_log.h" 22 #include "xfs_log_priv.h" 23 #include "xfs_rmap.h" 24 #include "xfs_refcount.h" 25 #include "xfs_bmap.h" 26 #include "xfs_alloc.h" 27 #include "xfs_buf.h" 28 #include "xfs_da_format.h" 29 #include "xfs_da_btree.h" 30 #include "xfs_attr.h" 31 #include "xfs_exchmaps.h" 32 33 static struct kmem_cache *xfs_defer_pending_cache; 34 35 /* 36 * Deferred Operations in XFS 37 * 38 * Due to the way locking rules work in XFS, certain transactions (block 39 * mapping and unmapping, typically) have permanent reservations so that 40 * we can roll the transaction to adhere to AG locking order rules and 41 * to unlock buffers between metadata updates. Prior to rmap/reflink, 42 * the mapping code had a mechanism to perform these deferrals for 43 * extents that were going to be freed; this code makes that facility 44 * more generic. 45 * 46 * When adding the reverse mapping and reflink features, it became 47 * necessary to perform complex remapping multi-transactions to comply 48 * with AG locking order rules, and to be able to spread a single 49 * refcount update operation (an operation on an n-block extent can 50 * update as many as n records!) among multiple transactions. XFS can 51 * roll a transaction to facilitate this, but using this facility 52 * requires us to log "intent" items in case log recovery needs to 53 * redo the operation, and to log "done" items to indicate that redo 54 * is not necessary. 
55 * 56 * Deferred work is tracked in xfs_defer_pending items. Each pending 57 * item tracks one type of deferred work. Incoming work items (which 58 * have not yet had an intent logged) are attached to a pending item 59 * on the dop_intake list, where they wait for the caller to finish 60 * the deferred operations. 61 * 62 * Finishing a set of deferred operations is an involved process. To 63 * start, we define "rolling a deferred-op transaction" as follows: 64 * 65 * > For each xfs_defer_pending item on the dop_intake list, 66 * - Sort the work items in AG order. XFS locking 67 * order rules require us to lock buffers in AG order. 68 * - Create a log intent item for that type. 69 * - Attach it to the pending item. 70 * - Move the pending item from the dop_intake list to the 71 * dop_pending list. 72 * > Roll the transaction. 73 * 74 * NOTE: To avoid exceeding the transaction reservation, we limit the 75 * number of items that we attach to a given xfs_defer_pending. 76 * 77 * The actual finishing process looks like this: 78 * 79 * > For each xfs_defer_pending in the dop_pending list, 80 * - Roll the deferred-op transaction as above. 81 * - Create a log done item for that type, and attach it to the 82 * log intent item. 83 * - For each work item attached to the log intent item, 84 * * Perform the described action. 85 * * Attach the work item to the log done item. 86 * * If the result of doing the work was -EAGAIN, ->finish work 87 * wants a new transaction. See the "Requesting a Fresh 88 * Transaction while Finishing Deferred Work" section below for 89 * details. 90 * 91 * The key here is that we must log an intent item for all pending 92 * work items every time we roll the transaction, and that we must log 93 * a done item as soon as the work is completed. With this mechanism 94 * we can perform complex remapping operations, chaining intent items 95 * as needed. 
96 * 97 * Requesting a Fresh Transaction while Finishing Deferred Work 98 * 99 * If ->finish_item decides that it needs a fresh transaction to 100 * finish the work, it must ask its caller (xfs_defer_finish) for a 101 * continuation. The most likely cause of this circumstance are the 102 * refcount adjust functions deciding that they've logged enough items 103 * to be at risk of exceeding the transaction reservation. 104 * 105 * To get a fresh transaction, we want to log the existing log done 106 * item to prevent the log intent item from replaying, immediately log 107 * a new log intent item with the unfinished work items, roll the 108 * transaction, and re-call ->finish_item wherever it left off. The 109 * log done item and the new log intent item must be in the same 110 * transaction or atomicity cannot be guaranteed; defer_finish ensures 111 * that this happens. 112 * 113 * This requires some coordination between ->finish_item and 114 * defer_finish. Upon deciding to request a new transaction, 115 * ->finish_item should update the current work item to reflect the 116 * unfinished work. Next, it should reset the log done item's list 117 * count to the number of items finished, and return -EAGAIN. 118 * defer_finish sees the -EAGAIN, logs the new log intent item 119 * with the remaining work items, and leaves the xfs_defer_pending 120 * item at the head of the dop_work queue. Then it rolls the 121 * transaction and picks up processing where it left off. It is 122 * required that ->finish_item must be careful to leave enough 123 * transaction reservation to fit the new log intent item. 
124 * 125 * This is an example of remapping the extent (E, E+B) into file X at 126 * offset A and dealing with the extent (C, C+B) already being mapped 127 * there: 128 * +-------------------------------------------------+ 129 * | Unmap file X startblock C offset A length B | t0 130 * | Intent to reduce refcount for extent (C, B) | 131 * | Intent to remove rmap (X, C, A, B) | 132 * | Intent to free extent (D, 1) (bmbt block) | 133 * | Intent to map (X, A, B) at startblock E | 134 * +-------------------------------------------------+ 135 * | Map file X startblock E offset A length B | t1 136 * | Done mapping (X, E, A, B) | 137 * | Intent to increase refcount for extent (E, B) | 138 * | Intent to add rmap (X, E, A, B) | 139 * +-------------------------------------------------+ 140 * | Reduce refcount for extent (C, B) | t2 141 * | Done reducing refcount for extent (C, 9) | 142 * | Intent to reduce refcount for extent (C+9, B-9) | 143 * | (ran out of space after 9 refcount updates) | 144 * +-------------------------------------------------+ 145 * | Reduce refcount for extent (C+9, B-9) | t3 146 * | Done reducing refcount for extent (C+9, B-9) | 147 * | Increase refcount for extent (E, B) | 148 * | Done increasing refcount for extent (E, B) | 149 * | Intent to free extent (C, B) | 150 * | Intent to free extent (F, 1) (refcountbt block) | 151 * | Intent to remove rmap (F, 1, REFC) | 152 * +-------------------------------------------------+ 153 * | Remove rmap (X, C, A, B) | t4 154 * | Done removing rmap (X, C, A, B) | 155 * | Add rmap (X, E, A, B) | 156 * | Done adding rmap (X, E, A, B) | 157 * | Remove rmap (F, 1, REFC) | 158 * | Done removing rmap (F, 1, REFC) | 159 * +-------------------------------------------------+ 160 * | Free extent (C, B) | t5 161 * | Done freeing extent (C, B) | 162 * | Free extent (D, 1) | 163 * | Done freeing extent (D, 1) | 164 * | Free extent (F, 1) | 165 * | Done freeing extent (F, 1) | 166 * 
+-------------------------------------------------+ 167 * 168 * If we should crash before t2 commits, log recovery replays 169 * the following intent items: 170 * 171 * - Intent to reduce refcount for extent (C, B) 172 * - Intent to remove rmap (X, C, A, B) 173 * - Intent to free extent (D, 1) (bmbt block) 174 * - Intent to increase refcount for extent (E, B) 175 * - Intent to add rmap (X, E, A, B) 176 * 177 * In the process of recovering, it should also generate and take care 178 * of these intent items: 179 * 180 * - Intent to free extent (C, B) 181 * - Intent to free extent (F, 1) (refcountbt block) 182 * - Intent to remove rmap (F, 1, REFC) 183 * 184 * Note that the continuation requested between t2 and t3 is likely to 185 * reoccur. 186 */ 187 STATIC struct xfs_log_item * 188 xfs_defer_barrier_create_intent( 189 struct xfs_trans *tp, 190 struct list_head *items, 191 unsigned int count, 192 bool sort) 193 { 194 return NULL; 195 } 196 197 STATIC void 198 xfs_defer_barrier_abort_intent( 199 struct xfs_log_item *intent) 200 { 201 /* empty */ 202 } 203 204 STATIC struct xfs_log_item * 205 xfs_defer_barrier_create_done( 206 struct xfs_trans *tp, 207 struct xfs_log_item *intent, 208 unsigned int count) 209 { 210 return NULL; 211 } 212 213 STATIC int 214 xfs_defer_barrier_finish_item( 215 struct xfs_trans *tp, 216 struct xfs_log_item *done, 217 struct list_head *item, 218 struct xfs_btree_cur **state) 219 { 220 ASSERT(0); 221 return -EFSCORRUPTED; 222 } 223 224 STATIC void 225 xfs_defer_barrier_cancel_item( 226 struct list_head *item) 227 { 228 ASSERT(0); 229 } 230 231 static const struct xfs_defer_op_type xfs_barrier_defer_type = { 232 .max_items = 1, 233 .create_intent = xfs_defer_barrier_create_intent, 234 .abort_intent = xfs_defer_barrier_abort_intent, 235 .create_done = xfs_defer_barrier_create_done, 236 .finish_item = xfs_defer_barrier_finish_item, 237 .cancel_item = xfs_defer_barrier_cancel_item, 238 }; 239 240 /* Create a log intent done item for a log intent 
item. */ 241 static inline void 242 xfs_defer_create_done( 243 struct xfs_trans *tp, 244 struct xfs_defer_pending *dfp) 245 { 246 struct xfs_log_item *lip; 247 248 /* If there is no log intent item, there can be no log done item. */ 249 if (!dfp->dfp_intent) 250 return; 251 252 /* 253 * Mark the transaction dirty, even on error. This ensures the 254 * transaction is aborted, which: 255 * 256 * 1.) releases the log intent item and frees the log done item 257 * 2.) shuts down the filesystem 258 */ 259 tp->t_flags |= XFS_TRANS_DIRTY; 260 lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); 261 if (!lip) 262 return; 263 264 tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; 265 xfs_trans_add_item(tp, lip); 266 set_bit(XFS_LI_DIRTY, &lip->li_flags); 267 dfp->dfp_done = lip; 268 } 269 270 /* 271 * Ensure there's a log intent item associated with this deferred work item if 272 * the operation must be restarted on crash. Returns 1 if there's a log item; 273 * 0 if there isn't; or a negative errno. 274 */ 275 static int 276 xfs_defer_create_intent( 277 struct xfs_trans *tp, 278 struct xfs_defer_pending *dfp, 279 bool sort) 280 { 281 struct xfs_log_item *lip; 282 283 if (dfp->dfp_intent) 284 return 1; 285 286 lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, 287 sort); 288 if (!lip) 289 return 0; 290 if (IS_ERR(lip)) 291 return PTR_ERR(lip); 292 293 tp->t_flags |= XFS_TRANS_DIRTY; 294 xfs_trans_add_item(tp, lip); 295 set_bit(XFS_LI_DIRTY, &lip->li_flags); 296 dfp->dfp_intent = lip; 297 return 1; 298 } 299 300 /* 301 * For each pending item in the intake list, log its intent item and the 302 * associated extents, then add the entire intake list to the end of 303 * the pending list. 304 * 305 * Returns 1 if at least one log item was associated with the deferred work; 306 * 0 if there are no log items; or a negative errno. 
307 */ 308 static int 309 xfs_defer_create_intents( 310 struct xfs_trans *tp) 311 { 312 struct xfs_defer_pending *dfp; 313 int ret = 0; 314 315 list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { 316 int ret2; 317 318 trace_xfs_defer_create_intent(tp->t_mountp, dfp); 319 ret2 = xfs_defer_create_intent(tp, dfp, true); 320 if (ret2 < 0) 321 return ret2; 322 ret |= ret2; 323 } 324 return ret; 325 } 326 327 static inline void 328 xfs_defer_pending_abort( 329 struct xfs_mount *mp, 330 struct xfs_defer_pending *dfp) 331 { 332 trace_xfs_defer_pending_abort(mp, dfp); 333 334 if (dfp->dfp_intent && !dfp->dfp_done) { 335 dfp->dfp_ops->abort_intent(dfp->dfp_intent); 336 dfp->dfp_intent = NULL; 337 } 338 } 339 340 static inline void 341 xfs_defer_pending_cancel_work( 342 struct xfs_mount *mp, 343 struct xfs_defer_pending *dfp) 344 { 345 struct list_head *pwi; 346 struct list_head *n; 347 348 trace_xfs_defer_cancel_list(mp, dfp); 349 350 list_del(&dfp->dfp_list); 351 list_for_each_safe(pwi, n, &dfp->dfp_work) { 352 list_del(pwi); 353 dfp->dfp_count--; 354 trace_xfs_defer_cancel_item(mp, dfp, pwi); 355 dfp->dfp_ops->cancel_item(pwi); 356 } 357 ASSERT(dfp->dfp_count == 0); 358 kmem_cache_free(xfs_defer_pending_cache, dfp); 359 } 360 361 STATIC void 362 xfs_defer_pending_abort_list( 363 struct xfs_mount *mp, 364 struct list_head *dop_list) 365 { 366 struct xfs_defer_pending *dfp; 367 368 /* Abort intent items that don't have a done item. */ 369 list_for_each_entry(dfp, dop_list, dfp_list) 370 xfs_defer_pending_abort(mp, dfp); 371 } 372 373 /* Abort all the intents that were committed. */ 374 STATIC void 375 xfs_defer_trans_abort( 376 struct xfs_trans *tp, 377 struct list_head *dop_pending) 378 { 379 trace_xfs_defer_trans_abort(tp, _RET_IP_); 380 xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); 381 } 382 383 /* 384 * Capture resources that the caller said not to release ("held") when the 385 * transaction commits. Caller is responsible for zero-initializing @dres. 
386 */ 387 static int 388 xfs_defer_save_resources( 389 struct xfs_defer_resources *dres, 390 struct xfs_trans *tp) 391 { 392 struct xfs_buf_log_item *bli; 393 struct xfs_inode_log_item *ili; 394 struct xfs_log_item *lip; 395 396 BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); 397 398 list_for_each_entry(lip, &tp->t_items, li_trans) { 399 switch (lip->li_type) { 400 case XFS_LI_BUF: 401 bli = container_of(lip, struct xfs_buf_log_item, 402 bli_item); 403 if (bli->bli_flags & XFS_BLI_HOLD) { 404 if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { 405 ASSERT(0); 406 return -EFSCORRUPTED; 407 } 408 if (bli->bli_flags & XFS_BLI_ORDERED) 409 dres->dr_ordered |= 410 (1U << dres->dr_bufs); 411 else 412 xfs_trans_dirty_buf(tp, bli->bli_buf); 413 dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; 414 } 415 break; 416 case XFS_LI_INODE: 417 ili = container_of(lip, struct xfs_inode_log_item, 418 ili_item); 419 if (ili->ili_lock_flags == 0) { 420 if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { 421 ASSERT(0); 422 return -EFSCORRUPTED; 423 } 424 xfs_trans_log_inode(tp, ili->ili_inode, 425 XFS_ILOG_CORE); 426 dres->dr_ip[dres->dr_inos++] = ili->ili_inode; 427 } 428 break; 429 default: 430 break; 431 } 432 } 433 434 return 0; 435 } 436 437 /* Attach the held resources to the transaction. */ 438 static void 439 xfs_defer_restore_resources( 440 struct xfs_trans *tp, 441 struct xfs_defer_resources *dres) 442 { 443 unsigned short i; 444 445 /* Rejoin the joined inodes. */ 446 for (i = 0; i < dres->dr_inos; i++) 447 xfs_trans_ijoin(tp, dres->dr_ip[i], 0); 448 449 /* Rejoin the buffers and dirty them so the log moves forward. */ 450 for (i = 0; i < dres->dr_bufs; i++) { 451 xfs_trans_bjoin(tp, dres->dr_bp[i]); 452 if (dres->dr_ordered & (1U << i)) 453 xfs_trans_ordered_buf(tp, dres->dr_bp[i]); 454 xfs_trans_bhold(tp, dres->dr_bp[i]); 455 } 456 } 457 458 /* Roll a transaction so we can do some deferred op processing. 
*/ 459 STATIC int 460 xfs_defer_trans_roll( 461 struct xfs_trans **tpp) 462 { 463 struct xfs_defer_resources dres = { }; 464 int error; 465 466 error = xfs_defer_save_resources(&dres, *tpp); 467 if (error) 468 return error; 469 470 trace_xfs_defer_trans_roll(*tpp, _RET_IP_); 471 472 /* 473 * Roll the transaction. Rolling always given a new transaction (even 474 * if committing the old one fails!) to hand back to the caller, so we 475 * join the held resources to the new transaction so that we always 476 * return with the held resources joined to @tpp, no matter what 477 * happened. 478 */ 479 error = xfs_trans_roll(tpp); 480 481 xfs_defer_restore_resources(*tpp, &dres); 482 483 if (error) 484 trace_xfs_defer_trans_roll_error(*tpp, error); 485 return error; 486 } 487 488 /* 489 * Free up any items left in the list. 490 */ 491 static void 492 xfs_defer_cancel_list( 493 struct xfs_mount *mp, 494 struct list_head *dop_list) 495 { 496 struct xfs_defer_pending *dfp; 497 struct xfs_defer_pending *pli; 498 499 /* 500 * Free the pending items. Caller should already have arranged 501 * for the intent items to be released. 502 */ 503 list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) 504 xfs_defer_pending_cancel_work(mp, dfp); 505 } 506 507 static inline void 508 xfs_defer_relog_intent( 509 struct xfs_trans *tp, 510 struct xfs_defer_pending *dfp) 511 { 512 struct xfs_log_item *lip; 513 514 xfs_defer_create_done(tp, dfp); 515 516 lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); 517 if (lip) { 518 xfs_trans_add_item(tp, lip); 519 set_bit(XFS_LI_DIRTY, &lip->li_flags); 520 } 521 dfp->dfp_done = NULL; 522 dfp->dfp_intent = lip; 523 } 524 525 /* 526 * Prevent a log intent item from pinning the tail of the log by logging a 527 * done item to release the intent item; and then log a new intent item. 528 * The caller should provide a fresh transaction and roll it after we're done. 
529 */ 530 static void 531 xfs_defer_relog( 532 struct xfs_trans **tpp, 533 struct list_head *dfops) 534 { 535 struct xlog *log = (*tpp)->t_mountp->m_log; 536 struct xfs_defer_pending *dfp; 537 xfs_lsn_t threshold_lsn = NULLCOMMITLSN; 538 539 540 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 541 542 list_for_each_entry(dfp, dfops, dfp_list) { 543 /* 544 * If the log intent item for this deferred op is not a part of 545 * the current log checkpoint, relog the intent item to keep 546 * the log tail moving forward. We're ok with this being racy 547 * because an incorrect decision means we'll be a little slower 548 * at pushing the tail. 549 */ 550 if (dfp->dfp_intent == NULL || 551 xfs_log_item_in_current_chkpt(dfp->dfp_intent)) 552 continue; 553 554 /* 555 * Figure out where we need the tail to be in order to maintain 556 * the minimum required free space in the log. Only sample 557 * the log threshold once per call. 558 */ 559 if (threshold_lsn == NULLCOMMITLSN) { 560 threshold_lsn = xfs_ail_get_push_target(log->l_ailp); 561 if (threshold_lsn == NULLCOMMITLSN) 562 break; 563 } 564 if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) 565 continue; 566 567 trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); 568 XFS_STATS_INC((*tpp)->t_mountp, defer_relog); 569 570 xfs_defer_relog_intent(*tpp, dfp); 571 } 572 } 573 574 /* 575 * Log an intent-done item for the first pending intent, and finish the work 576 * items. 
577 */ 578 int 579 xfs_defer_finish_one( 580 struct xfs_trans *tp, 581 struct xfs_defer_pending *dfp) 582 { 583 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 584 struct xfs_btree_cur *state = NULL; 585 struct list_head *li, *n; 586 int error; 587 588 trace_xfs_defer_pending_finish(tp->t_mountp, dfp); 589 590 xfs_defer_create_done(tp, dfp); 591 list_for_each_safe(li, n, &dfp->dfp_work) { 592 list_del(li); 593 dfp->dfp_count--; 594 trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); 595 error = ops->finish_item(tp, dfp->dfp_done, li, &state); 596 if (error == -EAGAIN) { 597 int ret; 598 599 /* 600 * Caller wants a fresh transaction; put the work item 601 * back on the list and log a new log intent item to 602 * replace the old one. See "Requesting a Fresh 603 * Transaction while Finishing Deferred Work" above. 604 */ 605 list_add(li, &dfp->dfp_work); 606 dfp->dfp_count++; 607 dfp->dfp_done = NULL; 608 dfp->dfp_intent = NULL; 609 ret = xfs_defer_create_intent(tp, dfp, false); 610 if (ret < 0) 611 error = ret; 612 } 613 614 if (error) 615 goto out; 616 } 617 618 /* Done with the dfp, free it. */ 619 list_del(&dfp->dfp_list); 620 kmem_cache_free(xfs_defer_pending_cache, dfp); 621 out: 622 if (ops->finish_cleanup) 623 ops->finish_cleanup(tp, state, error); 624 return error; 625 } 626 627 /* Move all paused deferred work from @tp to @paused_list. */ 628 static void 629 xfs_defer_isolate_paused( 630 struct xfs_trans *tp, 631 struct list_head *paused_list) 632 { 633 struct xfs_defer_pending *dfp; 634 struct xfs_defer_pending *pli; 635 636 list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { 637 if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) 638 continue; 639 640 list_move_tail(&dfp->dfp_list, paused_list); 641 trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); 642 } 643 } 644 645 /* 646 * Finish all the pending work. 
This involves logging intent items for 647 * any work items that wandered in since the last transaction roll (if 648 * one has even happened), rolling the transaction, and finishing the 649 * work items in the first item on the logged-and-pending list. 650 * 651 * If an inode is provided, relog it to the new transaction. 652 */ 653 int 654 xfs_defer_finish_noroll( 655 struct xfs_trans **tp) 656 { 657 struct xfs_defer_pending *dfp = NULL; 658 int error = 0; 659 LIST_HEAD(dop_pending); 660 LIST_HEAD(dop_paused); 661 662 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 663 664 trace_xfs_defer_finish(*tp, _RET_IP_); 665 666 /* Until we run out of pending work to finish... */ 667 while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { 668 /* 669 * Deferred items that are created in the process of finishing 670 * other deferred work items should be queued at the head of 671 * the pending list, which puts them ahead of the deferred work 672 * that was created by the caller. This keeps the number of 673 * pending work items to a minimum, which decreases the amount 674 * of time that any one intent item can stick around in memory, 675 * pinning the log tail. 676 */ 677 int has_intents = xfs_defer_create_intents(*tp); 678 679 xfs_defer_isolate_paused(*tp, &dop_paused); 680 681 list_splice_init(&(*tp)->t_dfops, &dop_pending); 682 683 if (has_intents < 0) { 684 error = has_intents; 685 goto out_shutdown; 686 } 687 if (has_intents || dfp) { 688 error = xfs_defer_trans_roll(tp); 689 if (error) 690 goto out_shutdown; 691 692 /* Relog intent items to keep the log moving. 
*/ 693 xfs_defer_relog(tp, &dop_pending); 694 xfs_defer_relog(tp, &dop_paused); 695 696 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 697 error = xfs_defer_trans_roll(tp); 698 if (error) 699 goto out_shutdown; 700 } 701 } 702 703 dfp = list_first_entry_or_null(&dop_pending, 704 struct xfs_defer_pending, dfp_list); 705 if (!dfp) 706 break; 707 error = xfs_defer_finish_one(*tp, dfp); 708 if (error && error != -EAGAIN) 709 goto out_shutdown; 710 } 711 712 /* Requeue the paused items in the outgoing transaction. */ 713 list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); 714 715 trace_xfs_defer_finish_done(*tp, _RET_IP_); 716 return 0; 717 718 out_shutdown: 719 list_splice_tail_init(&dop_paused, &dop_pending); 720 xfs_defer_trans_abort(*tp, &dop_pending); 721 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); 722 trace_xfs_defer_finish_error(*tp, error); 723 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); 724 xfs_defer_cancel(*tp); 725 return error; 726 } 727 728 int 729 xfs_defer_finish( 730 struct xfs_trans **tp) 731 { 732 #ifdef DEBUG 733 struct xfs_defer_pending *dfp; 734 #endif 735 int error; 736 737 /* 738 * Finish and roll the transaction once more to avoid returning to the 739 * caller with a dirty transaction. 740 */ 741 error = xfs_defer_finish_noroll(tp); 742 if (error) 743 return error; 744 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 745 error = xfs_defer_trans_roll(tp); 746 if (error) { 747 xfs_force_shutdown((*tp)->t_mountp, 748 SHUTDOWN_CORRUPT_INCORE); 749 return error; 750 } 751 } 752 753 /* Reset LOWMODE now that we've finished all the dfops. 
*/ 754 #ifdef DEBUG 755 list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) 756 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 757 #endif 758 (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; 759 return 0; 760 } 761 762 void 763 xfs_defer_cancel( 764 struct xfs_trans *tp) 765 { 766 struct xfs_mount *mp = tp->t_mountp; 767 768 trace_xfs_defer_cancel(tp, _RET_IP_); 769 xfs_defer_trans_abort(tp, &tp->t_dfops); 770 xfs_defer_cancel_list(mp, &tp->t_dfops); 771 } 772 773 /* 774 * Return the last pending work item attached to this transaction if it matches 775 * the deferred op type. 776 */ 777 static inline struct xfs_defer_pending * 778 xfs_defer_find_last( 779 struct xfs_trans *tp, 780 const struct xfs_defer_op_type *ops) 781 { 782 struct xfs_defer_pending *dfp = NULL; 783 784 /* No dfops at all? */ 785 if (list_empty(&tp->t_dfops)) 786 return NULL; 787 788 dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, 789 dfp_list); 790 791 /* Wrong type? */ 792 if (dfp->dfp_ops != ops) 793 return NULL; 794 return dfp; 795 } 796 797 /* 798 * Decide if we can add a deferred work item to the last dfops item attached 799 * to the transaction. 800 */ 801 static inline bool 802 xfs_defer_can_append( 803 struct xfs_defer_pending *dfp, 804 const struct xfs_defer_op_type *ops) 805 { 806 /* Already logged? */ 807 if (dfp->dfp_intent) 808 return false; 809 810 /* Paused items cannot absorb more work */ 811 if (dfp->dfp_flags & XFS_DEFER_PAUSED) 812 return NULL; 813 814 /* Already full? */ 815 if (ops->max_items && dfp->dfp_count >= ops->max_items) 816 return false; 817 818 return true; 819 } 820 821 /* Create a new pending item at the end of the transaction list. 
*/ 822 static inline struct xfs_defer_pending * 823 xfs_defer_alloc( 824 struct list_head *dfops, 825 const struct xfs_defer_op_type *ops) 826 { 827 struct xfs_defer_pending *dfp; 828 829 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 830 GFP_KERNEL | __GFP_NOFAIL); 831 dfp->dfp_ops = ops; 832 INIT_LIST_HEAD(&dfp->dfp_work); 833 list_add_tail(&dfp->dfp_list, dfops); 834 835 return dfp; 836 } 837 838 /* Add an item for later deferred processing. */ 839 struct xfs_defer_pending * 840 xfs_defer_add( 841 struct xfs_trans *tp, 842 struct list_head *li, 843 const struct xfs_defer_op_type *ops) 844 { 845 struct xfs_defer_pending *dfp = NULL; 846 847 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 848 849 dfp = xfs_defer_find_last(tp, ops); 850 if (!dfp || !xfs_defer_can_append(dfp, ops)) 851 dfp = xfs_defer_alloc(&tp->t_dfops, ops); 852 853 xfs_defer_add_item(dfp, li); 854 trace_xfs_defer_add_item(tp->t_mountp, dfp, li); 855 return dfp; 856 } 857 858 /* 859 * Add a defer ops barrier to force two otherwise adjacent deferred work items 860 * to be tracked separately and have separate log items. 861 */ 862 void 863 xfs_defer_add_barrier( 864 struct xfs_trans *tp) 865 { 866 struct xfs_defer_pending *dfp; 867 868 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 869 870 /* If the last defer op added was a barrier, we're done. */ 871 dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); 872 if (dfp) 873 return; 874 875 xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); 876 877 trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); 878 } 879 880 /* 881 * Create a pending deferred work item to replay the recovered intent item 882 * and add it to the list. 
883 */ 884 void 885 xfs_defer_start_recovery( 886 struct xfs_log_item *lip, 887 struct list_head *r_dfops, 888 const struct xfs_defer_op_type *ops) 889 { 890 struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); 891 892 dfp->dfp_intent = lip; 893 } 894 895 /* 896 * Cancel a deferred work item created to recover a log intent item. @dfp 897 * will be freed after this function returns. 898 */ 899 void 900 xfs_defer_cancel_recovery( 901 struct xfs_mount *mp, 902 struct xfs_defer_pending *dfp) 903 { 904 xfs_defer_pending_abort(mp, dfp); 905 xfs_defer_pending_cancel_work(mp, dfp); 906 } 907 908 /* Replay the deferred work item created from a recovered log intent item. */ 909 int 910 xfs_defer_finish_recovery( 911 struct xfs_mount *mp, 912 struct xfs_defer_pending *dfp, 913 struct list_head *capture_list) 914 { 915 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 916 int error; 917 918 /* dfp is freed by recover_work and must not be accessed afterwards */ 919 error = ops->recover_work(dfp, capture_list); 920 if (error) 921 trace_xlog_intent_recovery_failed(mp, ops, error); 922 return error; 923 } 924 925 /* 926 * Move deferred ops from one transaction to another and reset the source to 927 * initial state. This is primarily used to carry state forward across 928 * transaction rolls with pending dfops. 929 */ 930 void 931 xfs_defer_move( 932 struct xfs_trans *dtp, 933 struct xfs_trans *stp) 934 { 935 list_splice_init(&stp->t_dfops, &dtp->t_dfops); 936 937 /* 938 * Low free space mode was historically controlled by a dfops field. 939 * This meant that low mode state potentially carried across multiple 940 * transaction rolls. Transfer low mode on a dfops move to preserve 941 * that behavior. 942 */ 943 dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); 944 stp->t_flags &= ~XFS_TRANS_LOWMODE; 945 } 946 947 /* 948 * Prepare a chain of fresh deferred ops work items to be completed later. 
Log 949 * recovery requires the ability to put off until later the actual finishing 950 * work so that it can process unfinished items recovered from the log in 951 * correct order. 952 * 953 * Create and log intent items for all the work that we're capturing so that we 954 * can be assured that the items will get replayed if the system goes down 955 * before log recovery gets a chance to finish the work it put off. The entire 956 * deferred ops state is transferred to the capture structure and the 957 * transaction is then ready for the caller to commit it. If there are no 958 * intent items to capture, this function returns NULL. 959 * 960 * If capture_ip is not NULL, the capture structure will obtain an extra 961 * reference to the inode. 962 */ 963 static struct xfs_defer_capture * 964 xfs_defer_ops_capture( 965 struct xfs_trans *tp) 966 { 967 struct xfs_defer_capture *dfc; 968 unsigned short i; 969 int error; 970 971 if (list_empty(&tp->t_dfops)) 972 return NULL; 973 974 error = xfs_defer_create_intents(tp); 975 if (error < 0) 976 return ERR_PTR(error); 977 978 /* Create an object to capture the defer ops. */ 979 dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); 980 INIT_LIST_HEAD(&dfc->dfc_list); 981 INIT_LIST_HEAD(&dfc->dfc_dfops); 982 983 /* Move the dfops chain and transaction state to the capture struct. */ 984 list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); 985 dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; 986 tp->t_flags &= ~XFS_TRANS_LOWMODE; 987 988 /* Capture the remaining block reservations along with the dfops. */ 989 dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; 990 dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; 991 992 /* Preserve the log reservation size. */ 993 dfc->dfc_logres = tp->t_log_res; 994 995 error = xfs_defer_save_resources(&dfc->dfc_held, tp); 996 if (error) { 997 /* 998 * Resource capture should never fail, but if it does, we 999 * still have to shut down the log and release things 1000 * properly. 
1001 */ 1002 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); 1003 } 1004 1005 /* 1006 * Grab extra references to the inodes and buffers because callers are 1007 * expected to release their held references after we commit the 1008 * transaction. 1009 */ 1010 for (i = 0; i < dfc->dfc_held.dr_inos; i++) { 1011 xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); 1012 ihold(VFS_I(dfc->dfc_held.dr_ip[i])); 1013 } 1014 1015 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1016 xfs_buf_hold(dfc->dfc_held.dr_bp[i]); 1017 1018 return dfc; 1019 } 1020 1021 /* Release all resources that we used to capture deferred ops. */ 1022 void 1023 xfs_defer_ops_capture_abort( 1024 struct xfs_mount *mp, 1025 struct xfs_defer_capture *dfc) 1026 { 1027 unsigned short i; 1028 1029 xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); 1030 xfs_defer_cancel_list(mp, &dfc->dfc_dfops); 1031 1032 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1033 xfs_buf_relse(dfc->dfc_held.dr_bp[i]); 1034 1035 for (i = 0; i < dfc->dfc_held.dr_inos; i++) 1036 xfs_irele(dfc->dfc_held.dr_ip[i]); 1037 1038 kfree(dfc); 1039 } 1040 1041 /* 1042 * Capture any deferred ops and commit the transaction. This is the last step 1043 * needed to finish a log intent item that we recovered from the log. If any 1044 * of the deferred ops operate on an inode, the caller must pass in that inode 1045 * so that the reference can be transferred to the capture structure. The 1046 * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling 1047 * xfs_defer_ops_continue. 1048 */ 1049 int 1050 xfs_defer_ops_capture_and_commit( 1051 struct xfs_trans *tp, 1052 struct list_head *capture_list) 1053 { 1054 struct xfs_mount *mp = tp->t_mountp; 1055 struct xfs_defer_capture *dfc; 1056 int error; 1057 1058 /* If we don't capture anything, commit transaction and exit. 
*/ 1059 dfc = xfs_defer_ops_capture(tp); 1060 if (IS_ERR(dfc)) { 1061 xfs_trans_cancel(tp); 1062 return PTR_ERR(dfc); 1063 } 1064 if (!dfc) 1065 return xfs_trans_commit(tp); 1066 1067 /* Commit the transaction and add the capture structure to the list. */ 1068 error = xfs_trans_commit(tp); 1069 if (error) { 1070 xfs_defer_ops_capture_abort(mp, dfc); 1071 return error; 1072 } 1073 1074 list_add_tail(&dfc->dfc_list, capture_list); 1075 return 0; 1076 } 1077 1078 /* 1079 * Attach a chain of captured deferred ops to a new transaction and free the 1080 * capture structure. If an inode was captured, it will be passed back to the 1081 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. 1082 * The caller now owns the inode reference. 1083 */ 1084 void 1085 xfs_defer_ops_continue( 1086 struct xfs_defer_capture *dfc, 1087 struct xfs_trans *tp, 1088 struct xfs_defer_resources *dres) 1089 { 1090 unsigned int i; 1091 1092 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1093 ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); 1094 1095 /* Lock the captured resources to the new transaction. */ 1096 if (dfc->dfc_held.dr_inos > 2) { 1097 xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); 1098 xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, 1099 XFS_ILOCK_EXCL); 1100 } else if (dfc->dfc_held.dr_inos == 2) 1101 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, 1102 dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); 1103 else if (dfc->dfc_held.dr_inos == 1) 1104 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); 1105 1106 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1107 xfs_buf_lock(dfc->dfc_held.dr_bp[i]); 1108 1109 /* Join the captured resources to the new transaction. */ 1110 xfs_defer_restore_resources(tp, &dfc->dfc_held); 1111 memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); 1112 dres->dr_bufs = 0; 1113 1114 /* Move captured dfops chain and state to the transaction. 
*/ 1115 list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); 1116 tp->t_flags |= dfc->dfc_tpflags; 1117 1118 kfree(dfc); 1119 } 1120 1121 /* Release the resources captured and continued during recovery. */ 1122 void 1123 xfs_defer_resources_rele( 1124 struct xfs_defer_resources *dres) 1125 { 1126 unsigned short i; 1127 1128 for (i = 0; i < dres->dr_inos; i++) { 1129 xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); 1130 xfs_irele(dres->dr_ip[i]); 1131 dres->dr_ip[i] = NULL; 1132 } 1133 1134 for (i = 0; i < dres->dr_bufs; i++) { 1135 xfs_buf_relse(dres->dr_bp[i]); 1136 dres->dr_bp[i] = NULL; 1137 } 1138 1139 dres->dr_inos = 0; 1140 dres->dr_bufs = 0; 1141 dres->dr_ordered = 0; 1142 } 1143 1144 static inline int __init 1145 xfs_defer_init_cache(void) 1146 { 1147 xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", 1148 sizeof(struct xfs_defer_pending), 1149 0, 0, NULL); 1150 1151 return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; 1152 } 1153 1154 static inline void 1155 xfs_defer_destroy_cache(void) 1156 { 1157 kmem_cache_destroy(xfs_defer_pending_cache); 1158 xfs_defer_pending_cache = NULL; 1159 } 1160 1161 /* Set up caches for deferred work items. */ 1162 int __init 1163 xfs_defer_init_item_caches(void) 1164 { 1165 int error; 1166 1167 error = xfs_defer_init_cache(); 1168 if (error) 1169 return error; 1170 error = xfs_rmap_intent_init_cache(); 1171 if (error) 1172 goto err; 1173 error = xfs_refcount_intent_init_cache(); 1174 if (error) 1175 goto err; 1176 error = xfs_bmap_intent_init_cache(); 1177 if (error) 1178 goto err; 1179 error = xfs_extfree_intent_init_cache(); 1180 if (error) 1181 goto err; 1182 error = xfs_attr_intent_init_cache(); 1183 if (error) 1184 goto err; 1185 error = xfs_exchmaps_intent_init_cache(); 1186 if (error) 1187 goto err; 1188 1189 return 0; 1190 err: 1191 xfs_defer_destroy_item_caches(); 1192 return error; 1193 } 1194 1195 /* Destroy all the deferred work item caches, if they've been allocated. 
*/ 1196 void 1197 xfs_defer_destroy_item_caches(void) 1198 { 1199 xfs_exchmaps_intent_destroy_cache(); 1200 xfs_attr_intent_destroy_cache(); 1201 xfs_extfree_intent_destroy_cache(); 1202 xfs_bmap_intent_destroy_cache(); 1203 xfs_refcount_intent_destroy_cache(); 1204 xfs_rmap_intent_destroy_cache(); 1205 xfs_defer_destroy_cache(); 1206 } 1207 1208 /* 1209 * Mark a deferred work item so that it will be requeued indefinitely without 1210 * being finished. Caller must ensure there are no data dependencies on this 1211 * work item in the meantime. 1212 */ 1213 void 1214 xfs_defer_item_pause( 1215 struct xfs_trans *tp, 1216 struct xfs_defer_pending *dfp) 1217 { 1218 ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); 1219 1220 dfp->dfp_flags |= XFS_DEFER_PAUSED; 1221 1222 trace_xfs_defer_item_pause(tp->t_mountp, dfp); 1223 } 1224 1225 /* 1226 * Release a paused deferred work item so that it will be finished during the 1227 * next transaction roll. 1228 */ 1229 void 1230 xfs_defer_item_unpause( 1231 struct xfs_trans *tp, 1232 struct xfs_defer_pending *dfp) 1233 { 1234 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 1235 1236 dfp->dfp_flags &= ~XFS_DEFER_PAUSED; 1237 1238 trace_xfs_defer_item_unpause(tp->t_mountp, dfp); 1239 } 1240