1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_inode.h" 17 #include "xfs_inode_item.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_log.h" 21 #include "xfs_rmap.h" 22 #include "xfs_refcount.h" 23 #include "xfs_bmap.h" 24 #include "xfs_alloc.h" 25 #include "xfs_buf.h" 26 #include "xfs_da_format.h" 27 #include "xfs_da_btree.h" 28 #include "xfs_attr.h" 29 #include "xfs_trans_priv.h" 30 #include "xfs_exchmaps.h" 31 32 static struct kmem_cache *xfs_defer_pending_cache; 33 34 /* 35 * Deferred Operations in XFS 36 * 37 * Due to the way locking rules work in XFS, certain transactions (block 38 * mapping and unmapping, typically) have permanent reservations so that 39 * we can roll the transaction to adhere to AG locking order rules and 40 * to unlock buffers between metadata updates. Prior to rmap/reflink, 41 * the mapping code had a mechanism to perform these deferrals for 42 * extents that were going to be freed; this code makes that facility 43 * more generic. 44 * 45 * When adding the reverse mapping and reflink features, it became 46 * necessary to perform complex remapping multi-transactions to comply 47 * with AG locking order rules, and to be able to spread a single 48 * refcount update operation (an operation on an n-block extent can 49 * update as many as n records!) among multiple transactions. XFS can 50 * roll a transaction to facilitate this, but using this facility 51 * requires us to log "intent" items in case log recovery needs to 52 * redo the operation, and to log "done" items to indicate that redo 53 * is not necessary. 
54 * 55 * Deferred work is tracked in xfs_defer_pending items. Each pending 56 * item tracks one type of deferred work. Incoming work items (which 57 * have not yet had an intent logged) are attached to a pending item 58 * on the dop_intake list, where they wait for the caller to finish 59 * the deferred operations. 60 * 61 * Finishing a set of deferred operations is an involved process. To 62 * start, we define "rolling a deferred-op transaction" as follows: 63 * 64 * > For each xfs_defer_pending item on the dop_intake list, 65 * - Sort the work items in AG order. XFS locking 66 * order rules require us to lock buffers in AG order. 67 * - Create a log intent item for that type. 68 * - Attach it to the pending item. 69 * - Move the pending item from the dop_intake list to the 70 * dop_pending list. 71 * > Roll the transaction. 72 * 73 * NOTE: To avoid exceeding the transaction reservation, we limit the 74 * number of items that we attach to a given xfs_defer_pending. 75 * 76 * The actual finishing process looks like this: 77 * 78 * > For each xfs_defer_pending in the dop_pending list, 79 * - Roll the deferred-op transaction as above. 80 * - Create a log done item for that type, and attach it to the 81 * log intent item. 82 * - For each work item attached to the log intent item, 83 * * Perform the described action. 84 * * Attach the work item to the log done item. 85 * * If the result of doing the work was -EAGAIN, ->finish work 86 * wants a new transaction. See the "Requesting a Fresh 87 * Transaction while Finishing Deferred Work" section below for 88 * details. 89 * 90 * The key here is that we must log an intent item for all pending 91 * work items every time we roll the transaction, and that we must log 92 * a done item as soon as the work is completed. With this mechanism 93 * we can perform complex remapping operations, chaining intent items 94 * as needed. 
 *
 * Requesting a Fresh Transaction while Finishing Deferred Work
 *
 * If ->finish_item decides that it needs a fresh transaction to
 * finish the work, it must ask its caller (xfs_defer_finish) for a
 * continuation.  The most likely cause of this circumstance is the
 * refcount adjust functions deciding that they've logged enough items
 * to be at risk of exceeding the transaction reservation.
 *
 * To get a fresh transaction, we want to log the existing log done
 * item to prevent the log intent item from replaying, immediately log
 * a new log intent item with the unfinished work items, roll the
 * transaction, and re-call ->finish_item wherever it left off.  The
 * log done item and the new log intent item must be in the same
 * transaction or atomicity cannot be guaranteed; defer_finish ensures
 * that this happens.
 *
 * This requires some coordination between ->finish_item and
 * defer_finish.  Upon deciding to request a new transaction,
 * ->finish_item should update the current work item to reflect the
 * unfinished work.  Next, it should reset the log done item's list
 * count to the number of items finished, and return -EAGAIN.
 * defer_finish sees the -EAGAIN, logs the new log intent item
 * with the remaining work items, and leaves the xfs_defer_pending
 * item at the head of the dop_pending list.  Then it rolls the
 * transaction and picks up processing where it left off.
 * ->finish_item must be careful to leave enough transaction
 * reservation to fit the new log intent item.
 *
 * This is an example of remapping the extent (E, E+B) into file X at
 * offset A and dealing with the extent (C, C+B) already being mapped
 * there:
 * +-------------------------------------------------+
 * | Unmap file X startblock C offset A length B     | t0
 * | Intent to reduce refcount for extent (C, B)     |
 * | Intent to remove rmap (X, C, A, B)              |
 * | Intent to free extent (D, 1) (bmbt block)       |
 * | Intent to map (X, A, B) at startblock E         |
 * +-------------------------------------------------+
 * | Map file X startblock E offset A length B       | t1
 * | Done mapping (X, E, A, B)                       |
 * | Intent to increase refcount for extent (E, B)   |
 * | Intent to add rmap (X, E, A, B)                 |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C, B)               | t2
 * | Done reducing refcount for extent (C, 9)        |
 * | Intent to reduce refcount for extent (C+9, B-9) |
 * | (ran out of space after 9 refcount updates)     |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C+9, B-9)           | t3
 * | Done reducing refcount for extent (C+9, B-9)    |
 * | Increase refcount for extent (E, B)             |
 * | Done increasing refcount for extent (E, B)      |
 * | Intent to free extent (C, B)                    |
 * | Intent to free extent (F, 1) (refcountbt block) |
 * | Intent to remove rmap (F, 1, REFC)              |
 * +-------------------------------------------------+
 * | Remove rmap (X, C, A, B)                        | t4
 * | Done removing rmap (X, C, A, B)                 |
 * | Add rmap (X, E, A, B)                           |
 * | Done adding rmap (X, E, A, B)                   |
 * | Remove rmap (F, 1, REFC)                        |
 * | Done removing rmap (F, 1, REFC)                 |
 * +-------------------------------------------------+
 * | Free extent (C, B)                              | t5
 * | Done freeing extent (C, B)                      |
 * | Free extent (D, 1)                              |
 * | Done freeing extent (D, 1)                      |
 * | Free extent (F, 1)                              |
 * | Done freeing extent (F, 1)                      |
 * +-------------------------------------------------+
 *
 * If we should crash before t2 commits, log recovery replays
 * the following intent items:
 *
 * - Intent to reduce refcount for extent (C, B)
 * - Intent to remove rmap (X, C, A, B)
 * - Intent to free extent (D, 1) (bmbt block)
 * - Intent to increase refcount for extent (E, B)
 * - Intent to add rmap (X, E, A, B)
 *
 * In the process of recovering, it should also generate and take care
 * of these intent items:
 *
 * - Intent to free extent (C, B)
 * - Intent to free extent (F, 1) (refcountbt block)
 * - Intent to remove rmap (F, 1, REFC)
 *
 * Note that the continuation requested between t2 and t3 is likely to
 * reoccur.
 */

/*
 * ->create_intent for the barrier op type.  Always returns NULL, i.e. no
 * log intent item is ever produced for a barrier; all arguments are ignored.
 */
STATIC struct xfs_log_item *
xfs_defer_barrier_create_intent(
	struct xfs_trans		*tp,
	struct list_head		*items,
	unsigned int			count,
	bool				sort)
{
	return NULL;
}

/* ->abort_intent for the barrier op type.  Does nothing. */
STATIC void
xfs_defer_barrier_abort_intent(
	struct xfs_log_item		*intent)
{
	/* empty */
}

/*
 * ->create_done for the barrier op type.  Always returns NULL, i.e. no log
 * done item is ever produced for a barrier; all arguments are ignored.
 */
STATIC struct xfs_log_item *
xfs_defer_barrier_create_done(
	struct xfs_trans		*tp,
	struct xfs_log_item		*intent,
	unsigned int			count)
{
	return NULL;
}

/*
 * ->finish_item for the barrier op type.  Trips an assertion and returns
 * -EFSCORRUPTED; it is not expected to be called.
 */
STATIC int
xfs_defer_barrier_finish_item(
	struct xfs_trans		*tp,
	struct xfs_log_item		*done,
	struct list_head		*item,
	struct xfs_btree_cur		**state)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/*
 * ->cancel_item for the barrier op type.  Trips an assertion; it is not
 * expected to be called.
 */
STATIC void
xfs_defer_barrier_cancel_item(
	struct list_head		*item)
{
	ASSERT(0);
}

/* Deferred op type used for barriers; built entirely from the stubs above. */
static const struct xfs_defer_op_type xfs_barrier_defer_type = {
	.max_items	= 1,
	.create_intent	= xfs_defer_barrier_create_intent,
	.abort_intent	= xfs_defer_barrier_abort_intent,
	.create_done	= xfs_defer_barrier_create_done,
	.finish_item	= xfs_defer_barrier_finish_item,
	.cancel_item	= xfs_defer_barrier_cancel_item,
};

/* Create a log intent done item for a log intent
item. */ 240 static inline void 241 xfs_defer_create_done( 242 struct xfs_trans *tp, 243 struct xfs_defer_pending *dfp) 244 { 245 struct xfs_log_item *lip; 246 247 /* If there is no log intent item, there can be no log done item. */ 248 if (!dfp->dfp_intent) 249 return; 250 251 /* 252 * Mark the transaction dirty, even on error. This ensures the 253 * transaction is aborted, which: 254 * 255 * 1.) releases the log intent item and frees the log done item 256 * 2.) shuts down the filesystem 257 */ 258 tp->t_flags |= XFS_TRANS_DIRTY; 259 lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); 260 if (!lip) 261 return; 262 263 tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; 264 xfs_trans_add_item(tp, lip); 265 set_bit(XFS_LI_DIRTY, &lip->li_flags); 266 dfp->dfp_done = lip; 267 } 268 269 /* 270 * Ensure there's a log intent item associated with this deferred work item if 271 * the operation must be restarted on crash. Returns 1 if there's a log item; 272 * 0 if there isn't; or a negative errno. 273 */ 274 static int 275 xfs_defer_create_intent( 276 struct xfs_trans *tp, 277 struct xfs_defer_pending *dfp, 278 bool sort) 279 { 280 struct xfs_log_item *lip; 281 282 if (dfp->dfp_intent) 283 return 1; 284 285 lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, 286 sort); 287 if (!lip) 288 return 0; 289 if (IS_ERR(lip)) 290 return PTR_ERR(lip); 291 292 tp->t_flags |= XFS_TRANS_DIRTY; 293 xfs_trans_add_item(tp, lip); 294 set_bit(XFS_LI_DIRTY, &lip->li_flags); 295 dfp->dfp_intent = lip; 296 return 1; 297 } 298 299 /* 300 * For each pending item in the intake list, log its intent item and the 301 * associated extents, then add the entire intake list to the end of 302 * the pending list. 303 * 304 * Returns 1 if at least one log item was associated with the deferred work; 305 * 0 if there are no log items; or a negative errno. 
306 */ 307 static int 308 xfs_defer_create_intents( 309 struct xfs_trans *tp) 310 { 311 struct xfs_defer_pending *dfp; 312 int ret = 0; 313 314 list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { 315 int ret2; 316 317 trace_xfs_defer_create_intent(tp->t_mountp, dfp); 318 ret2 = xfs_defer_create_intent(tp, dfp, true); 319 if (ret2 < 0) 320 return ret2; 321 ret |= ret2; 322 } 323 return ret; 324 } 325 326 static inline void 327 xfs_defer_pending_abort( 328 struct xfs_mount *mp, 329 struct xfs_defer_pending *dfp) 330 { 331 trace_xfs_defer_pending_abort(mp, dfp); 332 333 if (dfp->dfp_intent && !dfp->dfp_done) { 334 dfp->dfp_ops->abort_intent(dfp->dfp_intent); 335 dfp->dfp_intent = NULL; 336 } 337 } 338 339 static inline void 340 xfs_defer_pending_cancel_work( 341 struct xfs_mount *mp, 342 struct xfs_defer_pending *dfp) 343 { 344 struct list_head *pwi; 345 struct list_head *n; 346 347 trace_xfs_defer_cancel_list(mp, dfp); 348 349 list_del(&dfp->dfp_list); 350 list_for_each_safe(pwi, n, &dfp->dfp_work) { 351 list_del(pwi); 352 dfp->dfp_count--; 353 trace_xfs_defer_cancel_item(mp, dfp, pwi); 354 dfp->dfp_ops->cancel_item(pwi); 355 } 356 ASSERT(dfp->dfp_count == 0); 357 kmem_cache_free(xfs_defer_pending_cache, dfp); 358 } 359 360 STATIC void 361 xfs_defer_pending_abort_list( 362 struct xfs_mount *mp, 363 struct list_head *dop_list) 364 { 365 struct xfs_defer_pending *dfp; 366 367 /* Abort intent items that don't have a done item. */ 368 list_for_each_entry(dfp, dop_list, dfp_list) 369 xfs_defer_pending_abort(mp, dfp); 370 } 371 372 /* Abort all the intents that were committed. */ 373 STATIC void 374 xfs_defer_trans_abort( 375 struct xfs_trans *tp, 376 struct list_head *dop_pending) 377 { 378 trace_xfs_defer_trans_abort(tp, _RET_IP_); 379 xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); 380 } 381 382 /* 383 * Capture resources that the caller said not to release ("held") when the 384 * transaction commits. Caller is responsible for zero-initializing @dres. 
385 */ 386 static int 387 xfs_defer_save_resources( 388 struct xfs_defer_resources *dres, 389 struct xfs_trans *tp) 390 { 391 struct xfs_buf_log_item *bli; 392 struct xfs_inode_log_item *ili; 393 struct xfs_log_item *lip; 394 395 BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); 396 397 list_for_each_entry(lip, &tp->t_items, li_trans) { 398 switch (lip->li_type) { 399 case XFS_LI_BUF: 400 bli = container_of(lip, struct xfs_buf_log_item, 401 bli_item); 402 if (bli->bli_flags & XFS_BLI_HOLD) { 403 if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { 404 ASSERT(0); 405 return -EFSCORRUPTED; 406 } 407 if (bli->bli_flags & XFS_BLI_ORDERED) 408 dres->dr_ordered |= 409 (1U << dres->dr_bufs); 410 else 411 xfs_trans_dirty_buf(tp, bli->bli_buf); 412 dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; 413 } 414 break; 415 case XFS_LI_INODE: 416 ili = container_of(lip, struct xfs_inode_log_item, 417 ili_item); 418 if (ili->ili_lock_flags == 0) { 419 if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { 420 ASSERT(0); 421 return -EFSCORRUPTED; 422 } 423 xfs_trans_log_inode(tp, ili->ili_inode, 424 XFS_ILOG_CORE); 425 dres->dr_ip[dres->dr_inos++] = ili->ili_inode; 426 } 427 break; 428 default: 429 break; 430 } 431 } 432 433 return 0; 434 } 435 436 /* Attach the held resources to the transaction. */ 437 static void 438 xfs_defer_restore_resources( 439 struct xfs_trans *tp, 440 struct xfs_defer_resources *dres) 441 { 442 unsigned short i; 443 444 /* Rejoin the joined inodes. */ 445 for (i = 0; i < dres->dr_inos; i++) 446 xfs_trans_ijoin(tp, dres->dr_ip[i], 0); 447 448 /* Rejoin the buffers and dirty them so the log moves forward. */ 449 for (i = 0; i < dres->dr_bufs; i++) { 450 xfs_trans_bjoin(tp, dres->dr_bp[i]); 451 if (dres->dr_ordered & (1U << i)) 452 xfs_trans_ordered_buf(tp, dres->dr_bp[i]); 453 xfs_trans_bhold(tp, dres->dr_bp[i]); 454 } 455 } 456 457 /* Roll a transaction so we can do some deferred op processing. 
*/ 458 STATIC int 459 xfs_defer_trans_roll( 460 struct xfs_trans **tpp) 461 { 462 struct xfs_defer_resources dres = { }; 463 int error; 464 465 error = xfs_defer_save_resources(&dres, *tpp); 466 if (error) 467 return error; 468 469 trace_xfs_defer_trans_roll(*tpp, _RET_IP_); 470 471 /* 472 * Roll the transaction. Rolling always given a new transaction (even 473 * if committing the old one fails!) to hand back to the caller, so we 474 * join the held resources to the new transaction so that we always 475 * return with the held resources joined to @tpp, no matter what 476 * happened. 477 */ 478 error = xfs_trans_roll(tpp); 479 480 xfs_defer_restore_resources(*tpp, &dres); 481 482 if (error) 483 trace_xfs_defer_trans_roll_error(*tpp, error); 484 return error; 485 } 486 487 /* 488 * Free up any items left in the list. 489 */ 490 static void 491 xfs_defer_cancel_list( 492 struct xfs_mount *mp, 493 struct list_head *dop_list) 494 { 495 struct xfs_defer_pending *dfp; 496 struct xfs_defer_pending *pli; 497 498 /* 499 * Free the pending items. Caller should already have arranged 500 * for the intent items to be released. 501 */ 502 list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) 503 xfs_defer_pending_cancel_work(mp, dfp); 504 } 505 506 static inline void 507 xfs_defer_relog_intent( 508 struct xfs_trans *tp, 509 struct xfs_defer_pending *dfp) 510 { 511 struct xfs_log_item *lip; 512 513 xfs_defer_create_done(tp, dfp); 514 515 lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); 516 if (lip) { 517 xfs_trans_add_item(tp, lip); 518 set_bit(XFS_LI_DIRTY, &lip->li_flags); 519 } 520 dfp->dfp_done = NULL; 521 dfp->dfp_intent = lip; 522 } 523 524 /* 525 * Prevent a log intent item from pinning the tail of the log by logging a 526 * done item to release the intent item; and then log a new intent item. 527 * The caller should provide a fresh transaction and roll it after we're done. 
528 */ 529 static void 530 xfs_defer_relog( 531 struct xfs_trans **tpp, 532 struct list_head *dfops) 533 { 534 struct xlog *log = (*tpp)->t_mountp->m_log; 535 struct xfs_defer_pending *dfp; 536 xfs_lsn_t threshold_lsn = NULLCOMMITLSN; 537 538 539 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 540 541 list_for_each_entry(dfp, dfops, dfp_list) { 542 /* 543 * If the log intent item for this deferred op is not a part of 544 * the current log checkpoint, relog the intent item to keep 545 * the log tail moving forward. We're ok with this being racy 546 * because an incorrect decision means we'll be a little slower 547 * at pushing the tail. 548 */ 549 if (dfp->dfp_intent == NULL || 550 xfs_log_item_in_current_chkpt(dfp->dfp_intent)) 551 continue; 552 553 /* 554 * Figure out where we need the tail to be in order to maintain 555 * the minimum required free space in the log. Only sample 556 * the log threshold once per call. 557 */ 558 if (threshold_lsn == NULLCOMMITLSN) { 559 threshold_lsn = xlog_grant_push_threshold(log, 0); 560 if (threshold_lsn == NULLCOMMITLSN) 561 break; 562 } 563 if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) 564 continue; 565 566 trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); 567 XFS_STATS_INC((*tpp)->t_mountp, defer_relog); 568 569 xfs_defer_relog_intent(*tpp, dfp); 570 } 571 } 572 573 /* 574 * Log an intent-done item for the first pending intent, and finish the work 575 * items. 
576 */ 577 int 578 xfs_defer_finish_one( 579 struct xfs_trans *tp, 580 struct xfs_defer_pending *dfp) 581 { 582 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 583 struct xfs_btree_cur *state = NULL; 584 struct list_head *li, *n; 585 int error; 586 587 trace_xfs_defer_pending_finish(tp->t_mountp, dfp); 588 589 xfs_defer_create_done(tp, dfp); 590 list_for_each_safe(li, n, &dfp->dfp_work) { 591 list_del(li); 592 dfp->dfp_count--; 593 trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); 594 error = ops->finish_item(tp, dfp->dfp_done, li, &state); 595 if (error == -EAGAIN) { 596 int ret; 597 598 /* 599 * Caller wants a fresh transaction; put the work item 600 * back on the list and log a new log intent item to 601 * replace the old one. See "Requesting a Fresh 602 * Transaction while Finishing Deferred Work" above. 603 */ 604 list_add(li, &dfp->dfp_work); 605 dfp->dfp_count++; 606 dfp->dfp_done = NULL; 607 dfp->dfp_intent = NULL; 608 ret = xfs_defer_create_intent(tp, dfp, false); 609 if (ret < 0) 610 error = ret; 611 } 612 613 if (error) 614 goto out; 615 } 616 617 /* Done with the dfp, free it. */ 618 list_del(&dfp->dfp_list); 619 kmem_cache_free(xfs_defer_pending_cache, dfp); 620 out: 621 if (ops->finish_cleanup) 622 ops->finish_cleanup(tp, state, error); 623 return error; 624 } 625 626 /* Move all paused deferred work from @tp to @paused_list. */ 627 static void 628 xfs_defer_isolate_paused( 629 struct xfs_trans *tp, 630 struct list_head *paused_list) 631 { 632 struct xfs_defer_pending *dfp; 633 struct xfs_defer_pending *pli; 634 635 list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { 636 if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) 637 continue; 638 639 list_move_tail(&dfp->dfp_list, paused_list); 640 trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); 641 } 642 } 643 644 /* 645 * Finish all the pending work. 
This involves logging intent items for 646 * any work items that wandered in since the last transaction roll (if 647 * one has even happened), rolling the transaction, and finishing the 648 * work items in the first item on the logged-and-pending list. 649 * 650 * If an inode is provided, relog it to the new transaction. 651 */ 652 int 653 xfs_defer_finish_noroll( 654 struct xfs_trans **tp) 655 { 656 struct xfs_defer_pending *dfp = NULL; 657 int error = 0; 658 LIST_HEAD(dop_pending); 659 LIST_HEAD(dop_paused); 660 661 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 662 663 trace_xfs_defer_finish(*tp, _RET_IP_); 664 665 /* Until we run out of pending work to finish... */ 666 while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { 667 /* 668 * Deferred items that are created in the process of finishing 669 * other deferred work items should be queued at the head of 670 * the pending list, which puts them ahead of the deferred work 671 * that was created by the caller. This keeps the number of 672 * pending work items to a minimum, which decreases the amount 673 * of time that any one intent item can stick around in memory, 674 * pinning the log tail. 675 */ 676 int has_intents = xfs_defer_create_intents(*tp); 677 678 xfs_defer_isolate_paused(*tp, &dop_paused); 679 680 list_splice_init(&(*tp)->t_dfops, &dop_pending); 681 682 if (has_intents < 0) { 683 error = has_intents; 684 goto out_shutdown; 685 } 686 if (has_intents || dfp) { 687 error = xfs_defer_trans_roll(tp); 688 if (error) 689 goto out_shutdown; 690 691 /* Relog intent items to keep the log moving. 
*/ 692 xfs_defer_relog(tp, &dop_pending); 693 xfs_defer_relog(tp, &dop_paused); 694 695 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 696 error = xfs_defer_trans_roll(tp); 697 if (error) 698 goto out_shutdown; 699 } 700 } 701 702 dfp = list_first_entry_or_null(&dop_pending, 703 struct xfs_defer_pending, dfp_list); 704 if (!dfp) 705 break; 706 error = xfs_defer_finish_one(*tp, dfp); 707 if (error && error != -EAGAIN) 708 goto out_shutdown; 709 } 710 711 /* Requeue the paused items in the outgoing transaction. */ 712 list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); 713 714 trace_xfs_defer_finish_done(*tp, _RET_IP_); 715 return 0; 716 717 out_shutdown: 718 list_splice_tail_init(&dop_paused, &dop_pending); 719 xfs_defer_trans_abort(*tp, &dop_pending); 720 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); 721 trace_xfs_defer_finish_error(*tp, error); 722 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); 723 xfs_defer_cancel(*tp); 724 return error; 725 } 726 727 int 728 xfs_defer_finish( 729 struct xfs_trans **tp) 730 { 731 #ifdef DEBUG 732 struct xfs_defer_pending *dfp; 733 #endif 734 int error; 735 736 /* 737 * Finish and roll the transaction once more to avoid returning to the 738 * caller with a dirty transaction. 739 */ 740 error = xfs_defer_finish_noroll(tp); 741 if (error) 742 return error; 743 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 744 error = xfs_defer_trans_roll(tp); 745 if (error) { 746 xfs_force_shutdown((*tp)->t_mountp, 747 SHUTDOWN_CORRUPT_INCORE); 748 return error; 749 } 750 } 751 752 /* Reset LOWMODE now that we've finished all the dfops. 
*/ 753 #ifdef DEBUG 754 list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) 755 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 756 #endif 757 (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; 758 return 0; 759 } 760 761 void 762 xfs_defer_cancel( 763 struct xfs_trans *tp) 764 { 765 struct xfs_mount *mp = tp->t_mountp; 766 767 trace_xfs_defer_cancel(tp, _RET_IP_); 768 xfs_defer_trans_abort(tp, &tp->t_dfops); 769 xfs_defer_cancel_list(mp, &tp->t_dfops); 770 } 771 772 /* 773 * Return the last pending work item attached to this transaction if it matches 774 * the deferred op type. 775 */ 776 static inline struct xfs_defer_pending * 777 xfs_defer_find_last( 778 struct xfs_trans *tp, 779 const struct xfs_defer_op_type *ops) 780 { 781 struct xfs_defer_pending *dfp = NULL; 782 783 /* No dfops at all? */ 784 if (list_empty(&tp->t_dfops)) 785 return NULL; 786 787 dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, 788 dfp_list); 789 790 /* Wrong type? */ 791 if (dfp->dfp_ops != ops) 792 return NULL; 793 return dfp; 794 } 795 796 /* 797 * Decide if we can add a deferred work item to the last dfops item attached 798 * to the transaction. 799 */ 800 static inline bool 801 xfs_defer_can_append( 802 struct xfs_defer_pending *dfp, 803 const struct xfs_defer_op_type *ops) 804 { 805 /* Already logged? */ 806 if (dfp->dfp_intent) 807 return false; 808 809 /* Paused items cannot absorb more work */ 810 if (dfp->dfp_flags & XFS_DEFER_PAUSED) 811 return NULL; 812 813 /* Already full? */ 814 if (ops->max_items && dfp->dfp_count >= ops->max_items) 815 return false; 816 817 return true; 818 } 819 820 /* Create a new pending item at the end of the transaction list. 
*/ 821 static inline struct xfs_defer_pending * 822 xfs_defer_alloc( 823 struct list_head *dfops, 824 const struct xfs_defer_op_type *ops) 825 { 826 struct xfs_defer_pending *dfp; 827 828 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 829 GFP_KERNEL | __GFP_NOFAIL); 830 dfp->dfp_ops = ops; 831 INIT_LIST_HEAD(&dfp->dfp_work); 832 list_add_tail(&dfp->dfp_list, dfops); 833 834 return dfp; 835 } 836 837 /* Add an item for later deferred processing. */ 838 struct xfs_defer_pending * 839 xfs_defer_add( 840 struct xfs_trans *tp, 841 struct list_head *li, 842 const struct xfs_defer_op_type *ops) 843 { 844 struct xfs_defer_pending *dfp = NULL; 845 846 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 847 848 dfp = xfs_defer_find_last(tp, ops); 849 if (!dfp || !xfs_defer_can_append(dfp, ops)) 850 dfp = xfs_defer_alloc(&tp->t_dfops, ops); 851 852 xfs_defer_add_item(dfp, li); 853 trace_xfs_defer_add_item(tp->t_mountp, dfp, li); 854 return dfp; 855 } 856 857 /* 858 * Add a defer ops barrier to force two otherwise adjacent deferred work items 859 * to be tracked separately and have separate log items. 860 */ 861 void 862 xfs_defer_add_barrier( 863 struct xfs_trans *tp) 864 { 865 struct xfs_defer_pending *dfp; 866 867 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 868 869 /* If the last defer op added was a barrier, we're done. */ 870 dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); 871 if (dfp) 872 return; 873 874 xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); 875 876 trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); 877 } 878 879 /* 880 * Create a pending deferred work item to replay the recovered intent item 881 * and add it to the list. 
 *
 * @lip is the recovered log intent item, @r_dfops is the list that receives
 * the new pending item (appended at its tail), and @ops is the deferred op
 * type of the intent.
 */
void
xfs_defer_start_recovery(
	struct xfs_log_item		*lip,
	struct list_head		*r_dfops,
	const struct xfs_defer_op_type	*ops)
{
	struct xfs_defer_pending	*dfp = xfs_defer_alloc(r_dfops, ops);

	/* The pending item starts out already carrying its logged intent. */
	dfp->dfp_intent = lip;
}

/*
 * Cancel a deferred work item created to recover a log intent item.  @dfp
 * will be freed after this function returns.
 */
void
xfs_defer_cancel_recovery(
	struct xfs_mount		*mp,
	struct xfs_defer_pending	*dfp)
{
	/* Abort the intent first (if it has no done item), then free the rest. */
	xfs_defer_pending_abort(mp, dfp);
	xfs_defer_pending_cancel_work(mp, dfp);
}

/*
 * Replay the deferred work item created from a recovered log intent item.
 *
 * Calls the op type's ->recover_work with @dfp and @capture_list and returns
 * its result; a failure is reported through a tracepoint.
 */
int
xfs_defer_finish_recovery(
	struct xfs_mount		*mp,
	struct xfs_defer_pending	*dfp,
	struct list_head		*capture_list)
{
	/* Cache the ops pointer now, since @dfp is off limits after the call. */
	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
	int				error;

	/* dfp is freed by recover_work and must not be accessed afterwards */
	error = ops->recover_work(dfp, capture_list);
	if (error)
		trace_xlog_intent_recovery_failed(mp, ops, error);
	return error;
}

/*
 * Move deferred ops from one transaction to another and reset the source to
 * initial state.  This is primarily used to carry state forward across
 * transaction rolls with pending dfops.
 *
 * @dtp is the destination transaction and @stp is the source.
 */
void
xfs_defer_move(
	struct xfs_trans	*dtp,
	struct xfs_trans	*stp)
{
	/* Leaves the source's dfops list empty and reinitialized. */
	list_splice_init(&stp->t_dfops, &dtp->t_dfops);

	/*
	 * Low free space mode was historically controlled by a dfops field.
	 * This meant that low mode state potentially carried across multiple
	 * transaction rolls.  Transfer low mode on a dfops move to preserve
	 * that behavior.
	 */
	dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
	stp->t_flags &= ~XFS_TRANS_LOWMODE;
}

/*
 * Prepare a chain of fresh deferred ops work items to be completed later.
Log 948 * recovery requires the ability to put off until later the actual finishing 949 * work so that it can process unfinished items recovered from the log in 950 * correct order. 951 * 952 * Create and log intent items for all the work that we're capturing so that we 953 * can be assured that the items will get replayed if the system goes down 954 * before log recovery gets a chance to finish the work it put off. The entire 955 * deferred ops state is transferred to the capture structure and the 956 * transaction is then ready for the caller to commit it. If there are no 957 * intent items to capture, this function returns NULL. 958 * 959 * If capture_ip is not NULL, the capture structure will obtain an extra 960 * reference to the inode. 961 */ 962 static struct xfs_defer_capture * 963 xfs_defer_ops_capture( 964 struct xfs_trans *tp) 965 { 966 struct xfs_defer_capture *dfc; 967 unsigned short i; 968 int error; 969 970 if (list_empty(&tp->t_dfops)) 971 return NULL; 972 973 error = xfs_defer_create_intents(tp); 974 if (error < 0) 975 return ERR_PTR(error); 976 977 /* Create an object to capture the defer ops. */ 978 dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); 979 INIT_LIST_HEAD(&dfc->dfc_list); 980 INIT_LIST_HEAD(&dfc->dfc_dfops); 981 982 /* Move the dfops chain and transaction state to the capture struct. */ 983 list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); 984 dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; 985 tp->t_flags &= ~XFS_TRANS_LOWMODE; 986 987 /* Capture the remaining block reservations along with the dfops. */ 988 dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; 989 dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; 990 991 /* Preserve the log reservation size. */ 992 dfc->dfc_logres = tp->t_log_res; 993 994 error = xfs_defer_save_resources(&dfc->dfc_held, tp); 995 if (error) { 996 /* 997 * Resource capture should never fail, but if it does, we 998 * still have to shut down the log and release things 999 * properly. 
1000 */ 1001 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); 1002 } 1003 1004 /* 1005 * Grab extra references to the inodes and buffers because callers are 1006 * expected to release their held references after we commit the 1007 * transaction. 1008 */ 1009 for (i = 0; i < dfc->dfc_held.dr_inos; i++) { 1010 xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); 1011 ihold(VFS_I(dfc->dfc_held.dr_ip[i])); 1012 } 1013 1014 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1015 xfs_buf_hold(dfc->dfc_held.dr_bp[i]); 1016 1017 return dfc; 1018 } 1019 1020 /* Release all resources that we used to capture deferred ops. */ 1021 void 1022 xfs_defer_ops_capture_abort( 1023 struct xfs_mount *mp, 1024 struct xfs_defer_capture *dfc) 1025 { 1026 unsigned short i; 1027 1028 xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); 1029 xfs_defer_cancel_list(mp, &dfc->dfc_dfops); 1030 1031 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1032 xfs_buf_relse(dfc->dfc_held.dr_bp[i]); 1033 1034 for (i = 0; i < dfc->dfc_held.dr_inos; i++) 1035 xfs_irele(dfc->dfc_held.dr_ip[i]); 1036 1037 kfree(dfc); 1038 } 1039 1040 /* 1041 * Capture any deferred ops and commit the transaction. This is the last step 1042 * needed to finish a log intent item that we recovered from the log. If any 1043 * of the deferred ops operate on an inode, the caller must pass in that inode 1044 * so that the reference can be transferred to the capture structure. The 1045 * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling 1046 * xfs_defer_ops_continue. 1047 */ 1048 int 1049 xfs_defer_ops_capture_and_commit( 1050 struct xfs_trans *tp, 1051 struct list_head *capture_list) 1052 { 1053 struct xfs_mount *mp = tp->t_mountp; 1054 struct xfs_defer_capture *dfc; 1055 int error; 1056 1057 /* If we don't capture anything, commit transaction and exit. 
*/ 1058 dfc = xfs_defer_ops_capture(tp); 1059 if (IS_ERR(dfc)) { 1060 xfs_trans_cancel(tp); 1061 return PTR_ERR(dfc); 1062 } 1063 if (!dfc) 1064 return xfs_trans_commit(tp); 1065 1066 /* Commit the transaction and add the capture structure to the list. */ 1067 error = xfs_trans_commit(tp); 1068 if (error) { 1069 xfs_defer_ops_capture_abort(mp, dfc); 1070 return error; 1071 } 1072 1073 list_add_tail(&dfc->dfc_list, capture_list); 1074 return 0; 1075 } 1076 1077 /* 1078 * Attach a chain of captured deferred ops to a new transaction and free the 1079 * capture structure. If an inode was captured, it will be passed back to the 1080 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. 1081 * The caller now owns the inode reference. 1082 */ 1083 void 1084 xfs_defer_ops_continue( 1085 struct xfs_defer_capture *dfc, 1086 struct xfs_trans *tp, 1087 struct xfs_defer_resources *dres) 1088 { 1089 unsigned int i; 1090 1091 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1092 ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); 1093 1094 /* Lock the captured resources to the new transaction. */ 1095 if (dfc->dfc_held.dr_inos > 2) { 1096 xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos); 1097 xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos, 1098 XFS_ILOCK_EXCL); 1099 } else if (dfc->dfc_held.dr_inos == 2) 1100 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, 1101 dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); 1102 else if (dfc->dfc_held.dr_inos == 1) 1103 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); 1104 1105 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1106 xfs_buf_lock(dfc->dfc_held.dr_bp[i]); 1107 1108 /* Join the captured resources to the new transaction. */ 1109 xfs_defer_restore_resources(tp, &dfc->dfc_held); 1110 memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); 1111 dres->dr_bufs = 0; 1112 1113 /* Move captured dfops chain and state to the transaction. 
*/ 1114 list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); 1115 tp->t_flags |= dfc->dfc_tpflags; 1116 1117 kfree(dfc); 1118 } 1119 1120 /* Release the resources captured and continued during recovery. */ 1121 void 1122 xfs_defer_resources_rele( 1123 struct xfs_defer_resources *dres) 1124 { 1125 unsigned short i; 1126 1127 for (i = 0; i < dres->dr_inos; i++) { 1128 xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); 1129 xfs_irele(dres->dr_ip[i]); 1130 dres->dr_ip[i] = NULL; 1131 } 1132 1133 for (i = 0; i < dres->dr_bufs; i++) { 1134 xfs_buf_relse(dres->dr_bp[i]); 1135 dres->dr_bp[i] = NULL; 1136 } 1137 1138 dres->dr_inos = 0; 1139 dres->dr_bufs = 0; 1140 dres->dr_ordered = 0; 1141 } 1142 1143 static inline int __init 1144 xfs_defer_init_cache(void) 1145 { 1146 xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", 1147 sizeof(struct xfs_defer_pending), 1148 0, 0, NULL); 1149 1150 return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; 1151 } 1152 1153 static inline void 1154 xfs_defer_destroy_cache(void) 1155 { 1156 kmem_cache_destroy(xfs_defer_pending_cache); 1157 xfs_defer_pending_cache = NULL; 1158 } 1159 1160 /* Set up caches for deferred work items. */ 1161 int __init 1162 xfs_defer_init_item_caches(void) 1163 { 1164 int error; 1165 1166 error = xfs_defer_init_cache(); 1167 if (error) 1168 return error; 1169 error = xfs_rmap_intent_init_cache(); 1170 if (error) 1171 goto err; 1172 error = xfs_refcount_intent_init_cache(); 1173 if (error) 1174 goto err; 1175 error = xfs_bmap_intent_init_cache(); 1176 if (error) 1177 goto err; 1178 error = xfs_extfree_intent_init_cache(); 1179 if (error) 1180 goto err; 1181 error = xfs_attr_intent_init_cache(); 1182 if (error) 1183 goto err; 1184 error = xfs_exchmaps_intent_init_cache(); 1185 if (error) 1186 goto err; 1187 1188 return 0; 1189 err: 1190 xfs_defer_destroy_item_caches(); 1191 return error; 1192 } 1193 1194 /* Destroy all the deferred work item caches, if they've been allocated. 
*/ 1195 void 1196 xfs_defer_destroy_item_caches(void) 1197 { 1198 xfs_exchmaps_intent_destroy_cache(); 1199 xfs_attr_intent_destroy_cache(); 1200 xfs_extfree_intent_destroy_cache(); 1201 xfs_bmap_intent_destroy_cache(); 1202 xfs_refcount_intent_destroy_cache(); 1203 xfs_rmap_intent_destroy_cache(); 1204 xfs_defer_destroy_cache(); 1205 } 1206 1207 /* 1208 * Mark a deferred work item so that it will be requeued indefinitely without 1209 * being finished. Caller must ensure there are no data dependencies on this 1210 * work item in the meantime. 1211 */ 1212 void 1213 xfs_defer_item_pause( 1214 struct xfs_trans *tp, 1215 struct xfs_defer_pending *dfp) 1216 { 1217 ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); 1218 1219 dfp->dfp_flags |= XFS_DEFER_PAUSED; 1220 1221 trace_xfs_defer_item_pause(tp->t_mountp, dfp); 1222 } 1223 1224 /* 1225 * Release a paused deferred work item so that it will be finished during the 1226 * next transaction roll. 1227 */ 1228 void 1229 xfs_defer_item_unpause( 1230 struct xfs_trans *tp, 1231 struct xfs_defer_pending *dfp) 1232 { 1233 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 1234 1235 dfp->dfp_flags &= ~XFS_DEFER_PAUSED; 1236 1237 trace_xfs_defer_item_unpause(tp->t_mountp, dfp); 1238 } 1239