1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_inode.h" 17 #include "xfs_inode_item.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_log.h" 21 #include "xfs_rmap.h" 22 #include "xfs_refcount.h" 23 #include "xfs_bmap.h" 24 #include "xfs_alloc.h" 25 #include "xfs_buf.h" 26 #include "xfs_da_format.h" 27 #include "xfs_da_btree.h" 28 #include "xfs_attr.h" 29 #include "xfs_trans_priv.h" 30 31 static struct kmem_cache *xfs_defer_pending_cache; 32 33 /* 34 * Deferred Operations in XFS 35 * 36 * Due to the way locking rules work in XFS, certain transactions (block 37 * mapping and unmapping, typically) have permanent reservations so that 38 * we can roll the transaction to adhere to AG locking order rules and 39 * to unlock buffers between metadata updates. Prior to rmap/reflink, 40 * the mapping code had a mechanism to perform these deferrals for 41 * extents that were going to be freed; this code makes that facility 42 * more generic. 43 * 44 * When adding the reverse mapping and reflink features, it became 45 * necessary to perform complex remapping multi-transactions to comply 46 * with AG locking order rules, and to be able to spread a single 47 * refcount update operation (an operation on an n-block extent can 48 * update as many as n records!) among multiple transactions. XFS can 49 * roll a transaction to facilitate this, but using this facility 50 * requires us to log "intent" items in case log recovery needs to 51 * redo the operation, and to log "done" items to indicate that redo 52 * is not necessary. 
53 * 54 * Deferred work is tracked in xfs_defer_pending items. Each pending 55 * item tracks one type of deferred work. Incoming work items (which 56 * have not yet had an intent logged) are attached to a pending item 57 * on the dop_intake list, where they wait for the caller to finish 58 * the deferred operations. 59 * 60 * Finishing a set of deferred operations is an involved process. To 61 * start, we define "rolling a deferred-op transaction" as follows: 62 * 63 * > For each xfs_defer_pending item on the dop_intake list, 64 * - Sort the work items in AG order. XFS locking 65 * order rules require us to lock buffers in AG order. 66 * - Create a log intent item for that type. 67 * - Attach it to the pending item. 68 * - Move the pending item from the dop_intake list to the 69 * dop_pending list. 70 * > Roll the transaction. 71 * 72 * NOTE: To avoid exceeding the transaction reservation, we limit the 73 * number of items that we attach to a given xfs_defer_pending. 74 * 75 * The actual finishing process looks like this: 76 * 77 * > For each xfs_defer_pending in the dop_pending list, 78 * - Roll the deferred-op transaction as above. 79 * - Create a log done item for that type, and attach it to the 80 * log intent item. 81 * - For each work item attached to the log intent item, 82 * * Perform the described action. 83 * * Attach the work item to the log done item. 84 * * If the result of doing the work was -EAGAIN, ->finish work 85 * wants a new transaction. See the "Requesting a Fresh 86 * Transaction while Finishing Deferred Work" section below for 87 * details. 88 * 89 * The key here is that we must log an intent item for all pending 90 * work items every time we roll the transaction, and that we must log 91 * a done item as soon as the work is completed. With this mechanism 92 * we can perform complex remapping operations, chaining intent items 93 * as needed. 
94 * 95 * Requesting a Fresh Transaction while Finishing Deferred Work 96 * 97 * If ->finish_item decides that it needs a fresh transaction to 98 * finish the work, it must ask its caller (xfs_defer_finish) for a 99 * continuation. The most likely cause of this circumstance is the 100 * refcount adjust functions deciding that they've logged enough items 101 * to be at risk of exceeding the transaction reservation. 102 * 103 * To get a fresh transaction, we want to log the existing log done 104 * item to prevent the log intent item from replaying, immediately log 105 * a new log intent item with the unfinished work items, roll the 106 * transaction, and re-call ->finish_item wherever it left off. The 107 * log done item and the new log intent item must be in the same 108 * transaction or atomicity cannot be guaranteed; defer_finish ensures 109 * that this happens. 110 * 111 * This requires some coordination between ->finish_item and 112 * defer_finish. Upon deciding to request a new transaction, 113 * ->finish_item should update the current work item to reflect the 114 * unfinished work. Next, it should reset the log done item's list 115 * count to the number of items finished, and return -EAGAIN. 116 * defer_finish sees the -EAGAIN, logs the new log intent item 117 * with the remaining work items, and leaves the xfs_defer_pending 118 * item at the head of the dop_pending list. Then it rolls the 119 * transaction and picks up processing where it left off. It is 120 * required that ->finish_item must be careful to leave enough 121 * transaction reservation to fit the new log intent item. 
122 * 123 * This is an example of remapping the extent (E, E+B) into file X at 124 * offset A and dealing with the extent (C, C+B) already being mapped 125 * there: 126 * +-------------------------------------------------+ 127 * | Unmap file X startblock C offset A length B | t0 128 * | Intent to reduce refcount for extent (C, B) | 129 * | Intent to remove rmap (X, C, A, B) | 130 * | Intent to free extent (D, 1) (bmbt block) | 131 * | Intent to map (X, A, B) at startblock E | 132 * +-------------------------------------------------+ 133 * | Map file X startblock E offset A length B | t1 134 * | Done mapping (X, E, A, B) | 135 * | Intent to increase refcount for extent (E, B) | 136 * | Intent to add rmap (X, E, A, B) | 137 * +-------------------------------------------------+ 138 * | Reduce refcount for extent (C, B) | t2 139 * | Done reducing refcount for extent (C, 9) | 140 * | Intent to reduce refcount for extent (C+9, B-9) | 141 * | (ran out of space after 9 refcount updates) | 142 * +-------------------------------------------------+ 143 * | Reduce refcount for extent (C+9, B-9) | t3 144 * | Done reducing refcount for extent (C+9, B-9) | 145 * | Increase refcount for extent (E, B) | 146 * | Done increasing refcount for extent (E, B) | 147 * | Intent to free extent (C, B) | 148 * | Intent to free extent (F, 1) (refcountbt block) | 149 * | Intent to remove rmap (F, 1, REFC) | 150 * +-------------------------------------------------+ 151 * | Remove rmap (X, C, A, B) | t4 152 * | Done removing rmap (X, C, A, B) | 153 * | Add rmap (X, E, A, B) | 154 * | Done adding rmap (X, E, A, B) | 155 * | Remove rmap (F, 1, REFC) | 156 * | Done removing rmap (F, 1, REFC) | 157 * +-------------------------------------------------+ 158 * | Free extent (C, B) | t5 159 * | Done freeing extent (C, B) | 160 * | Free extent (D, 1) | 161 * | Done freeing extent (D, 1) | 162 * | Free extent (F, 1) | 163 * | Done freeing extent (F, 1) | 164 * 
+-------------------------------------------------+ 165 * 166 * If we should crash before t2 commits, log recovery replays 167 * the following intent items: 168 * 169 * - Intent to reduce refcount for extent (C, B) 170 * - Intent to remove rmap (X, C, A, B) 171 * - Intent to free extent (D, 1) (bmbt block) 172 * - Intent to increase refcount for extent (E, B) 173 * - Intent to add rmap (X, E, A, B) 174 * 175 * In the process of recovering, it should also generate and take care 176 * of these intent items: 177 * 178 * - Intent to free extent (C, B) 179 * - Intent to free extent (F, 1) (refcountbt block) 180 * - Intent to remove rmap (F, 1, REFC) 181 * 182 * Note that the continuation requested between t2 and t3 is likely to 183 * reoccur. 184 */ 185 STATIC struct xfs_log_item * 186 xfs_defer_barrier_create_intent( 187 struct xfs_trans *tp, 188 struct list_head *items, 189 unsigned int count, 190 bool sort) 191 { 192 return NULL; 193 } 194 195 STATIC void 196 xfs_defer_barrier_abort_intent( 197 struct xfs_log_item *intent) 198 { 199 /* empty */ 200 } 201 202 STATIC struct xfs_log_item * 203 xfs_defer_barrier_create_done( 204 struct xfs_trans *tp, 205 struct xfs_log_item *intent, 206 unsigned int count) 207 { 208 return NULL; 209 } 210 211 STATIC int 212 xfs_defer_barrier_finish_item( 213 struct xfs_trans *tp, 214 struct xfs_log_item *done, 215 struct list_head *item, 216 struct xfs_btree_cur **state) 217 { 218 ASSERT(0); 219 return -EFSCORRUPTED; 220 } 221 222 STATIC void 223 xfs_defer_barrier_cancel_item( 224 struct list_head *item) 225 { 226 ASSERT(0); 227 } 228 229 static const struct xfs_defer_op_type xfs_barrier_defer_type = { 230 .max_items = 1, 231 .create_intent = xfs_defer_barrier_create_intent, 232 .abort_intent = xfs_defer_barrier_abort_intent, 233 .create_done = xfs_defer_barrier_create_done, 234 .finish_item = xfs_defer_barrier_finish_item, 235 .cancel_item = xfs_defer_barrier_cancel_item, 236 }; 237 238 /* Create a log intent done item for a log intent 
item. */ 239 static inline void 240 xfs_defer_create_done( 241 struct xfs_trans *tp, 242 struct xfs_defer_pending *dfp) 243 { 244 struct xfs_log_item *lip; 245 246 /* If there is no log intent item, there can be no log done item. */ 247 if (!dfp->dfp_intent) 248 return; 249 250 /* 251 * Mark the transaction dirty, even on error. This ensures the 252 * transaction is aborted, which: 253 * 254 * 1.) releases the log intent item and frees the log done item 255 * 2.) shuts down the filesystem 256 */ 257 tp->t_flags |= XFS_TRANS_DIRTY; 258 lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); 259 if (!lip) 260 return; 261 262 tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; 263 xfs_trans_add_item(tp, lip); 264 set_bit(XFS_LI_DIRTY, &lip->li_flags); 265 dfp->dfp_done = lip; 266 } 267 268 /* 269 * Ensure there's a log intent item associated with this deferred work item if 270 * the operation must be restarted on crash. Returns 1 if there's a log item; 271 * 0 if there isn't; or a negative errno. 272 */ 273 static int 274 xfs_defer_create_intent( 275 struct xfs_trans *tp, 276 struct xfs_defer_pending *dfp, 277 bool sort) 278 { 279 struct xfs_log_item *lip; 280 281 if (dfp->dfp_intent) 282 return 1; 283 284 lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, 285 sort); 286 if (!lip) 287 return 0; 288 if (IS_ERR(lip)) 289 return PTR_ERR(lip); 290 291 tp->t_flags |= XFS_TRANS_DIRTY; 292 xfs_trans_add_item(tp, lip); 293 set_bit(XFS_LI_DIRTY, &lip->li_flags); 294 dfp->dfp_intent = lip; 295 return 1; 296 } 297 298 /* 299 * For each pending item in the intake list, log its intent item and the 300 * associated extents, then add the entire intake list to the end of 301 * the pending list. 302 * 303 * Returns 1 if at least one log item was associated with the deferred work; 304 * 0 if there are no log items; or a negative errno. 
305 */ 306 static int 307 xfs_defer_create_intents( 308 struct xfs_trans *tp) 309 { 310 struct xfs_defer_pending *dfp; 311 int ret = 0; 312 313 list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { 314 int ret2; 315 316 trace_xfs_defer_create_intent(tp->t_mountp, dfp); 317 ret2 = xfs_defer_create_intent(tp, dfp, true); 318 if (ret2 < 0) 319 return ret2; 320 ret |= ret2; 321 } 322 return ret; 323 } 324 325 static inline void 326 xfs_defer_pending_abort( 327 struct xfs_mount *mp, 328 struct xfs_defer_pending *dfp) 329 { 330 trace_xfs_defer_pending_abort(mp, dfp); 331 332 if (dfp->dfp_intent && !dfp->dfp_done) { 333 dfp->dfp_ops->abort_intent(dfp->dfp_intent); 334 dfp->dfp_intent = NULL; 335 } 336 } 337 338 static inline void 339 xfs_defer_pending_cancel_work( 340 struct xfs_mount *mp, 341 struct xfs_defer_pending *dfp) 342 { 343 struct list_head *pwi; 344 struct list_head *n; 345 346 trace_xfs_defer_cancel_list(mp, dfp); 347 348 list_del(&dfp->dfp_list); 349 list_for_each_safe(pwi, n, &dfp->dfp_work) { 350 list_del(pwi); 351 dfp->dfp_count--; 352 trace_xfs_defer_cancel_item(mp, dfp, pwi); 353 dfp->dfp_ops->cancel_item(pwi); 354 } 355 ASSERT(dfp->dfp_count == 0); 356 kmem_cache_free(xfs_defer_pending_cache, dfp); 357 } 358 359 STATIC void 360 xfs_defer_pending_abort_list( 361 struct xfs_mount *mp, 362 struct list_head *dop_list) 363 { 364 struct xfs_defer_pending *dfp; 365 366 /* Abort intent items that don't have a done item. */ 367 list_for_each_entry(dfp, dop_list, dfp_list) 368 xfs_defer_pending_abort(mp, dfp); 369 } 370 371 /* Abort all the intents that were committed. */ 372 STATIC void 373 xfs_defer_trans_abort( 374 struct xfs_trans *tp, 375 struct list_head *dop_pending) 376 { 377 trace_xfs_defer_trans_abort(tp, _RET_IP_); 378 xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); 379 } 380 381 /* 382 * Capture resources that the caller said not to release ("held") when the 383 * transaction commits. Caller is responsible for zero-initializing @dres. 
384 */ 385 static int 386 xfs_defer_save_resources( 387 struct xfs_defer_resources *dres, 388 struct xfs_trans *tp) 389 { 390 struct xfs_buf_log_item *bli; 391 struct xfs_inode_log_item *ili; 392 struct xfs_log_item *lip; 393 394 BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); 395 396 list_for_each_entry(lip, &tp->t_items, li_trans) { 397 switch (lip->li_type) { 398 case XFS_LI_BUF: 399 bli = container_of(lip, struct xfs_buf_log_item, 400 bli_item); 401 if (bli->bli_flags & XFS_BLI_HOLD) { 402 if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { 403 ASSERT(0); 404 return -EFSCORRUPTED; 405 } 406 if (bli->bli_flags & XFS_BLI_ORDERED) 407 dres->dr_ordered |= 408 (1U << dres->dr_bufs); 409 else 410 xfs_trans_dirty_buf(tp, bli->bli_buf); 411 dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; 412 } 413 break; 414 case XFS_LI_INODE: 415 ili = container_of(lip, struct xfs_inode_log_item, 416 ili_item); 417 if (ili->ili_lock_flags == 0) { 418 if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { 419 ASSERT(0); 420 return -EFSCORRUPTED; 421 } 422 xfs_trans_log_inode(tp, ili->ili_inode, 423 XFS_ILOG_CORE); 424 dres->dr_ip[dres->dr_inos++] = ili->ili_inode; 425 } 426 break; 427 default: 428 break; 429 } 430 } 431 432 return 0; 433 } 434 435 /* Attach the held resources to the transaction. */ 436 static void 437 xfs_defer_restore_resources( 438 struct xfs_trans *tp, 439 struct xfs_defer_resources *dres) 440 { 441 unsigned short i; 442 443 /* Rejoin the joined inodes. */ 444 for (i = 0; i < dres->dr_inos; i++) 445 xfs_trans_ijoin(tp, dres->dr_ip[i], 0); 446 447 /* Rejoin the buffers and dirty them so the log moves forward. */ 448 for (i = 0; i < dres->dr_bufs; i++) { 449 xfs_trans_bjoin(tp, dres->dr_bp[i]); 450 if (dres->dr_ordered & (1U << i)) 451 xfs_trans_ordered_buf(tp, dres->dr_bp[i]); 452 xfs_trans_bhold(tp, dres->dr_bp[i]); 453 } 454 } 455 456 /* Roll a transaction so we can do some deferred op processing. 
*/ 457 STATIC int 458 xfs_defer_trans_roll( 459 struct xfs_trans **tpp) 460 { 461 struct xfs_defer_resources dres = { }; 462 int error; 463 464 error = xfs_defer_save_resources(&dres, *tpp); 465 if (error) 466 return error; 467 468 trace_xfs_defer_trans_roll(*tpp, _RET_IP_); 469 470 /* 471 * Roll the transaction. Rolling always given a new transaction (even 472 * if committing the old one fails!) to hand back to the caller, so we 473 * join the held resources to the new transaction so that we always 474 * return with the held resources joined to @tpp, no matter what 475 * happened. 476 */ 477 error = xfs_trans_roll(tpp); 478 479 xfs_defer_restore_resources(*tpp, &dres); 480 481 if (error) 482 trace_xfs_defer_trans_roll_error(*tpp, error); 483 return error; 484 } 485 486 /* 487 * Free up any items left in the list. 488 */ 489 static void 490 xfs_defer_cancel_list( 491 struct xfs_mount *mp, 492 struct list_head *dop_list) 493 { 494 struct xfs_defer_pending *dfp; 495 struct xfs_defer_pending *pli; 496 497 /* 498 * Free the pending items. Caller should already have arranged 499 * for the intent items to be released. 500 */ 501 list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) 502 xfs_defer_pending_cancel_work(mp, dfp); 503 } 504 505 static inline void 506 xfs_defer_relog_intent( 507 struct xfs_trans *tp, 508 struct xfs_defer_pending *dfp) 509 { 510 struct xfs_log_item *lip; 511 512 xfs_defer_create_done(tp, dfp); 513 514 lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); 515 if (lip) { 516 xfs_trans_add_item(tp, lip); 517 set_bit(XFS_LI_DIRTY, &lip->li_flags); 518 } 519 dfp->dfp_done = NULL; 520 dfp->dfp_intent = lip; 521 } 522 523 /* 524 * Prevent a log intent item from pinning the tail of the log by logging a 525 * done item to release the intent item; and then log a new intent item. 526 * The caller should provide a fresh transaction and roll it after we're done. 
527 */ 528 static void 529 xfs_defer_relog( 530 struct xfs_trans **tpp, 531 struct list_head *dfops) 532 { 533 struct xlog *log = (*tpp)->t_mountp->m_log; 534 struct xfs_defer_pending *dfp; 535 xfs_lsn_t threshold_lsn = NULLCOMMITLSN; 536 537 538 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 539 540 list_for_each_entry(dfp, dfops, dfp_list) { 541 /* 542 * If the log intent item for this deferred op is not a part of 543 * the current log checkpoint, relog the intent item to keep 544 * the log tail moving forward. We're ok with this being racy 545 * because an incorrect decision means we'll be a little slower 546 * at pushing the tail. 547 */ 548 if (dfp->dfp_intent == NULL || 549 xfs_log_item_in_current_chkpt(dfp->dfp_intent)) 550 continue; 551 552 /* 553 * Figure out where we need the tail to be in order to maintain 554 * the minimum required free space in the log. Only sample 555 * the log threshold once per call. 556 */ 557 if (threshold_lsn == NULLCOMMITLSN) { 558 threshold_lsn = xlog_grant_push_threshold(log, 0); 559 if (threshold_lsn == NULLCOMMITLSN) 560 break; 561 } 562 if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) 563 continue; 564 565 trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); 566 XFS_STATS_INC((*tpp)->t_mountp, defer_relog); 567 568 xfs_defer_relog_intent(*tpp, dfp); 569 } 570 } 571 572 /* 573 * Log an intent-done item for the first pending intent, and finish the work 574 * items. 
575 */ 576 int 577 xfs_defer_finish_one( 578 struct xfs_trans *tp, 579 struct xfs_defer_pending *dfp) 580 { 581 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 582 struct xfs_btree_cur *state = NULL; 583 struct list_head *li, *n; 584 int error; 585 586 trace_xfs_defer_pending_finish(tp->t_mountp, dfp); 587 588 xfs_defer_create_done(tp, dfp); 589 list_for_each_safe(li, n, &dfp->dfp_work) { 590 list_del(li); 591 dfp->dfp_count--; 592 trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); 593 error = ops->finish_item(tp, dfp->dfp_done, li, &state); 594 if (error == -EAGAIN) { 595 int ret; 596 597 /* 598 * Caller wants a fresh transaction; put the work item 599 * back on the list and log a new log intent item to 600 * replace the old one. See "Requesting a Fresh 601 * Transaction while Finishing Deferred Work" above. 602 */ 603 list_add(li, &dfp->dfp_work); 604 dfp->dfp_count++; 605 dfp->dfp_done = NULL; 606 dfp->dfp_intent = NULL; 607 ret = xfs_defer_create_intent(tp, dfp, false); 608 if (ret < 0) 609 error = ret; 610 } 611 612 if (error) 613 goto out; 614 } 615 616 /* Done with the dfp, free it. */ 617 list_del(&dfp->dfp_list); 618 kmem_cache_free(xfs_defer_pending_cache, dfp); 619 out: 620 if (ops->finish_cleanup) 621 ops->finish_cleanup(tp, state, error); 622 return error; 623 } 624 625 /* Move all paused deferred work from @tp to @paused_list. */ 626 static void 627 xfs_defer_isolate_paused( 628 struct xfs_trans *tp, 629 struct list_head *paused_list) 630 { 631 struct xfs_defer_pending *dfp; 632 struct xfs_defer_pending *pli; 633 634 list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { 635 if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) 636 continue; 637 638 list_move_tail(&dfp->dfp_list, paused_list); 639 trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); 640 } 641 } 642 643 /* 644 * Finish all the pending work. 
This involves logging intent items for 645 * any work items that wandered in since the last transaction roll (if 646 * one has even happened), rolling the transaction, and finishing the 647 * work items in the first item on the logged-and-pending list. 648 * 649 * If an inode is provided, relog it to the new transaction. 650 */ 651 int 652 xfs_defer_finish_noroll( 653 struct xfs_trans **tp) 654 { 655 struct xfs_defer_pending *dfp = NULL; 656 int error = 0; 657 LIST_HEAD(dop_pending); 658 LIST_HEAD(dop_paused); 659 660 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 661 662 trace_xfs_defer_finish(*tp, _RET_IP_); 663 664 /* Until we run out of pending work to finish... */ 665 while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { 666 /* 667 * Deferred items that are created in the process of finishing 668 * other deferred work items should be queued at the head of 669 * the pending list, which puts them ahead of the deferred work 670 * that was created by the caller. This keeps the number of 671 * pending work items to a minimum, which decreases the amount 672 * of time that any one intent item can stick around in memory, 673 * pinning the log tail. 674 */ 675 int has_intents = xfs_defer_create_intents(*tp); 676 677 xfs_defer_isolate_paused(*tp, &dop_paused); 678 679 list_splice_init(&(*tp)->t_dfops, &dop_pending); 680 681 if (has_intents < 0) { 682 error = has_intents; 683 goto out_shutdown; 684 } 685 if (has_intents || dfp) { 686 error = xfs_defer_trans_roll(tp); 687 if (error) 688 goto out_shutdown; 689 690 /* Relog intent items to keep the log moving. 
*/ 691 xfs_defer_relog(tp, &dop_pending); 692 xfs_defer_relog(tp, &dop_paused); 693 694 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 695 error = xfs_defer_trans_roll(tp); 696 if (error) 697 goto out_shutdown; 698 } 699 } 700 701 dfp = list_first_entry_or_null(&dop_pending, 702 struct xfs_defer_pending, dfp_list); 703 if (!dfp) 704 break; 705 error = xfs_defer_finish_one(*tp, dfp); 706 if (error && error != -EAGAIN) 707 goto out_shutdown; 708 } 709 710 /* Requeue the paused items in the outgoing transaction. */ 711 list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); 712 713 trace_xfs_defer_finish_done(*tp, _RET_IP_); 714 return 0; 715 716 out_shutdown: 717 list_splice_tail_init(&dop_paused, &dop_pending); 718 xfs_defer_trans_abort(*tp, &dop_pending); 719 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); 720 trace_xfs_defer_finish_error(*tp, error); 721 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); 722 xfs_defer_cancel(*tp); 723 return error; 724 } 725 726 int 727 xfs_defer_finish( 728 struct xfs_trans **tp) 729 { 730 #ifdef DEBUG 731 struct xfs_defer_pending *dfp; 732 #endif 733 int error; 734 735 /* 736 * Finish and roll the transaction once more to avoid returning to the 737 * caller with a dirty transaction. 738 */ 739 error = xfs_defer_finish_noroll(tp); 740 if (error) 741 return error; 742 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 743 error = xfs_defer_trans_roll(tp); 744 if (error) { 745 xfs_force_shutdown((*tp)->t_mountp, 746 SHUTDOWN_CORRUPT_INCORE); 747 return error; 748 } 749 } 750 751 /* Reset LOWMODE now that we've finished all the dfops. 
*/ 752 #ifdef DEBUG 753 list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) 754 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 755 #endif 756 (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; 757 return 0; 758 } 759 760 void 761 xfs_defer_cancel( 762 struct xfs_trans *tp) 763 { 764 struct xfs_mount *mp = tp->t_mountp; 765 766 trace_xfs_defer_cancel(tp, _RET_IP_); 767 xfs_defer_trans_abort(tp, &tp->t_dfops); 768 xfs_defer_cancel_list(mp, &tp->t_dfops); 769 } 770 771 /* 772 * Return the last pending work item attached to this transaction if it matches 773 * the deferred op type. 774 */ 775 static inline struct xfs_defer_pending * 776 xfs_defer_find_last( 777 struct xfs_trans *tp, 778 const struct xfs_defer_op_type *ops) 779 { 780 struct xfs_defer_pending *dfp = NULL; 781 782 /* No dfops at all? */ 783 if (list_empty(&tp->t_dfops)) 784 return NULL; 785 786 dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, 787 dfp_list); 788 789 /* Wrong type? */ 790 if (dfp->dfp_ops != ops) 791 return NULL; 792 return dfp; 793 } 794 795 /* 796 * Decide if we can add a deferred work item to the last dfops item attached 797 * to the transaction. 798 */ 799 static inline bool 800 xfs_defer_can_append( 801 struct xfs_defer_pending *dfp, 802 const struct xfs_defer_op_type *ops) 803 { 804 /* Already logged? */ 805 if (dfp->dfp_intent) 806 return false; 807 808 /* Paused items cannot absorb more work */ 809 if (dfp->dfp_flags & XFS_DEFER_PAUSED) 810 return NULL; 811 812 /* Already full? */ 813 if (ops->max_items && dfp->dfp_count >= ops->max_items) 814 return false; 815 816 return true; 817 } 818 819 /* Create a new pending item at the end of the transaction list. 
*/ 820 static inline struct xfs_defer_pending * 821 xfs_defer_alloc( 822 struct list_head *dfops, 823 const struct xfs_defer_op_type *ops) 824 { 825 struct xfs_defer_pending *dfp; 826 827 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 828 GFP_KERNEL | __GFP_NOFAIL); 829 dfp->dfp_ops = ops; 830 INIT_LIST_HEAD(&dfp->dfp_work); 831 list_add_tail(&dfp->dfp_list, dfops); 832 833 return dfp; 834 } 835 836 /* Add an item for later deferred processing. */ 837 struct xfs_defer_pending * 838 xfs_defer_add( 839 struct xfs_trans *tp, 840 struct list_head *li, 841 const struct xfs_defer_op_type *ops) 842 { 843 struct xfs_defer_pending *dfp = NULL; 844 845 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 846 847 dfp = xfs_defer_find_last(tp, ops); 848 if (!dfp || !xfs_defer_can_append(dfp, ops)) 849 dfp = xfs_defer_alloc(&tp->t_dfops, ops); 850 851 xfs_defer_add_item(dfp, li); 852 trace_xfs_defer_add_item(tp->t_mountp, dfp, li); 853 return dfp; 854 } 855 856 /* 857 * Add a defer ops barrier to force two otherwise adjacent deferred work items 858 * to be tracked separately and have separate log items. 859 */ 860 void 861 xfs_defer_add_barrier( 862 struct xfs_trans *tp) 863 { 864 struct xfs_defer_pending *dfp; 865 866 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 867 868 /* If the last defer op added was a barrier, we're done. */ 869 dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); 870 if (dfp) 871 return; 872 873 xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); 874 875 trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); 876 } 877 878 /* 879 * Create a pending deferred work item to replay the recovered intent item 880 * and add it to the list. 
881 */ 882 void 883 xfs_defer_start_recovery( 884 struct xfs_log_item *lip, 885 struct list_head *r_dfops, 886 const struct xfs_defer_op_type *ops) 887 { 888 struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); 889 890 dfp->dfp_intent = lip; 891 } 892 893 /* 894 * Cancel a deferred work item created to recover a log intent item. @dfp 895 * will be freed after this function returns. 896 */ 897 void 898 xfs_defer_cancel_recovery( 899 struct xfs_mount *mp, 900 struct xfs_defer_pending *dfp) 901 { 902 xfs_defer_pending_abort(mp, dfp); 903 xfs_defer_pending_cancel_work(mp, dfp); 904 } 905 906 /* Replay the deferred work item created from a recovered log intent item. */ 907 int 908 xfs_defer_finish_recovery( 909 struct xfs_mount *mp, 910 struct xfs_defer_pending *dfp, 911 struct list_head *capture_list) 912 { 913 const struct xfs_defer_op_type *ops = dfp->dfp_ops; 914 int error; 915 916 /* dfp is freed by recover_work and must not be accessed afterwards */ 917 error = ops->recover_work(dfp, capture_list); 918 if (error) 919 trace_xlog_intent_recovery_failed(mp, ops, error); 920 return error; 921 } 922 923 /* 924 * Move deferred ops from one transaction to another and reset the source to 925 * initial state. This is primarily used to carry state forward across 926 * transaction rolls with pending dfops. 927 */ 928 void 929 xfs_defer_move( 930 struct xfs_trans *dtp, 931 struct xfs_trans *stp) 932 { 933 list_splice_init(&stp->t_dfops, &dtp->t_dfops); 934 935 /* 936 * Low free space mode was historically controlled by a dfops field. 937 * This meant that low mode state potentially carried across multiple 938 * transaction rolls. Transfer low mode on a dfops move to preserve 939 * that behavior. 940 */ 941 dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); 942 stp->t_flags &= ~XFS_TRANS_LOWMODE; 943 } 944 945 /* 946 * Prepare a chain of fresh deferred ops work items to be completed later. 
Log 947 * recovery requires the ability to put off until later the actual finishing 948 * work so that it can process unfinished items recovered from the log in 949 * correct order. 950 * 951 * Create and log intent items for all the work that we're capturing so that we 952 * can be assured that the items will get replayed if the system goes down 953 * before log recovery gets a chance to finish the work it put off. The entire 954 * deferred ops state is transferred to the capture structure and the 955 * transaction is then ready for the caller to commit it. If there are no 956 * intent items to capture, this function returns NULL. 957 * 958 * If capture_ip is not NULL, the capture structure will obtain an extra 959 * reference to the inode. 960 */ 961 static struct xfs_defer_capture * 962 xfs_defer_ops_capture( 963 struct xfs_trans *tp) 964 { 965 struct xfs_defer_capture *dfc; 966 unsigned short i; 967 int error; 968 969 if (list_empty(&tp->t_dfops)) 970 return NULL; 971 972 error = xfs_defer_create_intents(tp); 973 if (error < 0) 974 return ERR_PTR(error); 975 976 /* Create an object to capture the defer ops. */ 977 dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); 978 INIT_LIST_HEAD(&dfc->dfc_list); 979 INIT_LIST_HEAD(&dfc->dfc_dfops); 980 981 /* Move the dfops chain and transaction state to the capture struct. */ 982 list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); 983 dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; 984 tp->t_flags &= ~XFS_TRANS_LOWMODE; 985 986 /* Capture the remaining block reservations along with the dfops. */ 987 dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; 988 dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; 989 990 /* Preserve the log reservation size. */ 991 dfc->dfc_logres = tp->t_log_res; 992 993 error = xfs_defer_save_resources(&dfc->dfc_held, tp); 994 if (error) { 995 /* 996 * Resource capture should never fail, but if it does, we 997 * still have to shut down the log and release things 998 * properly. 
999 */ 1000 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); 1001 } 1002 1003 /* 1004 * Grab extra references to the inodes and buffers because callers are 1005 * expected to release their held references after we commit the 1006 * transaction. 1007 */ 1008 for (i = 0; i < dfc->dfc_held.dr_inos; i++) { 1009 xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); 1010 ihold(VFS_I(dfc->dfc_held.dr_ip[i])); 1011 } 1012 1013 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1014 xfs_buf_hold(dfc->dfc_held.dr_bp[i]); 1015 1016 return dfc; 1017 } 1018 1019 /* Release all resources that we used to capture deferred ops. */ 1020 void 1021 xfs_defer_ops_capture_abort( 1022 struct xfs_mount *mp, 1023 struct xfs_defer_capture *dfc) 1024 { 1025 unsigned short i; 1026 1027 xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); 1028 xfs_defer_cancel_list(mp, &dfc->dfc_dfops); 1029 1030 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1031 xfs_buf_relse(dfc->dfc_held.dr_bp[i]); 1032 1033 for (i = 0; i < dfc->dfc_held.dr_inos; i++) 1034 xfs_irele(dfc->dfc_held.dr_ip[i]); 1035 1036 kfree(dfc); 1037 } 1038 1039 /* 1040 * Capture any deferred ops and commit the transaction. This is the last step 1041 * needed to finish a log intent item that we recovered from the log. If any 1042 * of the deferred ops operate on an inode, the caller must pass in that inode 1043 * so that the reference can be transferred to the capture structure. The 1044 * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling 1045 * xfs_defer_ops_continue. 1046 */ 1047 int 1048 xfs_defer_ops_capture_and_commit( 1049 struct xfs_trans *tp, 1050 struct list_head *capture_list) 1051 { 1052 struct xfs_mount *mp = tp->t_mountp; 1053 struct xfs_defer_capture *dfc; 1054 int error; 1055 1056 /* If we don't capture anything, commit transaction and exit. 
*/ 1057 dfc = xfs_defer_ops_capture(tp); 1058 if (IS_ERR(dfc)) { 1059 xfs_trans_cancel(tp); 1060 return PTR_ERR(dfc); 1061 } 1062 if (!dfc) 1063 return xfs_trans_commit(tp); 1064 1065 /* Commit the transaction and add the capture structure to the list. */ 1066 error = xfs_trans_commit(tp); 1067 if (error) { 1068 xfs_defer_ops_capture_abort(mp, dfc); 1069 return error; 1070 } 1071 1072 list_add_tail(&dfc->dfc_list, capture_list); 1073 return 0; 1074 } 1075 1076 /* 1077 * Attach a chain of captured deferred ops to a new transaction and free the 1078 * capture structure. If an inode was captured, it will be passed back to the 1079 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. 1080 * The caller now owns the inode reference. 1081 */ 1082 void 1083 xfs_defer_ops_continue( 1084 struct xfs_defer_capture *dfc, 1085 struct xfs_trans *tp, 1086 struct xfs_defer_resources *dres) 1087 { 1088 unsigned int i; 1089 1090 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1091 ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); 1092 1093 /* Lock the captured resources to the new transaction. */ 1094 if (dfc->dfc_held.dr_inos == 2) 1095 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, 1096 dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); 1097 else if (dfc->dfc_held.dr_inos == 1) 1098 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); 1099 1100 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1101 xfs_buf_lock(dfc->dfc_held.dr_bp[i]); 1102 1103 /* Join the captured resources to the new transaction. */ 1104 xfs_defer_restore_resources(tp, &dfc->dfc_held); 1105 memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); 1106 dres->dr_bufs = 0; 1107 1108 /* Move captured dfops chain and state to the transaction. */ 1109 list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); 1110 tp->t_flags |= dfc->dfc_tpflags; 1111 1112 kfree(dfc); 1113 } 1114 1115 /* Release the resources captured and continued during recovery. 
*/ 1116 void 1117 xfs_defer_resources_rele( 1118 struct xfs_defer_resources *dres) 1119 { 1120 unsigned short i; 1121 1122 for (i = 0; i < dres->dr_inos; i++) { 1123 xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); 1124 xfs_irele(dres->dr_ip[i]); 1125 dres->dr_ip[i] = NULL; 1126 } 1127 1128 for (i = 0; i < dres->dr_bufs; i++) { 1129 xfs_buf_relse(dres->dr_bp[i]); 1130 dres->dr_bp[i] = NULL; 1131 } 1132 1133 dres->dr_inos = 0; 1134 dres->dr_bufs = 0; 1135 dres->dr_ordered = 0; 1136 } 1137 1138 static inline int __init 1139 xfs_defer_init_cache(void) 1140 { 1141 xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", 1142 sizeof(struct xfs_defer_pending), 1143 0, 0, NULL); 1144 1145 return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; 1146 } 1147 1148 static inline void 1149 xfs_defer_destroy_cache(void) 1150 { 1151 kmem_cache_destroy(xfs_defer_pending_cache); 1152 xfs_defer_pending_cache = NULL; 1153 } 1154 1155 /* Set up caches for deferred work items. */ 1156 int __init 1157 xfs_defer_init_item_caches(void) 1158 { 1159 int error; 1160 1161 error = xfs_defer_init_cache(); 1162 if (error) 1163 return error; 1164 error = xfs_rmap_intent_init_cache(); 1165 if (error) 1166 goto err; 1167 error = xfs_refcount_intent_init_cache(); 1168 if (error) 1169 goto err; 1170 error = xfs_bmap_intent_init_cache(); 1171 if (error) 1172 goto err; 1173 error = xfs_extfree_intent_init_cache(); 1174 if (error) 1175 goto err; 1176 error = xfs_attr_intent_init_cache(); 1177 if (error) 1178 goto err; 1179 return 0; 1180 err: 1181 xfs_defer_destroy_item_caches(); 1182 return error; 1183 } 1184 1185 /* Destroy all the deferred work item caches, if they've been allocated. 
*/ 1186 void 1187 xfs_defer_destroy_item_caches(void) 1188 { 1189 xfs_attr_intent_destroy_cache(); 1190 xfs_extfree_intent_destroy_cache(); 1191 xfs_bmap_intent_destroy_cache(); 1192 xfs_refcount_intent_destroy_cache(); 1193 xfs_rmap_intent_destroy_cache(); 1194 xfs_defer_destroy_cache(); 1195 } 1196 1197 /* 1198 * Mark a deferred work item so that it will be requeued indefinitely without 1199 * being finished. Caller must ensure there are no data dependencies on this 1200 * work item in the meantime. 1201 */ 1202 void 1203 xfs_defer_item_pause( 1204 struct xfs_trans *tp, 1205 struct xfs_defer_pending *dfp) 1206 { 1207 ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); 1208 1209 dfp->dfp_flags |= XFS_DEFER_PAUSED; 1210 1211 trace_xfs_defer_item_pause(tp->t_mountp, dfp); 1212 } 1213 1214 /* 1215 * Release a paused deferred work item so that it will be finished during the 1216 * next transaction roll. 1217 */ 1218 void 1219 xfs_defer_item_unpause( 1220 struct xfs_trans *tp, 1221 struct xfs_defer_pending *dfp) 1222 { 1223 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 1224 1225 dfp->dfp_flags &= ~XFS_DEFER_PAUSED; 1226 1227 trace_xfs_defer_item_unpause(tp->t_mountp, dfp); 1228 } 1229