1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_inode.h" 17 #include "xfs_inode_item.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_log.h" 21 #include "xfs_rmap.h" 22 #include "xfs_refcount.h" 23 #include "xfs_bmap.h" 24 #include "xfs_alloc.h" 25 #include "xfs_buf.h" 26 #include "xfs_da_format.h" 27 #include "xfs_da_btree.h" 28 #include "xfs_attr.h" 29 #include "xfs_trans_priv.h" 30 31 static struct kmem_cache *xfs_defer_pending_cache; 32 33 /* 34 * Deferred Operations in XFS 35 * 36 * Due to the way locking rules work in XFS, certain transactions (block 37 * mapping and unmapping, typically) have permanent reservations so that 38 * we can roll the transaction to adhere to AG locking order rules and 39 * to unlock buffers between metadata updates. Prior to rmap/reflink, 40 * the mapping code had a mechanism to perform these deferrals for 41 * extents that were going to be freed; this code makes that facility 42 * more generic. 43 * 44 * When adding the reverse mapping and reflink features, it became 45 * necessary to perform complex remapping multi-transactions to comply 46 * with AG locking order rules, and to be able to spread a single 47 * refcount update operation (an operation on an n-block extent can 48 * update as many as n records!) among multiple transactions. XFS can 49 * roll a transaction to facilitate this, but using this facility 50 * requires us to log "intent" items in case log recovery needs to 51 * redo the operation, and to log "done" items to indicate that redo 52 * is not necessary. 
53 * 54 * Deferred work is tracked in xfs_defer_pending items. Each pending 55 * item tracks one type of deferred work. Incoming work items (which 56 * have not yet had an intent logged) are attached to a pending item 57 * on the dop_intake list, where they wait for the caller to finish 58 * the deferred operations. 59 * 60 * Finishing a set of deferred operations is an involved process. To 61 * start, we define "rolling a deferred-op transaction" as follows: 62 * 63 * > For each xfs_defer_pending item on the dop_intake list, 64 * - Sort the work items in AG order. XFS locking 65 * order rules require us to lock buffers in AG order. 66 * - Create a log intent item for that type. 67 * - Attach it to the pending item. 68 * - Move the pending item from the dop_intake list to the 69 * dop_pending list. 70 * > Roll the transaction. 71 * 72 * NOTE: To avoid exceeding the transaction reservation, we limit the 73 * number of items that we attach to a given xfs_defer_pending. 74 * 75 * The actual finishing process looks like this: 76 * 77 * > For each xfs_defer_pending in the dop_pending list, 78 * - Roll the deferred-op transaction as above. 79 * - Create a log done item for that type, and attach it to the 80 * log intent item. 81 * - For each work item attached to the log intent item, 82 * * Perform the described action. 83 * * Attach the work item to the log done item. 84 * * If the result of doing the work was -EAGAIN, ->finish work 85 * wants a new transaction. See the "Requesting a Fresh 86 * Transaction while Finishing Deferred Work" section below for 87 * details. 88 * 89 * The key here is that we must log an intent item for all pending 90 * work items every time we roll the transaction, and that we must log 91 * a done item as soon as the work is completed. With this mechanism 92 * we can perform complex remapping operations, chaining intent items 93 * as needed. 
94 * 95 * Requesting a Fresh Transaction while Finishing Deferred Work 96 * 97 * If ->finish_item decides that it needs a fresh transaction to 98 * finish the work, it must ask its caller (xfs_defer_finish) for a 99 * continuation. The most likely cause of this circumstance is the 100 * refcount adjust functions deciding that they've logged enough items 101 * to be at risk of exceeding the transaction reservation. 102 * 103 * To get a fresh transaction, we want to log the existing log done 104 * item to prevent the log intent item from replaying, immediately log 105 * a new log intent item with the unfinished work items, roll the 106 * transaction, and re-call ->finish_item wherever it left off. The 107 * log done item and the new log intent item must be in the same 108 * transaction or atomicity cannot be guaranteed; defer_finish ensures 109 * that this happens. 110 * 111 * This requires some coordination between ->finish_item and 112 * defer_finish. Upon deciding to request a new transaction, 113 * ->finish_item should update the current work item to reflect the 114 * unfinished work. Next, it should reset the log done item's list 115 * count to the number of items finished, and return -EAGAIN. 116 * defer_finish sees the -EAGAIN, logs the new log intent item 117 * with the remaining work items, and leaves the xfs_defer_pending 118 * item at the head of the dop_pending list. Then it rolls the 119 * transaction and picks up processing where it left off. It is 120 * required that ->finish_item must be careful to leave enough 121 * transaction reservation to fit the new log intent item. 
122 * 123 * This is an example of remapping the extent (E, E+B) into file X at 124 * offset A and dealing with the extent (C, C+B) already being mapped 125 * there: 126 * +-------------------------------------------------+ 127 * | Unmap file X startblock C offset A length B | t0 128 * | Intent to reduce refcount for extent (C, B) | 129 * | Intent to remove rmap (X, C, A, B) | 130 * | Intent to free extent (D, 1) (bmbt block) | 131 * | Intent to map (X, A, B) at startblock E | 132 * +-------------------------------------------------+ 133 * | Map file X startblock E offset A length B | t1 134 * | Done mapping (X, E, A, B) | 135 * | Intent to increase refcount for extent (E, B) | 136 * | Intent to add rmap (X, E, A, B) | 137 * +-------------------------------------------------+ 138 * | Reduce refcount for extent (C, B) | t2 139 * | Done reducing refcount for extent (C, 9) | 140 * | Intent to reduce refcount for extent (C+9, B-9) | 141 * | (ran out of space after 9 refcount updates) | 142 * +-------------------------------------------------+ 143 * | Reduce refcount for extent (C+9, B-9) | t3 144 * | Done reducing refcount for extent (C+9, B-9) | 145 * | Increase refcount for extent (E, B) | 146 * | Done increasing refcount for extent (E, B) | 147 * | Intent to free extent (C, B) | 148 * | Intent to free extent (F, 1) (refcountbt block) | 149 * | Intent to remove rmap (F, 1, REFC) | 150 * +-------------------------------------------------+ 151 * | Remove rmap (X, C, A, B) | t4 152 * | Done removing rmap (X, C, A, B) | 153 * | Add rmap (X, E, A, B) | 154 * | Done adding rmap (X, E, A, B) | 155 * | Remove rmap (F, 1, REFC) | 156 * | Done removing rmap (F, 1, REFC) | 157 * +-------------------------------------------------+ 158 * | Free extent (C, B) | t5 159 * | Done freeing extent (C, B) | 160 * | Free extent (D, 1) | 161 * | Done freeing extent (D, 1) | 162 * | Free extent (F, 1) | 163 * | Done freeing extent (F, 1) | 164 * 
+-------------------------------------------------+ 165 * 166 * If we should crash before t2 commits, log recovery replays 167 * the following intent items: 168 * 169 * - Intent to reduce refcount for extent (C, B) 170 * - Intent to remove rmap (X, C, A, B) 171 * - Intent to free extent (D, 1) (bmbt block) 172 * - Intent to increase refcount for extent (E, B) 173 * - Intent to add rmap (X, E, A, B) 174 * 175 * In the process of recovering, it should also generate and take care 176 * of these intent items: 177 * 178 * - Intent to free extent (C, B) 179 * - Intent to free extent (F, 1) (refcountbt block) 180 * - Intent to remove rmap (F, 1, REFC) 181 * 182 * Note that the continuation requested between t2 and t3 is likely to 183 * reoccur. 184 */ 185 STATIC struct xfs_log_item * 186 xfs_defer_barrier_create_intent( 187 struct xfs_trans *tp, 188 struct list_head *items, 189 unsigned int count, 190 bool sort) 191 { 192 return NULL; 193 } 194 195 STATIC void 196 xfs_defer_barrier_abort_intent( 197 struct xfs_log_item *intent) 198 { 199 /* empty */ 200 } 201 202 STATIC struct xfs_log_item * 203 xfs_defer_barrier_create_done( 204 struct xfs_trans *tp, 205 struct xfs_log_item *intent, 206 unsigned int count) 207 { 208 return NULL; 209 } 210 211 STATIC int 212 xfs_defer_barrier_finish_item( 213 struct xfs_trans *tp, 214 struct xfs_log_item *done, 215 struct list_head *item, 216 struct xfs_btree_cur **state) 217 { 218 ASSERT(0); 219 return -EFSCORRUPTED; 220 } 221 222 STATIC void 223 xfs_defer_barrier_cancel_item( 224 struct list_head *item) 225 { 226 ASSERT(0); 227 } 228 229 static const struct xfs_defer_op_type xfs_barrier_defer_type = { 230 .max_items = 1, 231 .create_intent = xfs_defer_barrier_create_intent, 232 .abort_intent = xfs_defer_barrier_abort_intent, 233 .create_done = xfs_defer_barrier_create_done, 234 .finish_item = xfs_defer_barrier_finish_item, 235 .cancel_item = xfs_defer_barrier_cancel_item, 236 }; 237 238 static const struct xfs_defer_op_type 
*defer_op_types[] = { 239 [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, 240 [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, 241 [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, 242 [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, 243 [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, 244 [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, 245 [XFS_DEFER_OPS_TYPE_BARRIER] = &xfs_barrier_defer_type, 246 }; 247 248 /* Create a log intent done item for a log intent item. */ 249 static inline void 250 xfs_defer_create_done( 251 struct xfs_trans *tp, 252 struct xfs_defer_pending *dfp) 253 { 254 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 255 struct xfs_log_item *lip; 256 257 /* If there is no log intent item, there can be no log done item. */ 258 if (!dfp->dfp_intent) 259 return; 260 261 /* 262 * Mark the transaction dirty, even on error. This ensures the 263 * transaction is aborted, which: 264 * 265 * 1.) releases the log intent item and frees the log done item 266 * 2.) shuts down the filesystem 267 */ 268 tp->t_flags |= XFS_TRANS_DIRTY; 269 lip = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); 270 if (!lip) 271 return; 272 273 tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; 274 xfs_trans_add_item(tp, lip); 275 set_bit(XFS_LI_DIRTY, &lip->li_flags); 276 dfp->dfp_done = lip; 277 } 278 279 /* 280 * Ensure there's a log intent item associated with this deferred work item if 281 * the operation must be restarted on crash. Returns 1 if there's a log item; 282 * 0 if there isn't; or a negative errno. 
283 */ 284 static int 285 xfs_defer_create_intent( 286 struct xfs_trans *tp, 287 struct xfs_defer_pending *dfp, 288 bool sort) 289 { 290 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 291 struct xfs_log_item *lip; 292 293 if (dfp->dfp_intent) 294 return 1; 295 296 lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); 297 if (!lip) 298 return 0; 299 if (IS_ERR(lip)) 300 return PTR_ERR(lip); 301 302 tp->t_flags |= XFS_TRANS_DIRTY; 303 xfs_trans_add_item(tp, lip); 304 set_bit(XFS_LI_DIRTY, &lip->li_flags); 305 dfp->dfp_intent = lip; 306 return 1; 307 } 308 309 /* 310 * For each pending item in the intake list, log its intent item and the 311 * associated extents, then add the entire intake list to the end of 312 * the pending list. 313 * 314 * Returns 1 if at least one log item was associated with the deferred work; 315 * 0 if there are no log items; or a negative errno. 316 */ 317 static int 318 xfs_defer_create_intents( 319 struct xfs_trans *tp) 320 { 321 struct xfs_defer_pending *dfp; 322 int ret = 0; 323 324 list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { 325 int ret2; 326 327 trace_xfs_defer_create_intent(tp->t_mountp, dfp); 328 ret2 = xfs_defer_create_intent(tp, dfp, true); 329 if (ret2 < 0) 330 return ret2; 331 ret |= ret2; 332 } 333 return ret; 334 } 335 336 static inline void 337 xfs_defer_pending_abort( 338 struct xfs_mount *mp, 339 struct xfs_defer_pending *dfp) 340 { 341 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 342 343 trace_xfs_defer_pending_abort(mp, dfp); 344 345 if (dfp->dfp_intent && !dfp->dfp_done) { 346 ops->abort_intent(dfp->dfp_intent); 347 dfp->dfp_intent = NULL; 348 } 349 } 350 351 static inline void 352 xfs_defer_pending_cancel_work( 353 struct xfs_mount *mp, 354 struct xfs_defer_pending *dfp) 355 { 356 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 357 struct list_head *pwi; 358 struct list_head *n; 359 360 trace_xfs_defer_cancel_list(mp, dfp); 361 362 
list_del(&dfp->dfp_list); 363 list_for_each_safe(pwi, n, &dfp->dfp_work) { 364 list_del(pwi); 365 dfp->dfp_count--; 366 trace_xfs_defer_cancel_item(mp, dfp, pwi); 367 ops->cancel_item(pwi); 368 } 369 ASSERT(dfp->dfp_count == 0); 370 kmem_cache_free(xfs_defer_pending_cache, dfp); 371 } 372 373 STATIC void 374 xfs_defer_pending_abort_list( 375 struct xfs_mount *mp, 376 struct list_head *dop_list) 377 { 378 struct xfs_defer_pending *dfp; 379 380 /* Abort intent items that don't have a done item. */ 381 list_for_each_entry(dfp, dop_list, dfp_list) 382 xfs_defer_pending_abort(mp, dfp); 383 } 384 385 /* Abort all the intents that were committed. */ 386 STATIC void 387 xfs_defer_trans_abort( 388 struct xfs_trans *tp, 389 struct list_head *dop_pending) 390 { 391 trace_xfs_defer_trans_abort(tp, _RET_IP_); 392 xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); 393 } 394 395 /* 396 * Capture resources that the caller said not to release ("held") when the 397 * transaction commits. Caller is responsible for zero-initializing @dres. 
398 */ 399 static int 400 xfs_defer_save_resources( 401 struct xfs_defer_resources *dres, 402 struct xfs_trans *tp) 403 { 404 struct xfs_buf_log_item *bli; 405 struct xfs_inode_log_item *ili; 406 struct xfs_log_item *lip; 407 408 BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); 409 410 list_for_each_entry(lip, &tp->t_items, li_trans) { 411 switch (lip->li_type) { 412 case XFS_LI_BUF: 413 bli = container_of(lip, struct xfs_buf_log_item, 414 bli_item); 415 if (bli->bli_flags & XFS_BLI_HOLD) { 416 if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { 417 ASSERT(0); 418 return -EFSCORRUPTED; 419 } 420 if (bli->bli_flags & XFS_BLI_ORDERED) 421 dres->dr_ordered |= 422 (1U << dres->dr_bufs); 423 else 424 xfs_trans_dirty_buf(tp, bli->bli_buf); 425 dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; 426 } 427 break; 428 case XFS_LI_INODE: 429 ili = container_of(lip, struct xfs_inode_log_item, 430 ili_item); 431 if (ili->ili_lock_flags == 0) { 432 if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { 433 ASSERT(0); 434 return -EFSCORRUPTED; 435 } 436 xfs_trans_log_inode(tp, ili->ili_inode, 437 XFS_ILOG_CORE); 438 dres->dr_ip[dres->dr_inos++] = ili->ili_inode; 439 } 440 break; 441 default: 442 break; 443 } 444 } 445 446 return 0; 447 } 448 449 /* Attach the held resources to the transaction. */ 450 static void 451 xfs_defer_restore_resources( 452 struct xfs_trans *tp, 453 struct xfs_defer_resources *dres) 454 { 455 unsigned short i; 456 457 /* Rejoin the joined inodes. */ 458 for (i = 0; i < dres->dr_inos; i++) 459 xfs_trans_ijoin(tp, dres->dr_ip[i], 0); 460 461 /* Rejoin the buffers and dirty them so the log moves forward. */ 462 for (i = 0; i < dres->dr_bufs; i++) { 463 xfs_trans_bjoin(tp, dres->dr_bp[i]); 464 if (dres->dr_ordered & (1U << i)) 465 xfs_trans_ordered_buf(tp, dres->dr_bp[i]); 466 xfs_trans_bhold(tp, dres->dr_bp[i]); 467 } 468 } 469 470 /* Roll a transaction so we can do some deferred op processing. 
*/ 471 STATIC int 472 xfs_defer_trans_roll( 473 struct xfs_trans **tpp) 474 { 475 struct xfs_defer_resources dres = { }; 476 int error; 477 478 error = xfs_defer_save_resources(&dres, *tpp); 479 if (error) 480 return error; 481 482 trace_xfs_defer_trans_roll(*tpp, _RET_IP_); 483 484 /* 485 * Roll the transaction. Rolling always given a new transaction (even 486 * if committing the old one fails!) to hand back to the caller, so we 487 * join the held resources to the new transaction so that we always 488 * return with the held resources joined to @tpp, no matter what 489 * happened. 490 */ 491 error = xfs_trans_roll(tpp); 492 493 xfs_defer_restore_resources(*tpp, &dres); 494 495 if (error) 496 trace_xfs_defer_trans_roll_error(*tpp, error); 497 return error; 498 } 499 500 /* 501 * Free up any items left in the list. 502 */ 503 static void 504 xfs_defer_cancel_list( 505 struct xfs_mount *mp, 506 struct list_head *dop_list) 507 { 508 struct xfs_defer_pending *dfp; 509 struct xfs_defer_pending *pli; 510 511 /* 512 * Free the pending items. Caller should already have arranged 513 * for the intent items to be released. 514 */ 515 list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) 516 xfs_defer_pending_cancel_work(mp, dfp); 517 } 518 519 static inline void 520 xfs_defer_relog_intent( 521 struct xfs_trans *tp, 522 struct xfs_defer_pending *dfp) 523 { 524 struct xfs_log_item *lip; 525 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 526 527 xfs_defer_create_done(tp, dfp); 528 529 lip = ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); 530 if (lip) { 531 xfs_trans_add_item(tp, lip); 532 set_bit(XFS_LI_DIRTY, &lip->li_flags); 533 } 534 dfp->dfp_done = NULL; 535 dfp->dfp_intent = lip; 536 } 537 538 /* 539 * Prevent a log intent item from pinning the tail of the log by logging a 540 * done item to release the intent item; and then log a new intent item. 541 * The caller should provide a fresh transaction and roll it after we're done. 
542 */ 543 static void 544 xfs_defer_relog( 545 struct xfs_trans **tpp, 546 struct list_head *dfops) 547 { 548 struct xlog *log = (*tpp)->t_mountp->m_log; 549 struct xfs_defer_pending *dfp; 550 xfs_lsn_t threshold_lsn = NULLCOMMITLSN; 551 552 553 ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); 554 555 list_for_each_entry(dfp, dfops, dfp_list) { 556 /* 557 * If the log intent item for this deferred op is not a part of 558 * the current log checkpoint, relog the intent item to keep 559 * the log tail moving forward. We're ok with this being racy 560 * because an incorrect decision means we'll be a little slower 561 * at pushing the tail. 562 */ 563 if (dfp->dfp_intent == NULL || 564 xfs_log_item_in_current_chkpt(dfp->dfp_intent)) 565 continue; 566 567 /* 568 * Figure out where we need the tail to be in order to maintain 569 * the minimum required free space in the log. Only sample 570 * the log threshold once per call. 571 */ 572 if (threshold_lsn == NULLCOMMITLSN) { 573 threshold_lsn = xlog_grant_push_threshold(log, 0); 574 if (threshold_lsn == NULLCOMMITLSN) 575 break; 576 } 577 if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) 578 continue; 579 580 trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); 581 XFS_STATS_INC((*tpp)->t_mountp, defer_relog); 582 583 xfs_defer_relog_intent(*tpp, dfp); 584 } 585 } 586 587 /* 588 * Log an intent-done item for the first pending intent, and finish the work 589 * items. 
590 */ 591 int 592 xfs_defer_finish_one( 593 struct xfs_trans *tp, 594 struct xfs_defer_pending *dfp) 595 { 596 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 597 struct xfs_btree_cur *state = NULL; 598 struct list_head *li, *n; 599 int error; 600 601 trace_xfs_defer_pending_finish(tp->t_mountp, dfp); 602 603 xfs_defer_create_done(tp, dfp); 604 list_for_each_safe(li, n, &dfp->dfp_work) { 605 list_del(li); 606 dfp->dfp_count--; 607 trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); 608 error = ops->finish_item(tp, dfp->dfp_done, li, &state); 609 if (error == -EAGAIN) { 610 int ret; 611 612 /* 613 * Caller wants a fresh transaction; put the work item 614 * back on the list and log a new log intent item to 615 * replace the old one. See "Requesting a Fresh 616 * Transaction while Finishing Deferred Work" above. 617 */ 618 list_add(li, &dfp->dfp_work); 619 dfp->dfp_count++; 620 dfp->dfp_done = NULL; 621 dfp->dfp_intent = NULL; 622 ret = xfs_defer_create_intent(tp, dfp, false); 623 if (ret < 0) 624 error = ret; 625 } 626 627 if (error) 628 goto out; 629 } 630 631 /* Done with the dfp, free it. */ 632 list_del(&dfp->dfp_list); 633 kmem_cache_free(xfs_defer_pending_cache, dfp); 634 out: 635 if (ops->finish_cleanup) 636 ops->finish_cleanup(tp, state, error); 637 return error; 638 } 639 640 /* Move all paused deferred work from @tp to @paused_list. */ 641 static void 642 xfs_defer_isolate_paused( 643 struct xfs_trans *tp, 644 struct list_head *paused_list) 645 { 646 struct xfs_defer_pending *dfp; 647 struct xfs_defer_pending *pli; 648 649 list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { 650 if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) 651 continue; 652 653 list_move_tail(&dfp->dfp_list, paused_list); 654 trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); 655 } 656 } 657 658 /* 659 * Finish all the pending work. 
This involves logging intent items for 660 * any work items that wandered in since the last transaction roll (if 661 * one has even happened), rolling the transaction, and finishing the 662 * work items in the first item on the logged-and-pending list. 663 * 664 * If an inode is provided, relog it to the new transaction. 665 */ 666 int 667 xfs_defer_finish_noroll( 668 struct xfs_trans **tp) 669 { 670 struct xfs_defer_pending *dfp = NULL; 671 int error = 0; 672 LIST_HEAD(dop_pending); 673 LIST_HEAD(dop_paused); 674 675 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 676 677 trace_xfs_defer_finish(*tp, _RET_IP_); 678 679 /* Until we run out of pending work to finish... */ 680 while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { 681 /* 682 * Deferred items that are created in the process of finishing 683 * other deferred work items should be queued at the head of 684 * the pending list, which puts them ahead of the deferred work 685 * that was created by the caller. This keeps the number of 686 * pending work items to a minimum, which decreases the amount 687 * of time that any one intent item can stick around in memory, 688 * pinning the log tail. 689 */ 690 int has_intents = xfs_defer_create_intents(*tp); 691 692 xfs_defer_isolate_paused(*tp, &dop_paused); 693 694 list_splice_init(&(*tp)->t_dfops, &dop_pending); 695 696 if (has_intents < 0) { 697 error = has_intents; 698 goto out_shutdown; 699 } 700 if (has_intents || dfp) { 701 error = xfs_defer_trans_roll(tp); 702 if (error) 703 goto out_shutdown; 704 705 /* Relog intent items to keep the log moving. 
*/ 706 xfs_defer_relog(tp, &dop_pending); 707 xfs_defer_relog(tp, &dop_paused); 708 709 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 710 error = xfs_defer_trans_roll(tp); 711 if (error) 712 goto out_shutdown; 713 } 714 } 715 716 dfp = list_first_entry_or_null(&dop_pending, 717 struct xfs_defer_pending, dfp_list); 718 if (!dfp) 719 break; 720 error = xfs_defer_finish_one(*tp, dfp); 721 if (error && error != -EAGAIN) 722 goto out_shutdown; 723 } 724 725 /* Requeue the paused items in the outgoing transaction. */ 726 list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); 727 728 trace_xfs_defer_finish_done(*tp, _RET_IP_); 729 return 0; 730 731 out_shutdown: 732 list_splice_tail_init(&dop_paused, &dop_pending); 733 xfs_defer_trans_abort(*tp, &dop_pending); 734 xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); 735 trace_xfs_defer_finish_error(*tp, error); 736 xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); 737 xfs_defer_cancel(*tp); 738 return error; 739 } 740 741 int 742 xfs_defer_finish( 743 struct xfs_trans **tp) 744 { 745 #ifdef DEBUG 746 struct xfs_defer_pending *dfp; 747 #endif 748 int error; 749 750 /* 751 * Finish and roll the transaction once more to avoid returning to the 752 * caller with a dirty transaction. 753 */ 754 error = xfs_defer_finish_noroll(tp); 755 if (error) 756 return error; 757 if ((*tp)->t_flags & XFS_TRANS_DIRTY) { 758 error = xfs_defer_trans_roll(tp); 759 if (error) { 760 xfs_force_shutdown((*tp)->t_mountp, 761 SHUTDOWN_CORRUPT_INCORE); 762 return error; 763 } 764 } 765 766 /* Reset LOWMODE now that we've finished all the dfops. 
*/ 767 #ifdef DEBUG 768 list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) 769 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 770 #endif 771 (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; 772 return 0; 773 } 774 775 void 776 xfs_defer_cancel( 777 struct xfs_trans *tp) 778 { 779 struct xfs_mount *mp = tp->t_mountp; 780 781 trace_xfs_defer_cancel(tp, _RET_IP_); 782 xfs_defer_trans_abort(tp, &tp->t_dfops); 783 xfs_defer_cancel_list(mp, &tp->t_dfops); 784 } 785 786 /* 787 * Return the last pending work item attached to this transaction if it matches 788 * the deferred op type. 789 */ 790 static inline struct xfs_defer_pending * 791 xfs_defer_find_last( 792 struct xfs_trans *tp, 793 enum xfs_defer_ops_type type, 794 const struct xfs_defer_op_type *ops) 795 { 796 struct xfs_defer_pending *dfp = NULL; 797 798 /* No dfops at all? */ 799 if (list_empty(&tp->t_dfops)) 800 return NULL; 801 802 dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, 803 dfp_list); 804 805 /* Wrong type? */ 806 if (dfp->dfp_type != type) 807 return NULL; 808 return dfp; 809 } 810 811 /* 812 * Decide if we can add a deferred work item to the last dfops item attached 813 * to the transaction. 814 */ 815 static inline bool 816 xfs_defer_can_append( 817 struct xfs_defer_pending *dfp, 818 const struct xfs_defer_op_type *ops) 819 { 820 /* Already logged? */ 821 if (dfp->dfp_intent) 822 return false; 823 824 /* Paused items cannot absorb more work */ 825 if (dfp->dfp_flags & XFS_DEFER_PAUSED) 826 return NULL; 827 828 /* Already full? */ 829 if (ops->max_items && dfp->dfp_count >= ops->max_items) 830 return false; 831 832 return true; 833 } 834 835 /* Create a new pending item at the end of the transaction list. 
*/ 836 static inline struct xfs_defer_pending * 837 xfs_defer_alloc( 838 struct xfs_trans *tp, 839 enum xfs_defer_ops_type type) 840 { 841 struct xfs_defer_pending *dfp; 842 843 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 844 GFP_NOFS | __GFP_NOFAIL); 845 dfp->dfp_type = type; 846 INIT_LIST_HEAD(&dfp->dfp_work); 847 list_add_tail(&dfp->dfp_list, &tp->t_dfops); 848 849 return dfp; 850 } 851 852 /* Add an item for later deferred processing. */ 853 struct xfs_defer_pending * 854 xfs_defer_add( 855 struct xfs_trans *tp, 856 enum xfs_defer_ops_type type, 857 struct list_head *li) 858 { 859 struct xfs_defer_pending *dfp = NULL; 860 const struct xfs_defer_op_type *ops = defer_op_types[type]; 861 862 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 863 BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); 864 865 dfp = xfs_defer_find_last(tp, type, ops); 866 if (!dfp || !xfs_defer_can_append(dfp, ops)) 867 dfp = xfs_defer_alloc(tp, type); 868 869 xfs_defer_add_item(dfp, li); 870 trace_xfs_defer_add_item(tp->t_mountp, dfp, li); 871 return dfp; 872 } 873 874 /* 875 * Add a defer ops barrier to force two otherwise adjacent deferred work items 876 * to be tracked separately and have separate log items. 877 */ 878 void 879 xfs_defer_add_barrier( 880 struct xfs_trans *tp) 881 { 882 struct xfs_defer_pending *dfp; 883 const enum xfs_defer_ops_type type = XFS_DEFER_OPS_TYPE_BARRIER; 884 const struct xfs_defer_op_type *ops = defer_op_types[type]; 885 886 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 887 888 /* If the last defer op added was a barrier, we're done. */ 889 dfp = xfs_defer_find_last(tp, type, ops); 890 if (dfp) 891 return; 892 893 xfs_defer_alloc(tp, type); 894 895 trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); 896 } 897 898 /* 899 * Create a pending deferred work item to replay the recovered intent item 900 * and add it to the list. 
901 */ 902 void 903 xfs_defer_start_recovery( 904 struct xfs_log_item *lip, 905 enum xfs_defer_ops_type dfp_type, 906 struct list_head *r_dfops) 907 { 908 struct xfs_defer_pending *dfp; 909 910 dfp = kmem_cache_zalloc(xfs_defer_pending_cache, 911 GFP_NOFS | __GFP_NOFAIL); 912 dfp->dfp_type = dfp_type; 913 dfp->dfp_intent = lip; 914 INIT_LIST_HEAD(&dfp->dfp_work); 915 list_add_tail(&dfp->dfp_list, r_dfops); 916 } 917 918 /* 919 * Cancel a deferred work item created to recover a log intent item. @dfp 920 * will be freed after this function returns. 921 */ 922 void 923 xfs_defer_cancel_recovery( 924 struct xfs_mount *mp, 925 struct xfs_defer_pending *dfp) 926 { 927 xfs_defer_pending_abort(mp, dfp); 928 xfs_defer_pending_cancel_work(mp, dfp); 929 } 930 931 /* Replay the deferred work item created from a recovered log intent item. */ 932 int 933 xfs_defer_finish_recovery( 934 struct xfs_mount *mp, 935 struct xfs_defer_pending *dfp, 936 struct list_head *capture_list) 937 { 938 const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; 939 int error; 940 941 error = ops->recover_work(dfp, capture_list); 942 if (error) 943 trace_xlog_intent_recovery_failed(mp, error, 944 ops->recover_work); 945 return error; 946 } 947 948 /* 949 * Move deferred ops from one transaction to another and reset the source to 950 * initial state. This is primarily used to carry state forward across 951 * transaction rolls with pending dfops. 952 */ 953 void 954 xfs_defer_move( 955 struct xfs_trans *dtp, 956 struct xfs_trans *stp) 957 { 958 list_splice_init(&stp->t_dfops, &dtp->t_dfops); 959 960 /* 961 * Low free space mode was historically controlled by a dfops field. 962 * This meant that low mode state potentially carried across multiple 963 * transaction rolls. Transfer low mode on a dfops move to preserve 964 * that behavior. 
965 */ 966 dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); 967 stp->t_flags &= ~XFS_TRANS_LOWMODE; 968 } 969 970 /* 971 * Prepare a chain of fresh deferred ops work items to be completed later. Log 972 * recovery requires the ability to put off until later the actual finishing 973 * work so that it can process unfinished items recovered from the log in 974 * correct order. 975 * 976 * Create and log intent items for all the work that we're capturing so that we 977 * can be assured that the items will get replayed if the system goes down 978 * before log recovery gets a chance to finish the work it put off. The entire 979 * deferred ops state is transferred to the capture structure and the 980 * transaction is then ready for the caller to commit it. If there are no 981 * intent items to capture, this function returns NULL. 982 * 983 * If capture_ip is not NULL, the capture structure will obtain an extra 984 * reference to the inode. 985 */ 986 static struct xfs_defer_capture * 987 xfs_defer_ops_capture( 988 struct xfs_trans *tp) 989 { 990 struct xfs_defer_capture *dfc; 991 unsigned short i; 992 int error; 993 994 if (list_empty(&tp->t_dfops)) 995 return NULL; 996 997 error = xfs_defer_create_intents(tp); 998 if (error < 0) 999 return ERR_PTR(error); 1000 1001 /* Create an object to capture the defer ops. */ 1002 dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); 1003 INIT_LIST_HEAD(&dfc->dfc_list); 1004 INIT_LIST_HEAD(&dfc->dfc_dfops); 1005 1006 /* Move the dfops chain and transaction state to the capture struct. */ 1007 list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); 1008 dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; 1009 tp->t_flags &= ~XFS_TRANS_LOWMODE; 1010 1011 /* Capture the remaining block reservations along with the dfops. */ 1012 dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; 1013 dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; 1014 1015 /* Preserve the log reservation size. 
*/ 1016 dfc->dfc_logres = tp->t_log_res; 1017 1018 error = xfs_defer_save_resources(&dfc->dfc_held, tp); 1019 if (error) { 1020 /* 1021 * Resource capture should never fail, but if it does, we 1022 * still have to shut down the log and release things 1023 * properly. 1024 */ 1025 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); 1026 } 1027 1028 /* 1029 * Grab extra references to the inodes and buffers because callers are 1030 * expected to release their held references after we commit the 1031 * transaction. 1032 */ 1033 for (i = 0; i < dfc->dfc_held.dr_inos; i++) { 1034 ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); 1035 ihold(VFS_I(dfc->dfc_held.dr_ip[i])); 1036 } 1037 1038 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1039 xfs_buf_hold(dfc->dfc_held.dr_bp[i]); 1040 1041 return dfc; 1042 } 1043 1044 /* Release all resources that we used to capture deferred ops. */ 1045 void 1046 xfs_defer_ops_capture_abort( 1047 struct xfs_mount *mp, 1048 struct xfs_defer_capture *dfc) 1049 { 1050 unsigned short i; 1051 1052 xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); 1053 xfs_defer_cancel_list(mp, &dfc->dfc_dfops); 1054 1055 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1056 xfs_buf_relse(dfc->dfc_held.dr_bp[i]); 1057 1058 for (i = 0; i < dfc->dfc_held.dr_inos; i++) 1059 xfs_irele(dfc->dfc_held.dr_ip[i]); 1060 1061 kmem_free(dfc); 1062 } 1063 1064 /* 1065 * Capture any deferred ops and commit the transaction. This is the last step 1066 * needed to finish a log intent item that we recovered from the log. If any 1067 * of the deferred ops operate on an inode, the caller must pass in that inode 1068 * so that the reference can be transferred to the capture structure. The 1069 * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling 1070 * xfs_defer_ops_continue. 
1071 */ 1072 int 1073 xfs_defer_ops_capture_and_commit( 1074 struct xfs_trans *tp, 1075 struct list_head *capture_list) 1076 { 1077 struct xfs_mount *mp = tp->t_mountp; 1078 struct xfs_defer_capture *dfc; 1079 int error; 1080 1081 /* If we don't capture anything, commit transaction and exit. */ 1082 dfc = xfs_defer_ops_capture(tp); 1083 if (IS_ERR(dfc)) { 1084 xfs_trans_cancel(tp); 1085 return PTR_ERR(dfc); 1086 } 1087 if (!dfc) 1088 return xfs_trans_commit(tp); 1089 1090 /* Commit the transaction and add the capture structure to the list. */ 1091 error = xfs_trans_commit(tp); 1092 if (error) { 1093 xfs_defer_ops_capture_abort(mp, dfc); 1094 return error; 1095 } 1096 1097 list_add_tail(&dfc->dfc_list, capture_list); 1098 return 0; 1099 } 1100 1101 /* 1102 * Attach a chain of captured deferred ops to a new transaction and free the 1103 * capture structure. If an inode was captured, it will be passed back to the 1104 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. 1105 * The caller now owns the inode reference. 1106 */ 1107 void 1108 xfs_defer_ops_continue( 1109 struct xfs_defer_capture *dfc, 1110 struct xfs_trans *tp, 1111 struct xfs_defer_resources *dres) 1112 { 1113 unsigned int i; 1114 1115 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1116 ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); 1117 1118 /* Lock the captured resources to the new transaction. */ 1119 if (dfc->dfc_held.dr_inos == 2) 1120 xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, 1121 dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); 1122 else if (dfc->dfc_held.dr_inos == 1) 1123 xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); 1124 1125 for (i = 0; i < dfc->dfc_held.dr_bufs; i++) 1126 xfs_buf_lock(dfc->dfc_held.dr_bp[i]); 1127 1128 /* Join the captured resources to the new transaction. 
*/ 1129 xfs_defer_restore_resources(tp, &dfc->dfc_held); 1130 memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); 1131 dres->dr_bufs = 0; 1132 1133 /* Move captured dfops chain and state to the transaction. */ 1134 list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); 1135 tp->t_flags |= dfc->dfc_tpflags; 1136 1137 kmem_free(dfc); 1138 } 1139 1140 /* Release the resources captured and continued during recovery. */ 1141 void 1142 xfs_defer_resources_rele( 1143 struct xfs_defer_resources *dres) 1144 { 1145 unsigned short i; 1146 1147 for (i = 0; i < dres->dr_inos; i++) { 1148 xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); 1149 xfs_irele(dres->dr_ip[i]); 1150 dres->dr_ip[i] = NULL; 1151 } 1152 1153 for (i = 0; i < dres->dr_bufs; i++) { 1154 xfs_buf_relse(dres->dr_bp[i]); 1155 dres->dr_bp[i] = NULL; 1156 } 1157 1158 dres->dr_inos = 0; 1159 dres->dr_bufs = 0; 1160 dres->dr_ordered = 0; 1161 } 1162 1163 static inline int __init 1164 xfs_defer_init_cache(void) 1165 { 1166 xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", 1167 sizeof(struct xfs_defer_pending), 1168 0, 0, NULL); 1169 1170 return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; 1171 } 1172 1173 static inline void 1174 xfs_defer_destroy_cache(void) 1175 { 1176 kmem_cache_destroy(xfs_defer_pending_cache); 1177 xfs_defer_pending_cache = NULL; 1178 } 1179 1180 /* Set up caches for deferred work items. 
*/ 1181 int __init 1182 xfs_defer_init_item_caches(void) 1183 { 1184 int error; 1185 1186 error = xfs_defer_init_cache(); 1187 if (error) 1188 return error; 1189 error = xfs_rmap_intent_init_cache(); 1190 if (error) 1191 goto err; 1192 error = xfs_refcount_intent_init_cache(); 1193 if (error) 1194 goto err; 1195 error = xfs_bmap_intent_init_cache(); 1196 if (error) 1197 goto err; 1198 error = xfs_extfree_intent_init_cache(); 1199 if (error) 1200 goto err; 1201 error = xfs_attr_intent_init_cache(); 1202 if (error) 1203 goto err; 1204 return 0; 1205 err: 1206 xfs_defer_destroy_item_caches(); 1207 return error; 1208 } 1209 1210 /* Destroy all the deferred work item caches, if they've been allocated. */ 1211 void 1212 xfs_defer_destroy_item_caches(void) 1213 { 1214 xfs_attr_intent_destroy_cache(); 1215 xfs_extfree_intent_destroy_cache(); 1216 xfs_bmap_intent_destroy_cache(); 1217 xfs_refcount_intent_destroy_cache(); 1218 xfs_rmap_intent_destroy_cache(); 1219 xfs_defer_destroy_cache(); 1220 } 1221 1222 /* 1223 * Mark a deferred work item so that it will be requeued indefinitely without 1224 * being finished. Caller must ensure there are no data dependencies on this 1225 * work item in the meantime. 1226 */ 1227 void 1228 xfs_defer_item_pause( 1229 struct xfs_trans *tp, 1230 struct xfs_defer_pending *dfp) 1231 { 1232 ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); 1233 1234 dfp->dfp_flags |= XFS_DEFER_PAUSED; 1235 1236 trace_xfs_defer_item_pause(tp->t_mountp, dfp); 1237 } 1238 1239 /* 1240 * Release a paused deferred work item so that it will be finished during the 1241 * next transaction roll. 1242 */ 1243 void 1244 xfs_defer_item_unpause( 1245 struct xfs_trans *tp, 1246 struct xfs_defer_pending *dfp) 1247 { 1248 ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); 1249 1250 dfp->dfp_flags &= ~XFS_DEFER_PAUSED; 1251 1252 trace_xfs_defer_item_unpause(tp->t_mountp, dfp); 1253 } 1254