// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

static void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

static void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
static void
xfs_buf_item_relse(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf		*bp = bip->bli_buf;

	trace_xfs_buf_item_relse(bp, _RET_IP_);

	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
	ASSERT(atomic_read(&bip->bli_refcount) == 0);

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct kvec		*iovec)
{
	struct xfs_buf_log_format *blfp = iovec->iov_base;
	char			*bmp_end;
	char			*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
		return false;

	item_end = (char *)iovec->iov_base + iovec->iov_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for
 * each stretch of non-contiguous chunks to be logged. Contiguous chunks are
 * logged in a single iovec.
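 *
 * As a rough worked example: if a segment's bitmap has chunks 0-3 and 6-7
 * dirty, this should count three iovecs - the format header plus one iovec
 * per contiguous run - and add the format header size plus 6 * XFS_BLF_CHUNK
 * bytes to *nbytes.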
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	uint				offset,
	int				*nvecs,
	int				*nbytes)
{
	int				first_bit;
	int				nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * Compute the worst case log item overhead for an invalidated buffer with the
 * given map count and block size.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int	map_count,
	unsigned int	blocksize)
{
	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
	unsigned int	ret =
		offsetof(struct xfs_buf_log_format, blf_data_map) +
			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
						    blf_data_map[0]));

	return ret * map_count;
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as
 * though they came from separate buffers, just like would occur if multiple
 * buffers were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it.
		 * It is not being included in the transaction commit, so no
		 * vectors are used at all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark the size
		 * as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation.
 * If unpins race to drop the final BLI reference and only the
 * BLI owns a reference to the buffer, then the loser of the race can have the
 * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
 * pin count ensures the life cycle of the buffer extends for as
 * long as we hold the buffer pin reference in xfs_buf_item_unpin().
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * For a stale BLI, process all the necessary completions that must be
 * performed when the final BLI reference goes away. The buffer will be
 * referenced and locked here - we return to the caller with the buffer still
 * referenced and locked for them to finalise processing of the buffer.
 */
static void
xfs_buf_item_finish_stale(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_item	*lip = &bip->bli_item;

	ASSERT(bip->bli_flags & XFS_BLI_STALE);
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(bp->b_flags & XBF_STALE);
	ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
	ASSERT(list_empty(&lip->li_trans));
	ASSERT(!bp->b_transp);

	if (bip->bli_flags & XFS_BLI_STALE_INODE) {
		xfs_buf_item_done(bp);
		xfs_buf_inode_iodone(bp);
		ASSERT(list_empty(&bp->b_li_list));
		return;
	}

	/*
	 * We may or may not be on the AIL here, xfs_trans_ail_delete() will do
	 * the right thing regardless of the situation in which we are called.
	 */
	xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
	xfs_buf_item_relse(bip);
	ASSERT(bp->b_log_item == NULL);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin(). We enter this
 * function with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin
 * operation. The reference is taken in xfs_buf_item_pin(), and we must hold it
 * until we are done processing the buffer state. In the case of an abort
 * (remove = true) then we re-use the current pin reference as the IO reference
 * we hand off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was
		 * marked stale so we own both lock and reference exclusively
		 * here. We do not need the pin reference any more, so drop it
		 * now so that we only have one reference to drop once item
		 * completion processing is complete.
		 */
		xfs_buf_rele(bp);
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left
	 * for us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone
		 * else issues a log force to unpin the stale buffer. Check for
		 * the race condition here so xfsaild recognizes the buffer is
		 * pinned and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 */
void
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{

	ASSERT(xfs_buf_islocked(bip->bli_buf));

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return;

	/* If the BLI is in the AIL, then it is still dirty and in use */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
		ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
		return;
	}

	/*
	 * In shutdown conditions, we can be asked to free a dirty BLI that
	 * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
	 * instead of inserting it into the AIL at checkpoint IO completion. If
	 * there's another bli reference
	 * (e.g. a btree cursor holds a clean reference) and it is released via
	 * xfs_trans_brelse(), we can get here with that aborted, dirty BLI. In
	 * this case, it is safe to free the dirty BLI immediately, as it is
	 * not in the AIL and there are no other references to it.
	 *
	 * We should never get here with a stale BLI via that path as
	 * xfs_trans_brelse() specifically holds onto stale buffers rather than
	 * releasing them.
	 */
	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
	       test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
	xfs_buf_item_relse(bip);
}

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD flag is cleared if we don't
 * free the item.
 *
 * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
 * perform a completion abort of any objects attached to the buffer for IO
 * tracking purposes. This generally only happens in shutdown situations;
 * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
 * completion processing. However, because transaction completion can race with
 * checkpoint completion during a shutdown, this release context may end up
 * being the last active reference to the BLI and so needs to perform this
 * cleanup.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif

	trace_xfs_buf_item_release(bip);

	ASSERT(xfs_buf_islocked(bp));

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/* If there are other references, then we have nothing to do. */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		goto out_release;

	/*
	 * Stale buffer completion frees the BLI, unlocks and releases the
	 * buffer. Neither the BLI nor the buffer is safe to reference after
	 * this call, so there's nothing more we need to do here.
	 *
	 * If we get here with a stale buffer and references to the BLI remain,
	 * we must not unlock the buffer as the last BLI reference owns lock
	 * context, not us.
	 */
	if (stale) {
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		ASSERT(!hold);
		return;
	}

	/*
	 * Dirty or clean, aborted items are done and need to be removed from
	 * the AIL and released. This frees the BLI, but leaves the buffer
	 * locked and referenced.
	 */
	if (aborted || xlog_is_shutdown(lip->li_log)) {
		ASSERT(list_empty(&bip->bli_buf->b_li_list));
		xfs_buf_item_done(bp);
		goto out_release;
	}

	/*
	 * Clean, unreferenced BLIs can be immediately freed, leaving the
	 * buffer locked and referenced.
	 *
	 * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
	 */
	if (!dirty)
		xfs_buf_item_relse(bip);
	else
		ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));

	/* Not safe to reference the BLI from here */
out_release:
	/*
	 * If we get here with a stale buffer, we must not unlock the
	 * buffer as the last BLI reference owns lock context, not us.
	 */
	if (stale || hold)
		return;
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
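 *
 * For example, if such an inode allocation buffer was first logged and pinned
 * at LSN A, and is then relogged at a later LSN B before the initialised
 * inode images have been written back, returning A (the item's existing
 * li_lsn) should keep those images in the active region of the log so that
 * recovery can still replay them.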
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
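	 *
	 * As a worked example, assuming XFS_BLF_CHUNK is 128 bytes and NBWORD
	 * is 32 bits: a single-map 4 KiB buffer works out to chunks = 32 and
	 * map_size = 1, i.e. one bitmap word tracks the whole buffer.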
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}


/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here.  That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far.  End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
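	 *
	 * For example, assuming two 4 KiB segments and a 128 byte
	 * XFS_BLF_CHUNK, logging bytes 4096 through 4223 should skip segment
	 * 0 entirely and dirty only bit 0 of segment 1's bitmap, since the
	 * range maps to segment-relative bytes 0-127.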
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, the AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp->b_log_item);
}