// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

static void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

static void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
static void
xfs_buf_item_relse(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf		*bp = bip->bli_buf;

	trace_xfs_buf_item_relse(bp, _RET_IP_);

	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
	ASSERT(atomic_read(&bip->bli_refcount) == 0);

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct kvec		*iovec)
{
	struct xfs_buf_log_format *blfp = iovec->iov_base;
	char			*bmp_end;
	char			*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
		return false;

	item_end = (char *)iovec->iov_base + iovec->iov_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * This is calculated as 1 iovec for the buf log format structure and 1 for
 * each contiguous run of dirty chunks to be logged. Each contiguous run is
 * logged in a single iovec.
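 *
 * As an illustrative example (not taken from the original source, and
 * assuming the usual 128 byte XFS_BLF_CHUNK): a segment whose dirty bitmap
 * has bits 0-1 and 4-6 set is sized as three iovecs - one for the format
 * header, one 256 byte region covering chunks 0-1, and one 384 byte region
 * covering chunks 4-6.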
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	uint				offset,
	int				*nvecs,
	int				*nbytes)
{
	int				first_bit;
	int				nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * Compute the worst case log item overhead for an invalidated buffer with the
 * given map count and block size.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int	map_count,
	unsigned int	blocksize)
{
	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
	unsigned int	ret =
		offsetof(struct xfs_buf_log_format, blf_data_map) +
			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
						    blf_data_map[0]));

	return ret * map_count;
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as
 * though they came from separate buffers, just as would occur if multiple
 * buffers were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used
		 * at all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xlog_format_buf	*lfb,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_format_copy(lfb, XLOG_REG_TYPE_BCHUNK, xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xlog_format_buf	*lfb,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, there is nothing
		 * to log for this segment, so do not advance the vector
		 * pointer.
		 */
		return;
	}

	blfp = xlog_format_copy(lfb, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lfb, bp, offset, first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xlog_format_buf	*lfb)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer this state if
	 * the inode buffer allocation has not yet been committed to the log as
	 * setting the XFS_BLI_INODE_BUF flag will prevent correct replay of
	 * the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lfb, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation.
 * If unpins race to drop the final BLI reference and only the BLI owns a
 * reference to the buffer, then the loser of the race can have the buffer
 * freed from under it (e.g. on shutdown). Taking a buffer reference per pin
 * count ensures the life cycle of the buffer extends for as long as we hold
 * the buffer pin reference in xfs_buf_item_unpin().
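 *
 * To make the reference counting concrete (a descriptive note, not from the
 * original source): each xfs_buf_item_pin() call below takes one buffer hold,
 * one BLI reference and one buffer pin count, and each xfs_buf_item_unpin()
 * call drops the pin count and BLI reference and then releases (or hands off)
 * the buffer reference once it has finished with the buffer.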
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * For a stale BLI, process all the necessary completions that must be
 * performed when the final BLI reference goes away. The buffer will be
 * referenced and locked here - we return to the caller with the buffer still
 * referenced and locked for them to finalise processing of the buffer.
 */
static void
xfs_buf_item_finish_stale(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_item	*lip = &bip->bli_item;

	ASSERT(bip->bli_flags & XFS_BLI_STALE);
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(bp->b_flags & XBF_STALE);
	ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
	ASSERT(list_empty(&lip->li_trans));
	ASSERT(!bp->b_transp);

	if (bip->bli_flags & XFS_BLI_STALE_INODE) {
		xfs_buf_item_done(bp);
		xfs_buf_inode_iodone(bp);
		ASSERT(list_empty(&bp->b_li_list));
		return;
	}

	/*
	 * We may or may not be on the AIL here, xfs_trans_ail_delete() will do
	 * the right thing regardless of the situation in which we are called.
	 */
	xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
	xfs_buf_item_relse(bip);
	ASSERT(bp->b_log_item == NULL);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin(). We enter this
 * function with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin
 * operation. The reference is taken in xfs_buf_item_pin(), and we must hold it
 * until we are done processing the buffer state. In the case of an abort
 * (remove = true) then we re-use the current pin reference as the IO reference
 * we hand off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was
		 * marked stale so we own both lock and reference exclusively
		 * here. We do not need the pin reference any more, so drop it
		 * now so that we only have one reference to drop once item
		 * completion processing is complete.
		 */
		xfs_buf_rele(bp);
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left
	 * for us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone
		 * else issues a log force to unpin the stale buffer. Check for
		 * the race condition here so xfsaild recognizes the buffer is
		 * pinned and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 */
void
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{

	ASSERT(xfs_buf_islocked(bip->bli_buf));

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return;

	/* If the BLI is in the AIL, then it is still dirty and in use */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
		ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
		return;
	}

	/*
	 * In shutdown conditions, we can be asked to free a dirty BLI that
	 * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
	 * instead of inserting it into the AIL at checkpoint IO completion. If
	 * there's another bli reference (e.g. a btree cursor holds a clean
	 * reference) and it is released via xfs_trans_brelse(), we can get
	 * here with that aborted, dirty BLI. In this case, it is safe to free
	 * the dirty BLI immediately, as it is not in the AIL and there are no
	 * other references to it.
	 *
	 * We should never get here with a stale BLI via that path as
	 * xfs_trans_brelse() specifically holds onto stale buffers rather than
	 * releasing them.
	 */
	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
	       test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
	xfs_buf_item_relse(bip);
}

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD flag is cleared if we don't
 * free the item.
 *
 * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
 * perform a completion abort of any objects attached to the buffer for IO
 * tracking purposes. This generally only happens in shutdown situations;
 * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
 * completion processing. However, because transaction completion can race with
 * checkpoint completion during a shutdown, this release context may end up
 * being the last active reference to the BLI and so needs to perform this
 * cleanup.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif

	trace_xfs_buf_item_release(bip);

	ASSERT(xfs_buf_islocked(bp));

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/* If there are other references, then we have nothing to do. */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		goto out_release;

	/*
	 * Stale buffer completion frees the BLI, unlocks and releases the
	 * buffer. Neither the BLI nor the buffer is safe to reference after
	 * this call, so there's nothing more we need to do here.
	 *
	 * If we get here with a stale buffer and references to the BLI remain,
	 * we must not unlock the buffer as the last BLI reference owns lock
	 * context, not us.
	 */
	if (stale) {
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		ASSERT(!hold);
		return;
	}

	/*
	 * Dirty or clean, aborted items are done and need to be removed from
	 * the AIL and released. This frees the BLI, but leaves the buffer
	 * locked and referenced.
	 */
	if (aborted || xlog_is_shutdown(lip->li_log)) {
		ASSERT(list_empty(&bip->bli_buf->b_li_list));
		xfs_buf_item_done(bp);
		goto out_release;
	}

	/*
	 * Clean, unreferenced BLIs can be immediately freed, leaving the
	 * buffer locked and referenced.
	 *
	 * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
	 */
	if (!dirty)
		xfs_buf_item_relse(bip);
	else
		ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));

	/* Not safe to reference the BLI from here */
out_release:
	/*
	 * If we get here with a stale buffer, we must not unlock the
	 * buffer as the last BLI reference owns lock context, not us.
	 */
	if (stale || hold)
		return;
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
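 *
 * As an illustrative scenario (not from the original source): an inode
 * allocation buffer first committed at LSN A and later relogged with only
 * XFS_BLI_INODE_BUF set at LSN B keeps returning A here, so the log tail
 * cannot move past the original allocation record until the buffer itself
 * has been written back.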
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
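	 *
	 * As a worked example (illustrative only, assuming the usual 128 byte
	 * XFS_BLF_CHUNK and 32 bit bitmap words): a single-map 4096 byte
	 * buffer has 4096 / 128 = 32 chunks and therefore needs 32 / 32 = 1
	 * bitmap word, while an 8192 byte map needs 64 chunks and 2 words.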
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			xfs_buf_item_free_format(bip);
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}


/*
 * Mark bytes first through last inclusive as dirty in the given bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here.  That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far.  End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
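	 *
	 * As an illustrative example (not from the original source, assuming
	 * 512 byte basic blocks): logging bytes 100 through 700 of a buffer
	 * whose first map is one basic block long marks bytes 100-511 dirty
	 * in segment 0 and bytes 0-188 dirty in segment 1.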
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, the AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp->b_log_item);
}