// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct xfs_log_iovec		*iovec)
{
	struct xfs_buf_log_format	*blfp = iovec->i_addr;
	char				*bmp_end;
	char				*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (char *)iovec->i_addr + iovec->i_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
 * in a single iovec.
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	uint				offset,
	int				*nvecs,
	int				*nbytes)
{
	int				first_bit;
	int				nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * Compute the worst case log item overhead for an invalidated buffer with the
 * given map count and block size.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int	map_count,
	unsigned int	blocksize)
{
	unsigned int	chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	unsigned int	bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
	unsigned int	ret =
		offsetof(struct xfs_buf_log_format, blf_data_map) +
			(bitmap_size * sizeof_field(struct xfs_buf_log_format,
						    blf_data_map[0]));

	return ret * map_count;
}
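
/*
 * Worked example for the calculation above (illustrative only, assuming the
 * common values of XFS_BLF_CHUNK = 128 bytes and 32-bit bitmap words): a
 * 4096 byte block splits into 32 chunks, which fit in a single bitmap word,
 * so the per-map overhead is offsetof(struct xfs_buf_log_format,
 * blf_data_map) plus one word of dirty bitmap. The reservation is that
 * figure multiplied by map_count.
 */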

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just as would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used
		 * at all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}
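
/*
 * Illustrative sizing example (assumed numbers, not taken from any particular
 * workload): a single-segment buffer with two separate runs of dirty chunks
 * needs three iovecs, one for the buf log format structure plus one per
 * contiguous run. The byte count is the format structure size plus
 * XFS_BLF_CHUNK bytes for every dirty chunk, rounded up to 512 bytes by
 * xfs_buf_item_size() above.
 */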

/* Copy one contiguous run of dirty chunks into the next log iovec. */
static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * The map is not dirty in this transaction, so there is
		 * nothing to format; return without copying an iovec and
		 * without advancing the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation.
 * If unpins race to drop the final BLI reference and only the BLI owns a
 * reference to the buffer, then the loser of the race can have the buffer
 * freed from under it (e.g. on shutdown). Taking a buffer reference per pin
 * count ensures the life cycle of the buffer extends for as long as we hold
 * the buffer pin reference in xfs_buf_item_unpin().
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin(). We enter this
 * function with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true), we re-use the current pin reference as the IO reference we hand off
 * to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was
		 * marked stale so we own both lock and reference exclusively
		 * here. We do not need the pin reference any more, so drop it
		 * now so that we only have one reference to drop once item
		 * completion processing is complete.
		 */
		xfs_buf_rele(bp);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left
	 * for us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

/*
 * AIL push handler for buf items: try to lock the buffer and queue it to the
 * caller's delwri list for writeback, reporting pinned, locked or flushing
 * state back to xfsaild.
 */
STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone
		 * else issues a log force to unpin the stale buffer. Check for
		 * the race condition here so xfsaild recognizes the buffer is
		 * pinned and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
		  xlog_is_shutdown(lip->li_log);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state. For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}
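
/*
 * Summary of the xfs_buf_item_put() outcomes above (for reference; the
 * authoritative behaviour is the code itself): a clean, non-aborted bli is
 * freed as soon as the last reference drops; a dirty, non-aborted bli is left
 * for buffer writeback completion to free; an aborted bli is pulled off the
 * AIL if present and freed regardless of its dirty state.
 */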

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD flag is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters. For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery. If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log. We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

/*
 * Allocate the buf log format structures, one per buffer map, using the
 * format embedded in the bli for the common single-map case.
 */
STATIC void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}


/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}
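
/*
 * Worked example for the segment bitmap above (illustrative only, assuming
 * 128 byte chunks and 32-bit bitmap words): logging bytes 100 through 600 of
 * a segment converts to chunk bits 0 through 4, so bits_to_set is 5 and the
 * low five bits of the first bitmap word are set. Ranges that start part way
 * through a word are handled by the first masked store, whole words in the
 * middle are filled with 0xffffffff, and any remainder is finished with a
 * final mask.
 */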

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * Walk each buffer segment and mark it dirty appropriately.
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
void
xfs_buf_item_relse(
	struct xfs_buf	*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	if (atomic_read(&bip->bli_refcount))
		return;
	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, the AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
}