// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"


struct kmem_cache	*xfs_buf_item_cache;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct xfs_log_iovec	*iovec)
{
	struct xfs_buf_log_format *blfp = iovec->i_addr;
	char			*bmp_end;
	char			*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (char *)iovec->i_addr + iovec->i_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
 * in a single iovec.
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_buf_log_format *blfp,
	uint			offset,
	int			*nvecs,
	int			*nbytes)
{
	int			first_bit;
	int			nbits;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}
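
/*
 * Illustrative sketch (editorial aside, compiled out): how the loop above
 * counts iovecs and bytes for one segment. The *_example() helper below is
 * hypothetical. Assume a single-word dirty bitmap of 0x327, i.e. chunks 0-2,
 * 5 and 8-9 are dirty: that is three contiguous runs, so the segment needs
 * 1 + 3 = 4 iovecs and logs 6 * XFS_BLF_CHUNK bytes of data in addition to
 * the buf log format structure.
 */
#if 0
static void
xfs_buf_item_size_segment_example(void)
{
	uint	map[1] = { 0x327 };	/* chunks 0-2, 5, 8-9 dirty */
	int	nvecs = 0;
	int	nbytes = 0;
	int	bit, nbits;

	nvecs++;			/* the buf log format header */
	nbytes += offsetof(struct xfs_buf_log_format, blf_data_map) +
		  sizeof(map);

	bit = xfs_next_bit(map, 1, 0);
	while (bit != -1) {
		nbits = xfs_contig_bits(map, 1, bit);
		nvecs++;		/* one iovec per contiguous run */
		nbytes += nbits * XFS_BLF_CHUNK;
		bit = xfs_next_bit(map, 1, (uint)bit + nbits + 1);
	}
	/* here: nvecs == 4, nbytes == format size + 6 * XFS_BLF_CHUNK */
}
#endif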

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used at
		 * all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}
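
/*
 * Illustrative sketch (editorial aside, compiled out): the chunk-to-byte math
 * used by xfs_buf_item_copy_iovec(). The *_example() helper below is
 * hypothetical. With the 128 byte XFS_BLF_CHUNK, a run starting at bit 5 with
 * nbits == 2 in a segment that begins at buffer offset 4096 copies the 256
 * bytes at buffer offsets 4736-4991 into one XLOG_REG_TYPE_BCHUNK iovec.
 */
#if 0
static void
xfs_buf_item_copy_iovec_example(void)
{
	uint	seg_offset = 4096;	/* start of this segment in the buffer */
	int	first_bit = 5;		/* first dirty chunk in the run */
	uint	nbits = 2;		/* length of the run in chunks */
	uint	data_offset;
	uint	data_len;

	data_offset = seg_offset + first_bit * XFS_BLF_CHUNK;	/* 4736 */
	data_len = nbits * XFS_BLF_CHUNK;			/* 256 */
	(void)data_offset;
	(void)data_len;
}
#endif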

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there. It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item. It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_log->l_mp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}
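
/*
 * Illustrative sketch (editorial aside, compiled out): how the per-segment
 * offset advances while sizing or formatting a compound buffer. The
 * *_example() helper below is hypothetical. For a buffer built from two maps
 * of 8 basic blocks each, BBTOB(8) == 4096, so segment 0 describes buffer
 * offsets 0-4095 and segment 1 describes offsets 4096-8191, each with its own
 * xfs_buf_log_format header in the log.
 */
#if 0
static void
xfs_buf_item_compound_offset_example(void)
{
	uint	map_len_bb[2] = { 8, 8 };	/* hypothetical bm_len values */
	uint	offset = 0;
	int	i;

	for (i = 0; i < 2; i++) {
		/* segment i covers [offset, offset + BBTOB(map_len_bb[i])) */
		offset += BBTOB(map_len_bb[i]);
	}
	/* here: offset == 8192, the total buffer length in bytes */
}
#endif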

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We take a reference to the buffer log item here so that the BLI life cycle
 * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
 * inserted into the AIL.
 *
 * We also need to take a reference to the buffer itself as the BLI unpin
 * processing requires accessing the buffer after the BLI has dropped the final
 * BLI reference. See xfs_buf_item_unpin() for an explanation.
 * If unpins race to drop the final BLI reference and only the
 * BLI owns a reference to the buffer, then the loser of the race can have the
 * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
 * pin count ensures the life cycle of the buffer extends for as
 * long as we hold the buffer pin reference in xfs_buf_item_unpin().
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	xfs_buf_hold(bip->bli_buf);
	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * This is called to unpin the buffer associated with the buf log item which was
 * previously pinned with a call to xfs_buf_item_pin(). We enter this function
 * with a buffer pin count, a buffer reference and a BLI reference.
 *
 * We must drop the BLI reference before we unpin the buffer because the AIL
 * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
 * refcount drops to zero, the bli could still be AIL resident and the buffer
 * submitted for I/O at any point before we return. This can result in IO
 * completion freeing the buffer while we are still trying to access it here.
 * This race condition can also occur in shutdown situations where we abort and
 * unpin buffers from contexts other than journal IO completion.
 *
 * Hence we have to hold a buffer reference per pin count to ensure that the
 * buffer cannot be freed until we have finished processing the unpin operation.
 * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
 * are done processing the buffer state. In the case of an abort (remove =
 * true) then we re-use the current pin reference as the IO reference we hand
 * off to IO failure handling.
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/*
	 * Nothing to do but drop the buffer pin reference if the BLI is
	 * still active.
	 */
	if (!freed) {
		xfs_buf_rele(bp);
		return;
	}

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * The buffer has been locked and referenced since it was marked
		 * stale so we own both lock and reference exclusively here. We
		 * do not need the pin reference any more, so drop it now so
		 * that we only have one reference to drop once item completion
		 * processing is complete.
		 */
		xfs_buf_rele(bp);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
		return;
	}

	if (remove) {
		/*
		 * We need to simulate an async IO failure here to ensure that
		 * the correct error completion is run on this buffer. This
		 * requires a reference to the buffer and for the buffer to be
		 * locked. We can safely pass ownership of the pin reference to
		 * the IO to ensure that nothing can free the buffer while we
		 * wait for the lock and then run the IO failure completion.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
		return;
	}

	/*
	 * BLI has no more active references - it will be moved to the AIL to
	 * manage the remaining BLI/buffer life cycle. There is nothing left for
	 * us to do here so drop the pin reference to the buffer.
	 */
	xfs_buf_rele(bp);
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	    "Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
		  xlog_is_shutdown(lip->li_log);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state. For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}
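
/*
 * Illustrative sketch (editorial aside, compiled out): the decision made by
 * xfs_buf_item_put() once the last reference has been dropped. The
 * *_example() helper below is hypothetical. Only a dirty, non-aborted bli
 * survives the put; everything else is freed, and an aborted item is pulled
 * off the AIL first.
 */
#if 0
static bool
xfs_buf_item_put_decision_example(bool dirty, bool aborted)
{
	/*
	 *	dirty	aborted		result
	 *	false	false		freed
	 *	false	true		AIL delete, then freed
	 *	true	false		kept for writeback completion
	 *	true	true		AIL delete, then freed
	 */
	if (dirty && !aborted)
		return false;		/* bli kept */
	return true;			/* bli freed */
}
#endif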

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters. For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery. If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log. We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}
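
/*
 * Illustrative sketch (editorial aside, compiled out): the LSN choice made by
 * xfs_buf_item_committed(). The *_example() helper below is hypothetical. If
 * an inode allocation buffer was first committed at LSN 100 and is later
 * relogged at LSN 250 with only the unlinked list pointers, the item keeps
 * reporting 100 so the original inode images stay in the active region of
 * the log.
 */
#if 0
static xfs_lsn_t
xfs_buf_item_committed_example(
	bool			inode_alloc_buf,
	xfs_lsn_t		old_lsn,
	xfs_lsn_t		new_lsn)
{
	if (inode_alloc_buf && old_lsn != 0)
		return old_lsn;		/* e.g. 100, not 250 */
	return new_lsn;
}
#endif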

#ifdef DEBUG_EXPENSIVE
static int
xfs_buf_item_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;

	if (!bp->b_ops || !bp->b_ops->verify_struct)
		return 0;
	if (bip->bli_flags & XFS_BLI_STALE)
		return 0;

	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
				bp->b_addr, BBTOB(bp->b_length), fa);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}

	return 0;
}
#else
# define xfs_buf_item_precommit	NULL
#endif

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_precommit	= xfs_buf_item_precommit,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

STATIC void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kfree(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf		*bp,
	struct xfs_mount	*mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_cache, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}
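
/*
 * Illustrative sketch (editorial aside, compiled out): the bitmap sizing done
 * by xfs_buf_item_init() for a single-map 4096 byte buffer (bm_len == 8 basic
 * blocks). The *_example() helper below is hypothetical. With the 128 byte
 * XFS_BLF_CHUNK the buffer splits into 32 chunks, and with 32-bit bitmap words
 * (NBWORD) the dirty bitmap needs exactly one word.
 */
#if 0
static void
xfs_buf_item_init_sizing_example(void)
{
	int	len_bytes = BBTOB(8);				/* 4096 */
	int	chunks = DIV_ROUND_UP(len_bytes, XFS_BLF_CHUNK);	/* 32 */
	int	map_size = DIV_ROUND_UP(chunks, NBWORD);		/* 1 */

	(void)chunks;
	/* map_size must not exceed XFS_BLF_DATAMAP_SIZE */
	(void)map_size;
}
#endif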

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}
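
/*
 * Illustrative sketch (editorial aside, compiled out): a worked example of the
 * masking above. The *_example() helper below is hypothetical. Logging bytes
 * 384-895 of a segment converts to chunk bits 3-6 (384 >> XFS_BLF_SHIFT == 3,
 * 895 >> XFS_BLF_SHIFT == 6), so bits_to_set is 4, the starting bit in word 0
 * is 3, and the single partial-word mask is ((1U << 4) - 1) << 3 == 0x78.
 */
#if 0
static void
xfs_buf_item_log_segment_example(void)
{
	uint	map[1] = { 0 };

	xfs_buf_item_log_segment(384, 895, map);
	/* here: map[0] == 0x78, i.e. chunks 3-6 are marked dirty */
}
#endif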

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * walk each buffer segment and mark them dirty appropriately.
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
void
xfs_buf_item_relse(
	struct xfs_buf		*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	if (atomic_read(&bip->bli_refcount))
		return;
	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

void
xfs_buf_item_done(
	struct xfs_buf		*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not on
	 * the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
}
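
/*
 * Illustrative sketch (editorial aside, compiled out): the rough calling
 * sequence a transaction follows for this API. In the real code the
 * xfs_trans_buf routines (e.g. xfs_trans_log_buf()) drive these helpers; the
 * *_example() function below is hypothetical and the error handling and
 * locking are simplified.
 */
#if 0
static int
xfs_buf_item_usage_example(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	int			error;

	/* attach a buf log item to the buffer (a no-op if one exists) */
	error = xfs_buf_item_init(bp, mp);
	if (error)
		return error;

	/* mark bytes 0-127 of the buffer dirty in the item's bitmap */
	xfs_buf_item_log(bp->b_log_item, 0, 127);

	ASSERT(xfs_buf_item_dirty_format(bp->b_log_item));
	return 0;
}
#endif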