/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_log_recover.h"
#include "xfs_trans_priv.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_rw.h"


#define xlog_write_adv_cnt(ptr, len, off, bytes) \
	{ (ptr) += (bytes); \
	  (len) -= (bytes); \
	  (off) += (bytes);}

/* Local miscellaneous function prototypes */
STATIC int	xlog_bdstrat_cb(struct xfs_buf *);
STATIC int	xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
				   xlog_in_core_t **, xfs_lsn_t *);
STATIC xlog_t *	xlog_alloc_log(xfs_mount_t	*mp,
			       xfs_buftarg_t	*log_target,
			       xfs_daddr_t	blk_offset,
			       int		num_bblks);
STATIC int	xlog_space_left(xlog_t *log, int cycle, int bytes);
STATIC int	xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void	xlog_unalloc_log(xlog_t *log);
STATIC int	xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
			   int nentries, xfs_log_ticket_t tic,
			   xfs_lsn_t *start_lsn,
			   xlog_in_core_t **commit_iclog,
			   uint flags);

/* local state machine functions */
STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
STATIC void xlog_state_do_callback(xlog_t *log, int aborted, xlog_in_core_t *iclog);
STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
				       int		len,
				       xlog_in_core_t	**iclog,
				       xlog_ticket_t	*ticket,
				       int		*continued_write,
				       int		*logoffsetp);
STATIC void xlog_state_put_ticket(xlog_t	*log,
				  xlog_ticket_t	*tic);
STATIC int  xlog_state_release_iclog(xlog_t		*log,
				     xlog_in_core_t	*iclog);
STATIC void xlog_state_switch_iclogs(xlog_t		*log,
				     xlog_in_core_t	*iclog,
				     int		eventual_size);
STATIC int  xlog_state_sync(xlog_t	*log,
			    xfs_lsn_t	lsn,
			    uint	flags,
			    int		*log_flushed);
STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);

/* local functions to manipulate grant head */
STATIC int  xlog_grant_log_space(xlog_t		*log,
				 xlog_ticket_t	*xtic);
STATIC void xlog_grant_push_ail(xfs_mount_t	*mp,
				int		need_bytes);
STATIC void xlog_regrant_reserve_log_space(xlog_t	*log,
					   xlog_ticket_t *ticket);
STATIC int  xlog_regrant_write_log_space(xlog_t		*log,
					 xlog_ticket_t	*ticket);
STATIC void xlog_ungrant_log_space(xlog_t	*log,
				   xlog_ticket_t *ticket);


/* local ticket functions */
STATIC void		xlog_state_ticket_alloc(xlog_t *log);
STATIC xlog_ticket_t	*xlog_ticket_get(xlog_t *log,
					 int	unit_bytes,
					 int	count,
					 char	clientid,
					 uint	flags);
STATIC void		xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);

#if defined(DEBUG)
STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
				  int count, boolean_t syncing);
STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
				     xfs_lsn_t tail_lsn);
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_head(a,b)
#define xlog_verify_iclog(a,b,c,d)
#define xlog_verify_tail_lsn(a,b,c)
#endif

STATIC int	xlog_iclogs_empty(xlog_t *log);

#if defined(XFS_LOG_TRACE)
void
xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
{
	unsigned long cnts;

	if (!log->l_grant_trace) {
		log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
		if (!log->l_grant_trace)
			return;
	}
	/* ticket counts are 1 byte each */
	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;

	ktrace_enter(log->l_grant_trace,
		     (void *)tic,
		     (void *)log->l_reserve_headq,
		     (void *)log->l_write_headq,
		     (void *)((unsigned long)log->l_grant_reserve_cycle),
		     (void *)((unsigned long)log->l_grant_reserve_bytes),
		     (void *)((unsigned long)log->l_grant_write_cycle),
		     (void *)((unsigned long)log->l_grant_write_bytes),
		     (void *)((unsigned long)log->l_curr_cycle),
		     (void *)((unsigned long)log->l_curr_block),
		     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
		     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
		     (void *)string,
		     (void *)((unsigned long)tic->t_trans_type),
		     (void *)cnts,
		     (void *)((unsigned long)tic->t_curr_res),
		     (void *)((unsigned long)tic->t_unit_res));
}

void
xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
{
	if (!iclog->ic_trace)
		iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
	ktrace_enter(iclog->ic_trace,
		     (void *)((unsigned long)state),
		     (void *)((unsigned long)current_pid()),
		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
		     (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL,
		     (void *)NULL, (void *)NULL);
}
#else
#define	xlog_trace_loggrant(log,tic,string)
#define	xlog_trace_iclog(iclog,state)
#endif /* XFS_LOG_TRACE */


static void
xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
{
	if (*qp) {
		tic->t_next		= (*qp);
		tic->t_prev		= (*qp)->t_prev;
		(*qp)->t_prev->t_next	= tic;
		(*qp)->t_prev		= tic;
	} else {
		tic->t_prev = tic->t_next = tic;
		*qp = tic;
	}

	tic->t_flags |= XLOG_TIC_IN_Q;
}

static void
xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
{
	if (tic == tic->t_next) {
		*qp = NULL;
	} else {
		*qp = tic->t_next;
		tic->t_next->t_prev = tic->t_prev;
		tic->t_prev->t_next = tic->t_next;
	}

	tic->t_next = tic->t_prev = NULL;
	tic->t_flags &= ~XLOG_TIC_IN_Q;
}
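
/*
 * Illustration of the queue shape the two helpers above maintain (sketch
 * only): the grant wait queues headed by l_reserve_headq and l_write_headq
 * are circular doubly linked lists, with new waiters appended behind the
 * head so that wakeups walk them in FIFO order.  A queue holding tickets
 * A, B and C looks roughly like:
 *
 *	*qp -> A <-> B <-> C
 *	       ^___________|	(C->t_next wraps to A, A->t_prev is C)
 */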

static void
xlog_grant_sub_space(struct log *log, int bytes)
{
	log->l_grant_write_bytes -= bytes;
	if (log->l_grant_write_bytes < 0) {
		log->l_grant_write_bytes += log->l_logsize;
		log->l_grant_write_cycle--;
	}

	log->l_grant_reserve_bytes -= bytes;
	if ((log)->l_grant_reserve_bytes < 0) {
		log->l_grant_reserve_bytes += log->l_logsize;
		log->l_grant_reserve_cycle--;
	}

}

static void
xlog_grant_add_space_write(struct log *log, int bytes)
{
	log->l_grant_write_bytes += bytes;
	if (log->l_grant_write_bytes > log->l_logsize) {
		log->l_grant_write_bytes -= log->l_logsize;
		log->l_grant_write_cycle++;
	}
}

static void
xlog_grant_add_space_reserve(struct log *log, int bytes)
{
	log->l_grant_reserve_bytes += bytes;
	if (log->l_grant_reserve_bytes > log->l_logsize) {
		log->l_grant_reserve_bytes -= log->l_logsize;
		log->l_grant_reserve_cycle++;
	}
}

static inline void
xlog_grant_add_space(struct log *log, int bytes)
{
	xlog_grant_add_space_write(log, bytes);
	xlog_grant_add_space_reserve(log, bytes);
}
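
/*
 * Worked example of the wraparound handled above (illustrative numbers
 * only): with a 1000 byte log (l_logsize == 1000), a write head at
 * cycle 3 / byte 900 that grows by 250 bytes would reach 1150, which
 * exceeds l_logsize, so it is rewritten as cycle 4 / byte 150.
 * Subtraction works the same way in reverse: cycle 4 / byte 150 minus
 * 250 bytes goes negative and wraps back to cycle 3 / byte 900.
 */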


/*
 * NOTES:
 *
 *	1. currblock field gets updated at startup and after in-core logs
 *	   are marked with WANT_SYNC.
 */

/*
 * This routine is called when a user of a log manager ticket is done with
 * the reservation.  If the ticket was ever used, then a commit record for
 * the associated transaction is written out as a log operation header with
 * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
 * a given ticket.  If the ticket was one with a permanent reservation, then
 * a few operations are done differently.  Permanent reservation tickets by
 * default don't release the reservation.  They just commit the current
 * transaction with the belief that the reservation is still needed.  A flag
 * must be passed in before permanent reservations are actually released.
 * When these types of tickets are not released, they need to be set into
 * the inited state again.  By doing this, a start record will be written
 * out when the next write occurs.
 */
xfs_lsn_t
xfs_log_done(xfs_mount_t	*mp,
	     xfs_log_ticket_t	xtic,
	     void		**iclog,
	     uint		flags)
{
	xlog_t		*log    = mp->m_log;
	xlog_ticket_t	*ticket = (xfs_log_ticket_t) xtic;
	xfs_lsn_t	lsn	= 0;

	if (XLOG_FORCED_SHUTDOWN(log) ||
	    /*
	     * If nothing was ever written, don't write out commit record.
	     * If we get an error, just continue and give back the log ticket.
	     */
	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
	     (xlog_commit_record(mp, ticket,
				 (xlog_in_core_t **)iclog, &lsn)))) {
		lsn = (xfs_lsn_t) -1;
		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
			flags |= XFS_LOG_REL_PERM_RESERV;
		}
	}


	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
	    (flags & XFS_LOG_REL_PERM_RESERV)) {
		/*
		 * Release ticket if not permanent reservation or a specific
		 * request has been made to release a permanent reservation.
		 */
		xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
		xlog_ungrant_log_space(log, ticket);
		xlog_state_put_ticket(log, ticket);
	} else {
		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
		xlog_regrant_reserve_log_space(log, ticket);
	}

	/* If this ticket was a permanent reservation and we aren't
	 * trying to release it, reset the inited flags; so next time
	 * we write, a start record will be written out.
	 */
	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
	    (flags & XFS_LOG_REL_PERM_RESERV) == 0)
		ticket->t_flags |= XLOG_TIC_INITED;

	return lsn;
}	/* xfs_log_done */


/*
 * Force the in-core log to disk.  If flags == XFS_LOG_SYNC,
 * the force is done synchronously.
 *
 * Asynchronous forces are implemented by setting the WANT_SYNC
 * bit in the appropriate in-core log and then returning.
 *
 * Synchronous forces are implemented with a semaphore.  All callers
 * to force a given lsn to disk will wait on a semaphore attached to the
 * specific in-core log.  When the given in-core log finally completes its
 * write to disk, that thread will wake up all threads waiting on the
 * semaphore.
 */
int
_xfs_log_force(
	xfs_mount_t	*mp,
	xfs_lsn_t	lsn,
	uint		flags,
	int		*log_flushed)
{
	xlog_t		*log = mp->m_log;
	int		dummy;

	if (!log_flushed)
		log_flushed = &dummy;

	ASSERT(flags & XFS_LOG_FORCE);

	XFS_STATS_INC(xs_log_force);

	if (log->l_flags & XLOG_IO_ERROR)
		return XFS_ERROR(EIO);
	if (lsn == 0)
		return xlog_state_sync_all(log, flags, log_flushed);
	else
		return xlog_state_sync(log, lsn, flags, log_flushed);
}	/* xfs_log_force */

/*
 * Attaches a new iclog I/O completion callback routine during
 * transaction commit.  If the log is in error state, a non-zero
 * return code is handed back and the caller is responsible for
 * executing the callback at an appropriate time.
 */
int
xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
	       void		  *iclog_hndl,	/* iclog to hang callback off */
	       xfs_log_callback_t *cb)
{
	xlog_t		*log = mp->m_log;
	xlog_in_core_t	*iclog = (xlog_in_core_t *)iclog_hndl;
	int		abortflg, spl;

	cb->cb_next = NULL;
	spl = LOG_LOCK(log);
	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
	if (!abortflg) {
		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
			      (iclog->ic_state == XLOG_STATE_WANT_SYNC));
		cb->cb_next = NULL;
		*(iclog->ic_callback_tail) = cb;
		iclog->ic_callback_tail = &(cb->cb_next);
	}
	LOG_UNLOCK(log, spl);
	return abortflg;
}	/* xfs_log_notify */

int
xfs_log_release_iclog(xfs_mount_t *mp,
		      void	  *iclog_hndl)
{
	xlog_t		*log = mp->m_log;
	xlog_in_core_t	*iclog = (xlog_in_core_t *)iclog_hndl;

	if (xlog_state_release_iclog(log, iclog)) {
		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
		return EIO;
	}

	return 0;
}
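
/*
 * Rough sketch of how the reservation interfaces below are typically used
 * (illustrative only; error handling and the callers' real flag choices
 * are omitted):
 *
 *	xfs_log_ticket_t tic = NULL;
 *
 *	error = xfs_log_reserve(mp, unit_bytes, cnt, &tic,
 *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV, t_type);
 *	...build an xfs_log_iovec_t array describing the regions...
 *	error = xfs_log_write(mp, reg, nentries, tic, &start_lsn);
 *	lsn = xfs_log_done(mp, tic, &iclog, flags);
 */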

/*
 * 1. Reserve an amount of on-disk log space and return a ticket corresponding
 *    to the reservation.
 * 2. Potentially, push buffers at tail of log to disk.
 *
 * Each reservation is going to reserve extra space for a log record header.
 * When writes happen to the on-disk log, we don't subtract the length of the
 * log record header from any reservation.  By wasting space in each
 * reservation, we prevent over allocation problems.
 */
int
xfs_log_reserve(xfs_mount_t	 *mp,
		int		 unit_bytes,
		int		 cnt,
		xfs_log_ticket_t *ticket,
		__uint8_t	 client,
		uint		 flags,
		uint		 t_type)
{
	xlog_t		*log = mp->m_log;
	xlog_ticket_t	*internal_ticket;
	int		retval = 0;

	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);

	if (XLOG_FORCED_SHUTDOWN(log))
		return XFS_ERROR(EIO);

	XFS_STATS_INC(xs_try_logspace);

	if (*ticket != NULL) {
		ASSERT(flags & XFS_LOG_PERM_RESERV);
		internal_ticket = (xlog_ticket_t *)*ticket;
		xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)");
		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
		retval = xlog_regrant_write_log_space(log, internal_ticket);
	} else {
		/* may sleep if need to allocate more tickets */
		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
						  client, flags);
		internal_ticket->t_trans_type = t_type;
		*ticket = internal_ticket;
		xlog_trace_loggrant(log, internal_ticket,
			(internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ?
			"xfs_log_reserve: create new ticket (permanent trans)" :
			"xfs_log_reserve: create new ticket");
		xlog_grant_push_ail(mp,
				    (internal_ticket->t_unit_res *
				     internal_ticket->t_cnt));
		retval = xlog_grant_log_space(log, internal_ticket);
	}

	return retval;
}	/* xfs_log_reserve */


/*
 * Mount a log filesystem
 *
 * mp		- ubiquitous xfs mount point structure
 * log_target	- buftarg of on-disk log device
 * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
 * num_bblocks	- Number of BBSIZE blocks in on-disk log
 *
 * Return error or zero.
 */
int
xfs_log_mount(xfs_mount_t	*mp,
	      xfs_buftarg_t	*log_target,
	      xfs_daddr_t	blk_offset,
	      int		num_bblks)
{
	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
		cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
	else {
		cmn_err(CE_NOTE,
			"!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
			mp->m_fsname);
		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
	}

	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);

	/*
	 * skip log recovery on a norecovery mount.  pretend it all
	 * just worked.
	 */
	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
		int	error;
		vfs_t	*vfsp = XFS_MTOVFS(mp);
		int	readonly = (vfsp->vfs_flag & VFS_RDONLY);

		if (readonly)
			vfsp->vfs_flag &= ~VFS_RDONLY;

		error = xlog_recover(mp->m_log);

		if (readonly)
			vfsp->vfs_flag |= VFS_RDONLY;
		if (error) {
			cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
			xlog_unalloc_log(mp->m_log);
			return error;
		}
	}

	/* Normal transactions can now occur */
	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;

	/* End mounting message in xfs_log_mount_finish */
	return 0;
}	/* xfs_log_mount */
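
/*
 * Ordering note (informal summary of the surrounding comments): a mount
 * first calls xfs_log_mount() above, which allocates the in-core log and
 * replays the on-disk log, and only later calls xfs_log_mount_finish()
 * below, once xfs_mountfs() has read in the root and real-time bitmap
 * inodes that the remaining recovery work needs.
 */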

/*
 * Finish the recovery of the file system.  This is separate from
 * the xfs_log_mount() call, because it depends on the code in
 * xfs_mountfs() to read in the root and real-time bitmap inodes
 * between calling xfs_log_mount() and here.
 *
 * mp		- ubiquitous xfs mount point structure
 */
int
xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
{
	int	error;

	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
		error = xlog_recover_finish(mp->m_log, mfsi_flags);
	else {
		error = 0;
		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
	}

	return error;
}

/*
 * Unmount processing for the log.
 */
int
xfs_log_unmount(xfs_mount_t *mp)
{
	int		error;

	error = xfs_log_unmount_write(mp);
	xfs_log_unmount_dealloc(mp);
	return error;
}

/*
 * Final log writes as part of unmount.
 *
 * Mark the filesystem clean as unmount happens.  Note that during relocation
 * this routine needs to be executed as part of source-bag while the
 * deallocation must not be done until source-end.
 */

/*
 * Unmount record used to have a string "Unmount filesystem--" in the
 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
 * We just write the magic number now since that particular field isn't
 * currently architecture converted and "nUmount" is a bit foo.
 * As far as I know, there weren't any dependencies on the old behaviour.
 */

int
xfs_log_unmount_write(xfs_mount_t *mp)
{
	xlog_t		 *log = mp->m_log;
	xlog_in_core_t	 *iclog;
#ifdef DEBUG
	xlog_in_core_t	 *first_iclog;
#endif
	xfs_log_iovec_t  reg[1];
	xfs_log_ticket_t tic = NULL;
	xfs_lsn_t	 lsn;
	int		 error;
	SPLDECL(s);

	/* the data section must be 32 bit size aligned */
	struct {
	    __uint16_t magic;
	    __uint16_t pad1;
	    __uint32_t pad2; /* may as well make it 64 bits */
	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };

	/*
	 * Don't write out unmount record on read-only mounts.
	 * Or, if we are doing a forced umount (typically because of IO errors).
	 */
	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
		return 0;

	xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);

#ifdef DEBUG
	first_iclog = iclog = log->l_iclog;
	do {
		if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
			ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
			ASSERT(iclog->ic_offset == 0);
		}
		iclog = iclog->ic_next;
	} while (iclog != first_iclog);
#endif
	if (! (XLOG_FORCED_SHUTDOWN(log))) {
		reg[0].i_addr = (void*)&magic;
		reg[0].i_len  = sizeof(magic);
		XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);

		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0);
		if (!error) {
			/* remove inited flag */
			((xlog_ticket_t *)tic)->t_flags = 0;
			error = xlog_write(mp, reg, 1, tic, &lsn,
					   NULL, XLOG_UNMOUNT_TRANS);
			/*
			 * At this point, we're umounting anyway,
			 * so there's no point in transitioning log state
			 * to IOERROR. Just continue...
			 */
		}

		if (error) {
			xfs_fs_cmn_err(CE_ALERT, mp,
				"xfs_log_unmount: unmount record failed");
		}


		s = LOG_LOCK(log);
		iclog = log->l_iclog;
		iclog->ic_refcnt++;
		LOG_UNLOCK(log, s);
		xlog_state_want_sync(log, iclog);
		(void) xlog_state_release_iclog(log, iclog);

		s = LOG_LOCK(log);
		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
		      iclog->ic_state == XLOG_STATE_DIRTY)) {
			if (!XLOG_FORCED_SHUTDOWN(log)) {
				sv_wait(&iclog->ic_forcesema, PMEM,
					&log->l_icloglock, s);
			} else {
				LOG_UNLOCK(log, s);
			}
		} else {
			LOG_UNLOCK(log, s);
		}
		if (tic)
			xlog_state_put_ticket(log, tic);
	} else {
		/*
		 * We're already in forced_shutdown mode, couldn't
		 * even attempt to write out the unmount transaction.
		 *
		 * Go through the motions of sync'ing and releasing
		 * the iclog, even though no I/O will actually happen,
		 * we need to wait for other log I/O's that may already
		 * be in progress.  Do this as a separate section of
		 * code so we'll know if we ever get stuck here that
		 * we're in this odd situation of trying to unmount
		 * a file system that went into forced_shutdown as
		 * the result of an unmount..
		 */
		s = LOG_LOCK(log);
		iclog = log->l_iclog;
		iclog->ic_refcnt++;
		LOG_UNLOCK(log, s);

		xlog_state_want_sync(log, iclog);
		(void) xlog_state_release_iclog(log, iclog);

		s = LOG_LOCK(log);

		if ( ! (   iclog->ic_state == XLOG_STATE_ACTIVE
			|| iclog->ic_state == XLOG_STATE_DIRTY
			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {

			sv_wait(&iclog->ic_forcesema, PMEM,
				&log->l_icloglock, s);
		} else {
			LOG_UNLOCK(log, s);
		}
	}

	return 0;
}	/* xfs_log_unmount_write */

/*
 * Deallocate log structures for unmount/relocation.
 */
void
xfs_log_unmount_dealloc(xfs_mount_t *mp)
{
	xlog_unalloc_log(mp->m_log);
}

/*
 * Write region vectors to log.  The write happens using the space reservation
 * of the ticket (tic).  It is not a requirement that all writes for a given
 * transaction occur with one call to xfs_log_write().
 */
int
xfs_log_write(xfs_mount_t	*mp,
	      xfs_log_iovec_t	reg[],
	      int		nentries,
	      xfs_log_ticket_t	tic,
	      xfs_lsn_t		*start_lsn)
{
	int	error;
	xlog_t	*log = mp->m_log;

	if (XLOG_FORCED_SHUTDOWN(log))
		return XFS_ERROR(EIO);

	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
	}
	return error;
}	/* xfs_log_write */
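
/*
 * Illustration only: a caller with two regions to log might fill the iovec
 * array for the routine above roughly like this (the buffers, lengths and
 * region types chosen here are hypothetical):
 *
 *	xfs_log_iovec_t reg[2];
 *
 *	reg[0].i_addr = (void *)format_buf;
 *	reg[0].i_len  = format_len;
 *	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_BFORMAT);
 *	reg[1].i_addr = (void *)data_buf;
 *	reg[1].i_len  = data_len;
 *	XLOG_VEC_SET_TYPE(&reg[1], XLOG_REG_TYPE_BCHUNK);
 *
 *	error = xfs_log_write(mp, reg, 2, tic, &start_lsn);
 */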


void
xfs_log_move_tail(xfs_mount_t	*mp,
		  xfs_lsn_t	tail_lsn)
{
	xlog_ticket_t	*tic;
	xlog_t		*log = mp->m_log;
	int		need_bytes, free_bytes, cycle, bytes;
	SPLDECL(s);

	if (XLOG_FORCED_SHUTDOWN(log))
		return;
	ASSERT(!XFS_FORCED_SHUTDOWN(mp));

	if (tail_lsn == 0) {
		/* needed since sync_lsn is 64 bits */
		s = LOG_LOCK(log);
		tail_lsn = log->l_last_sync_lsn;
		LOG_UNLOCK(log, s);
	}

	s = GRANT_LOCK(log);

	/* Also an invalid lsn.  1 implies that we aren't passing in a valid
	 * tail_lsn.
	 */
	if (tail_lsn != 1) {
		log->l_tail_lsn = tail_lsn;
	}

	if ((tic = log->l_write_headq)) {
#ifdef DEBUG
		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
			panic("Recovery problem");
#endif
		cycle = log->l_grant_write_cycle;
		bytes = log->l_grant_write_bytes;
		free_bytes = xlog_space_left(log, cycle, bytes);
		do {
			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);

			if (free_bytes < tic->t_unit_res && tail_lsn != 1)
				break;
			tail_lsn = 0;
			free_bytes -= tic->t_unit_res;
			sv_signal(&tic->t_sema);
			tic = tic->t_next;
		} while (tic != log->l_write_headq);
	}
	if ((tic = log->l_reserve_headq)) {
#ifdef DEBUG
		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
			panic("Recovery problem");
#endif
		cycle = log->l_grant_reserve_cycle;
		bytes = log->l_grant_reserve_bytes;
		free_bytes = xlog_space_left(log, cycle, bytes);
		do {
			if (tic->t_flags & XLOG_TIC_PERM_RESERV)
				need_bytes = tic->t_unit_res*tic->t_cnt;
			else
				need_bytes = tic->t_unit_res;
			if (free_bytes < need_bytes && tail_lsn != 1)
				break;
			tail_lsn = 0;
			free_bytes -= need_bytes;
			sv_signal(&tic->t_sema);
			tic = tic->t_next;
		} while (tic != log->l_reserve_headq);
	}
	GRANT_UNLOCK(log, s);
}	/* xfs_log_move_tail */

/*
 * Determine if we have a transaction that has gone to disk
 * that needs to be covered.  Log activity needs to be idle (no AIL and
 * nothing in the iclogs).  And, we need to be in the right state indicating
 * something has gone out.
 */
int
xfs_log_need_covered(xfs_mount_t *mp)
{
	SPLDECL(s);
	int		needed = 0, gen;
	xlog_t		*log = mp->m_log;
	vfs_t		*vfsp = XFS_MTOVFS(mp);

	if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
	    (vfsp->vfs_flag & VFS_RDONLY))
		return 0;

	s = LOG_LOCK(log);
	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
			&& !xfs_trans_first_ail(mp, &gen)
			&& xlog_iclogs_empty(log)) {
		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
			log->l_covered_state = XLOG_STATE_COVER_DONE;
		else {
			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
			log->l_covered_state = XLOG_STATE_COVER_DONE2;
		}
		needed = 1;
	}
	LOG_UNLOCK(log, s);
	return needed;
}

/******************************************************************************
 *
 *	local routines
 *
 ******************************************************************************
 */

/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
 * The log manager must keep track of the last LR which was committed
 * to disk.  The lsn of this LR will become the new tail_lsn whenever
 * xfs_trans_tail_ail returns 0.  If we don't do this, we run into
 * the situation where stuff could be written into the log but nothing
 * was ever in the AIL when asked.  Eventually, we panic since the
 * tail hits the head.
 *
 * We may be holding the log iclog lock upon entering this routine.
 */
xfs_lsn_t
xlog_assign_tail_lsn(xfs_mount_t *mp)
{
	xfs_lsn_t	tail_lsn;
	SPLDECL(s);
	xlog_t		*log = mp->m_log;

	tail_lsn = xfs_trans_tail_ail(mp);
	s = GRANT_LOCK(log);
	if (tail_lsn != 0) {
		log->l_tail_lsn = tail_lsn;
	} else {
		tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
	}
	GRANT_UNLOCK(log, s);

	return tail_lsn;
}	/* xlog_assign_tail_lsn */


/*
 * Return the space in the log between the tail and the head.  The head
 * is passed in the cycle/bytes formal parms.  In the special case where
 * the reserve head has wrapped past the tail, this calculation is no
 * longer valid.  In this case, just return 0 which means there is no space
 * in the log.  This works for all places where this function is called
 * with the reserve head.  Of course, if the write head were to ever
 * wrap the tail, we should blow up.  Rather than catch this case here,
 * we depend on other ASSERTions in other parts of the code.   XXXmiken
 *
 * This code also handles the case where the reservation head is behind
 * the tail.  The details of this case are described below, but the end
 * result is that we return the size of the log as the amount of space left.
 */
int
xlog_space_left(xlog_t *log, int cycle, int bytes)
{
	int free_bytes;
	int tail_bytes;
	int tail_cycle;

	tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
	tail_cycle = CYCLE_LSN(log->l_tail_lsn);
	if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
		free_bytes = log->l_logsize - (bytes - tail_bytes);
	} else if ((tail_cycle + 1) < cycle) {
		return 0;
	} else if (tail_cycle < cycle) {
		ASSERT(tail_cycle == (cycle - 1));
		free_bytes = tail_bytes - bytes;
	} else {
		/*
		 * The reservation head is behind the tail.
		 * In this case we just want to return the size of the
		 * log as the amount of space left.
		 */
		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
			"xlog_space_left: head behind tail\n"
			"  tail_cycle = %d, tail_bytes = %d\n"
			"  GH   cycle = %d, GH   bytes = %d",
			tail_cycle, tail_bytes, cycle, bytes);
		ASSERT(0);
		free_bytes = log->l_logsize;
	}
	return free_bytes;
}	/* xlog_space_left */
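
/*
 * Worked example for the cases above (illustrative numbers): with a
 * 1000 byte log, a tail at cycle 5 / byte 300 and a head at cycle 5 /
 * byte 700, the head is ahead of the tail in the same cycle, so free
 * space is 1000 - (700 - 300) = 600 bytes.  If the head has wrapped to
 * cycle 6 / byte 100, free space is 300 - 100 = 200 bytes.  A head two
 * or more cycles ahead of the tail means the log is over-committed and
 * 0 is returned.
 */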

/*
 * Log function which is called when an io completes.
 *
 * The log manager needs its own routine, in order to control what
 * happens with the buffer after the write completes.
 */
void
xlog_iodone(xfs_buf_t *bp)
{
	xlog_in_core_t	*iclog;
	xlog_t		*l;
	int		aborted;

	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
	aborted = 0;

	/*
	 * Some versions of cpp barf on the recursive definition of
	 * ic_log -> hic_fields.ic_log and expand ic_log twice when
	 * it is passed through two macros.  Workaround broken cpp.
	 */
	l = iclog->ic_log;

	/*
	 * Race to shutdown the filesystem if we see an error.
	 */
	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
		XFS_BUF_STALE(bp);
		xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
		/*
		 * This flag will be propagated to the trans-committed
		 * callback routines to let them know that the log-commit
		 * didn't succeed.
		 */
		aborted = XFS_LI_ABORTED;
	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
		aborted = XFS_LI_ABORTED;
	}
	xlog_state_done_syncing(iclog, aborted);
	if (!(XFS_BUF_ISASYNC(bp))) {
		/*
		 * Corresponding psema() will be done in bwrite().  If we don't
		 * vsema() here, panic.
		 */
		XFS_BUF_V_IODONESEMA(bp);
	}
}	/* xlog_iodone */

/*
 * The bdstrat callback function for log bufs.  This gives us a central
 * place to trap bufs in case we get hit by a log I/O error and need to
 * shutdown.  Actually, in practice, even when we didn't get a log error,
 * we transition the iclogs to IOERROR state *after* flushing all existing
 * iclogs to disk.  This is because we don't want any more new transactions
 * to be started or completed afterwards.
 */
STATIC int
xlog_bdstrat_cb(struct xfs_buf *bp)
{
	xlog_in_core_t *iclog;

	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);

	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
		/* note for irix bstrat will need struct bdevsw passed
		 * Fix the following macro if the code ever is merged
		 */
		XFS_bdstrat(bp);
		return 0;
	}

	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
	XFS_BUF_ERROR(bp, EIO);
	XFS_BUF_STALE(bp);
	xfs_biodone(bp);
	return XFS_ERROR(EIO);


}
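
/*
 * Informal summary of the sizing policy implemented below (derived from
 * the comments and code in xlog_get_iclog_buffer_size(); the exact
 * constants live in xfs_log_priv.h): machines with up to 128MB, up to
 * 400MB, and more than 400MB of memory get XLOG_MIN_ICLOGS, XLOG_MED_ICLOGS
 * and XLOG_MAX_ICLOGS buffers respectively; the iclog size is 16KB on
 * machines with 32MB of memory or less and 32KB otherwise.  Explicit
 * logbufs/logbsize mount options override these defaults, and large
 * filesystem block sizes push both values up.
 */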

/*
 * Return size of each in-core log record buffer.
 *
 * Low memory machines only get 2 16KB buffers.  We don't want to waste
 * memory here.  However, all other machines get at least 2 32KB buffers.
 * The number is hard coded because we don't care about the minimum
 * memory size, just 32MB systems.
 *
 * If the filesystem blocksize is too large, we may need to choose a
 * larger size since the directory code currently logs entire blocks.
 */

STATIC void
xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
			   xlog_t	*log)
{
	int size;
	int xhdrs;

	if (mp->m_logbufs <= 0) {
		if (xfs_physmem <= btoc(128*1024*1024)) {
			log->l_iclog_bufs = XLOG_MIN_ICLOGS;
		} else if (xfs_physmem <= btoc(400*1024*1024)) {
			log->l_iclog_bufs = XLOG_MED_ICLOGS;
		} else {	/* 256K with 32K bufs */
			log->l_iclog_bufs = XLOG_MAX_ICLOGS;
		}
	} else {
		log->l_iclog_bufs = mp->m_logbufs;
	}

	/*
	 * Buffer size passed in from mount system call.
	 */
	if (mp->m_logbsize > 0) {
		size = log->l_iclog_size = mp->m_logbsize;
		log->l_iclog_size_log = 0;
		while (size != 1) {
			log->l_iclog_size_log++;
			size >>= 1;
		}

		if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
			/* # headers = size / 32K
			 * one header holds cycles from 32K of data
			 */

			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
				xhdrs++;
			log->l_iclog_hsize = xhdrs << BBSHIFT;
			log->l_iclog_heads = xhdrs;
		} else {
			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
			log->l_iclog_hsize = BBSIZE;
			log->l_iclog_heads = 1;
		}
		goto done;
	}

	/*
	 * Special case machines that have less than 32MB of memory.
	 * All machines with more memory use 32KB buffers.
	 */
	if (xfs_physmem <= btoc(32*1024*1024)) {
		/* Don't change; min configuration */
		log->l_iclog_size = XLOG_RECORD_BSIZE;		/* 16k */
		log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
	} else {
		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;	/* 32k */
		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
	}

	/* the default log size is 16k or 32k which is one header sector */
	log->l_iclog_hsize = BBSIZE;
	log->l_iclog_heads = 1;

	/*
	 * For 16KB, we use 3 32KB buffers.  For 32KB block sizes, we use
	 * 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
	 */
	if (mp->m_sb.sb_blocksize >= 16*1024) {
		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
		if (mp->m_logbufs <= 0) {
			switch (mp->m_sb.sb_blocksize) {
			case 16*1024:			/* 16 KB */
				log->l_iclog_bufs = 3;
				break;
			case 32*1024:			/* 32 KB */
				log->l_iclog_bufs = 4;
				break;
			case 64*1024:			/* 64 KB */
				log->l_iclog_bufs = 8;
				break;
			default:
				xlog_panic("XFS: Invalid blocksize");
				break;
			}
		}
	}

done:	/* are we being asked to make the sizes selected above visible? */
	if (mp->m_logbufs == 0)
		mp->m_logbufs = log->l_iclog_bufs;
	if (mp->m_logbsize == 0)
		mp->m_logbsize = log->l_iclog_size;
}	/* xlog_get_iclog_buffer_size */


/*
 * This routine initializes some of the log structure for a given mount point.
 * Its primary purpose is to fill in enough, so recovery can occur.  However,
 * some other stuff may be filled in too.
 */
STATIC xlog_t *
xlog_alloc_log(xfs_mount_t	*mp,
	       xfs_buftarg_t	*log_target,
	       xfs_daddr_t	blk_offset,
	       int		num_bblks)
{
	xlog_t			*log;
	xlog_rec_header_t	*head;
	xlog_in_core_t		**iclogp;
	xlog_in_core_t		*iclog, *prev_iclog=NULL;
	xfs_buf_t		*bp;
	int			i;
	int			iclogsize;

	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);

	log->l_mp	   = mp;
	log->l_targ	   = log_target;
	log->l_logsize     = BBTOB(num_bblks);
	log->l_logBBstart  = blk_offset;
	log->l_logBBsize   = num_bblks;
	log->l_covered_state = XLOG_STATE_COVER_IDLE;
	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;

	log->l_prev_block  = -1;
	ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0);
	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
	log->l_last_sync_lsn = log->l_tail_lsn;
	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
	log->l_grant_reserve_cycle = 1;
	log->l_grant_write_cycle = 1;

	if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
		ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
		/* for larger sector sizes, must have v2 or external log */
		ASSERT(log->l_sectbb_log == 0 ||
			log->l_logBBstart == 0 ||
			XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
		ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
	}
	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;

	xlog_get_iclog_buffer_size(mp, log);

	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
	ASSERT(XFS_BUF_ISBUSY(bp));
	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
	log->l_xbuf = bp;

	spinlock_init(&log->l_icloglock, "iclog");
	spinlock_init(&log->l_grant_lock, "grhead_iclog");
	initnsema(&log->l_flushsema, 0, "ic-flush");
	xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */

	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);

	iclogp = &log->l_iclog;
	/*
	 * The amount of memory to allocate for the iclog structure is
	 * rather funky due to the way the structure is defined.  It is
	 * done this way so that we can use different sizes for machines
	 * with different amounts of memory.  See the definition of
	 * xlog_in_core_t in xfs_log_priv.h for details.
	 */
	iclogsize = log->l_iclog_size;
	ASSERT(log->l_iclog_size >= 4096);
	for (i=0; i < log->l_iclog_bufs; i++) {
		*iclogp = (xlog_in_core_t *)
			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
		iclog = *iclogp;
		iclog->hic_data = (xlog_in_core_2_t *)
			  kmem_zalloc(iclogsize, KM_SLEEP);

		iclog->ic_prev = prev_iclog;
		prev_iclog = iclog;
		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);

		head = &iclog->ic_header;
		memset(head, 0, sizeof(xlog_rec_header_t));
		INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
		INT_SET(head->h_version, ARCH_CONVERT,
			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
		INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
		/* new fields */
		INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));

		bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
		iclog->ic_bp = bp;

		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
		iclog->ic_state = XLOG_STATE_ACTIVE;
		iclog->ic_log = log;
		iclog->ic_callback_tail = &(iclog->ic_callback);
		iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;

		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
		ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
		sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
		sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");

		iclogp = &iclog->ic_next;
	}
	*iclogp = log->l_iclog;			/* complete ring */
	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */

	return log;
}	/* xlog_alloc_log */


/*
 * Write out the commit record of a transaction associated with the given
 * ticket.  Return the lsn of the commit record.
 */
STATIC int
xlog_commit_record(xfs_mount_t	*mp,
		   xlog_ticket_t *ticket,
		   xlog_in_core_t **iclog,
		   xfs_lsn_t	*commitlsnp)
{
	int		error;
	xfs_log_iovec_t	reg[1];

	reg[0].i_addr = NULL;
	reg[0].i_len = 0;
	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);

	ASSERT_ALWAYS(iclog);
	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
			       iclog, XLOG_COMMIT_TRANS))) {
		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
	}
	return error;
}	/* xlog_commit_record */


/*
 * Push on the buffer cache code if we ever use more than 75% of the on-disk
 * log space.  This code pushes on the lsn which would supposedly free up
 * the 25% which we want to leave free.  We may need to adopt a policy which
 * pushes on an lsn which is further along in the log once we reach the high
 * water mark.  In this manner, we would be creating a low water mark.
 */
void
xlog_grant_push_ail(xfs_mount_t	*mp,
		    int		need_bytes)
{
	xlog_t	*log = mp->m_log;	/* pointer to the log */
	xfs_lsn_t tail_lsn;		/* lsn of the log tail */
	xfs_lsn_t threshold_lsn = 0;	/* lsn we'd like to be at */
	int	  free_blocks;		/* free blocks left to write to */
	int	  free_bytes;		/* free bytes left to write to */
	int	  threshold_block;	/* block in lsn we'd like to be at */
	int	  threshold_cycle;	/* lsn cycle we'd like to be at */
	int	  free_threshold;
	SPLDECL(s);

	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);

	s = GRANT_LOCK(log);
	free_bytes = xlog_space_left(log,
				     log->l_grant_reserve_cycle,
				     log->l_grant_reserve_bytes);
	tail_lsn = log->l_tail_lsn;
	free_blocks = BTOBBT(free_bytes);

	/*
	 * Set the threshold for the minimum number of free blocks in the
	 * log to the maximum of what the caller needs, one quarter of the
	 * log, and 256 blocks.
	 */
	free_threshold = BTOBB(need_bytes);
	free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
	free_threshold = MAX(free_threshold, 256);
	if (free_blocks < free_threshold) {
		threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
		threshold_cycle = CYCLE_LSN(tail_lsn);
		if (threshold_block >= log->l_logBBsize) {
			threshold_block -= log->l_logBBsize;
			threshold_cycle += 1;
		}
		ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle,
				    threshold_block);

		/* Don't pass in an lsn greater than the lsn of the last
		 * log record known to be on disk.
		 */
		if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
			threshold_lsn = log->l_last_sync_lsn;
	}
	GRANT_UNLOCK(log, s);

	/*
	 * Get the transaction layer to kick the dirty buffers out to
	 * disk asynchronously.  No point in trying to do this if
	 * the filesystem is shutting down.
	 */
	if (threshold_lsn &&
	    !XLOG_FORCED_SHUTDOWN(log))
		xfs_trans_push_ail(mp, threshold_lsn);
}	/* xlog_grant_push_ail */
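
/*
 * Worked example of the threshold computed above (illustrative numbers):
 * for a 10000 block log with a caller needing 64 blocks, the threshold is
 * max(64, 10000/4, 256) = 2500 blocks.  Only when fewer than 2500 blocks
 * remain between the reserve grant head and the tail is the AIL pushed,
 * to a target lsn 2500 blocks past the current tail (capped at the last
 * lsn known to be on disk).
 */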


/*
 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
 * fashion.  Previously, we should have moved the current iclog
 * ptr in the log to point to the next available iclog.  This allows further
 * write to continue while this code syncs out an iclog ready to go.
 * Before an in-core log can be written out, the data section must be scanned
 * to save away the 1st word of each BBSIZE block into the header.  We replace
 * it with the current cycle count.  Each BBSIZE block is tagged with the
 * cycle count because there is an implicit assumption that drives will
 * guarantee that entire 512 byte blocks get written at once.  In other words,
 * we can't have part of a 512 byte block written and part not written.  By
 * tagging each block, we will know which blocks are valid when recovering
 * after an unclean shutdown.
 *
 * This routine is single threaded on the iclog.  No other thread can be in
 * this routine with the same iclog.  Changing contents of iclog can there-
 * fore be done without grabbing the state machine lock.  Updating the global
 * log will require grabbing the lock though.
 *
 * The entire log manager uses a logical block numbering scheme.  Only
 * log_sync (and then only bwrite()) know about the fact that the log may
 * not start with block zero on a given device.  The log block start offset
 * is added immediately before calling bwrite().
 */

int
xlog_sync(xlog_t		*log,
	  xlog_in_core_t	*iclog)
{
	xfs_caddr_t	dptr;		/* pointer to byte sized element */
	xfs_buf_t	*bp;
	int		i, ops;
	uint		count;		/* byte count of bwrite */
	uint		count_init;	/* initial count before roundup */
	int		roundoff;	/* roundoff to BB or stripe */
	int		split = 0;	/* split write into two regions */
	int		error;
	SPLDECL(s);
	int		v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);

	XFS_STATS_INC(xs_log_writes);
	ASSERT(iclog->ic_refcnt == 0);

	/* Add for LR header */
	count_init = log->l_iclog_hsize + iclog->ic_offset;

	/* Round out the log write size */
	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
		/* we have a v2 stripe unit to use */
		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
	} else {
		count = BBTOB(BTOBB(count_init));
	}
	roundoff = count - count_init;
	ASSERT(roundoff >= 0);
	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
		roundoff < log->l_mp->m_sb.sb_logsunit)
		||
		(log->l_mp->m_sb.sb_logsunit <= 1 &&
		 roundoff < BBTOB(1)));

	/* move grant heads by roundoff in sync */
	s = GRANT_LOCK(log);
	xlog_grant_add_space(log, roundoff);
	GRANT_UNLOCK(log, s);

	/* put cycle number in every block */
	xlog_pack_data(log, iclog, roundoff);

	/* real byte length */
	if (v2) {
		INT_SET(iclog->ic_header.h_len,
			ARCH_CONVERT,
			iclog->ic_offset + roundoff);
	} else {
		INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
	}

	/* put ops count in correct order */
	ops = iclog->ic_header.h_num_logops;
	INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);

	bp = iclog->ic_bp;
	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));

	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));

	/* Do we need to split this write into 2 parts? */
	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
		iclog->ic_bwritecnt = 2;	/* split into 2 writes */
	} else {
		iclog->ic_bwritecnt = 1;
	}
	XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
	XFS_BUF_SET_FSPRIVATE(bp, iclog);	/* save for later */
	XFS_BUF_BUSY(bp);
	XFS_BUF_ASYNC(bp);
	/*
	 * Do an ordered write for the log block.
	 *
	 * It may not be needed to flush the first split block in the log wrap
	 * case, but do it anyways to be safe -AK
	 */
	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
		XFS_BUF_ORDERED(bp);

	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);

	xlog_verify_iclog(log, iclog, count, B_TRUE);

	/* account for log which doesn't start at block #0 */
	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
	/*
	 * Don't call xfs_bwrite here.  We do log-syncs even when the filesystem
	 * is shutting down.
	 */
	XFS_BUF_WRITE(bp);

	if ((error = XFS_bwrite(bp))) {
		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
				  XFS_BUF_ADDR(bp));
		return error;
	}
	if (split) {
		bp = iclog->ic_log->l_xbuf;
		ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
							(unsigned long)1);
		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
		XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
					    (__psint_t)count), split);
		XFS_BUF_SET_FSPRIVATE(bp, iclog);
		XFS_BUF_BUSY(bp);
		XFS_BUF_ASYNC(bp);
		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
			XFS_BUF_ORDERED(bp);
		dptr = XFS_BUF_PTR(bp);
		/*
		 * Bump the cycle numbers at the start of each block
		 * since this part of the buffer is at the start of
		 * a new cycle.  Watch out for the header magic number
		 * case, though.
		 */
		for (i=0; i<split; i += BBSIZE) {
			INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
			if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
				INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
			dptr += BBSIZE;
		}

		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);

		/* account for internal log which doesn't start at block #0 */
		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
		XFS_BUF_WRITE(bp);
		if ((error = XFS_bwrite(bp))) {
			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
					  bp, XFS_BUF_ADDR(bp));
			return error;
		}
	}
	return 0;
}	/* xlog_sync */


/*
 * Unallocate a log structure
 */
void
xlog_unalloc_log(xlog_t *log)
{
	xlog_in_core_t	*iclog, *next_iclog;
	xlog_ticket_t	*tic, *next_tic;
	int		i;


	iclog = log->l_iclog;
	for (i=0; i<log->l_iclog_bufs; i++) {
		sv_destroy(&iclog->ic_forcesema);
		sv_destroy(&iclog->ic_writesema);
		xfs_buf_free(iclog->ic_bp);
#ifdef XFS_LOG_TRACE
		if (iclog->ic_trace != NULL) {
			ktrace_free(iclog->ic_trace);
		}
#endif
		next_iclog = iclog->ic_next;
		kmem_free(iclog->hic_data, log->l_iclog_size);
		kmem_free(iclog, sizeof(xlog_in_core_t));
		iclog = next_iclog;
	}
	freesema(&log->l_flushsema);
	spinlock_destroy(&log->l_icloglock);
	spinlock_destroy(&log->l_grant_lock);

	/* XXXsup take a look at this again. */
	if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
	    !XLOG_FORCED_SHUTDOWN(log)) {
		xfs_fs_cmn_err(CE_WARN, log->l_mp,
			"xlog_unalloc_log: (cnt: %d, total: %d)",
			log->l_ticket_cnt, log->l_ticket_tcnt);
		/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */

	} else {
		tic = log->l_unmount_free;
		while (tic) {
			next_tic = tic->t_next;
			kmem_free(tic, NBPP);
			tic = next_tic;
		}
	}
	xfs_buf_free(log->l_xbuf);
#ifdef XFS_LOG_TRACE
	if (log->l_trace != NULL) {
		ktrace_free(log->l_trace);
	}
	if (log->l_grant_trace != NULL) {
		ktrace_free(log->l_grant_trace);
	}
#endif
	log->l_mp->m_log = NULL;
	kmem_free(log, sizeof(xlog_t));
}	/* xlog_unalloc_log */

/*
 * Update counters atomically now that memcpy is done.
 */
/* ARGSUSED */
static inline void
xlog_state_finish_copy(xlog_t		*log,
		       xlog_in_core_t	*iclog,
		       int		record_cnt,
		       int		copy_bytes)
{
	SPLDECL(s);

	s = LOG_LOCK(log);

	iclog->ic_header.h_num_logops += record_cnt;
	iclog->ic_offset += copy_bytes;

	LOG_UNLOCK(log, s);
}	/* xlog_state_finish_copy */




/*
 * print out info relating to regions written which consume
 * the reservation
 */
STATIC void
xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
{
	uint i;
	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);

	/* match with XLOG_REG_TYPE_* in xfs_log.h */
	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
	    "bformat",
	    "bchunk",
	    "efi_format",
	    "efd_format",
	    "iformat",
	    "icore",
	    "iext",
	    "ibroot",
	    "ilocal",
	    "iattr_ext",
	    "iattr_broot",
	    "iattr_local",
	    "qformat",
	    "dquot",
	    "quotaoff",
	    "LR header",
	    "unmount",
	    "commit",
	    "trans header"
	};
	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
	    "SETATTR_NOT_SIZE",
	    "SETATTR_SIZE",
	    "INACTIVE",
	    "CREATE",
	    "CREATE_TRUNC",
	    "TRUNCATE_FILE",
	    "REMOVE",
	    "LINK",
	    "RENAME",
	    "MKDIR",
	    "RMDIR",
	    "SYMLINK",
	    "SET_DMATTRS",
	    "GROWFS",
	    "STRAT_WRITE",
	    "DIOSTRAT",
	    "WRITE_SYNC",
	    "WRITEID",
	    "ADDAFORK",
	    "ATTRINVAL",
	    "ATRUNCATE",
	    "ATTR_SET",
	    "ATTR_RM",
	    "ATTR_FLAG",
	    "CLEAR_AGI_BUCKET",
	    "QM_SBCHANGE",
	    "DUMMY1",
	    "DUMMY2",
	    "QM_QUOTAOFF",
	    "QM_DQALLOC",
	    "QM_SETQLIM",
	    "QM_DQCLUSTER",
	    "QM_QINOCREATE",
	    "QM_QUOTAOFF_END",
	    "SB_UNIT",
	    "FSYNC_TS",
	    "GROWFSRT_ALLOC",
	    "GROWFSRT_ZERO",
	    "GROWFSRT_FREE",
	    "SWAPEXT"
	};

	xfs_fs_cmn_err(CE_WARN, mp,
			"xfs_log_write: reservation summary:\n"
			"  trans type  = %s (%u)\n"
			"  unit res    = %d bytes\n"
			"  current res = %d bytes\n"
			"  total reg   = %u bytes (o/flow = %u bytes)\n"
			"  ophdrs      = %u (ophdr space = %u bytes)\n"
			"  ophdr + reg = %u bytes\n"
			"  num regions = %u\n",
			((ticket->t_trans_type <= 0 ||
			  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
			   "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
			ticket->t_trans_type,
			ticket->t_unit_res,
			ticket->t_curr_res,
			ticket->t_res_arr_sum, ticket->t_res_o_flow,
			ticket->t_res_num_ophdrs, ophdr_spc,
			ticket->t_res_arr_sum +
			ticket->t_res_o_flow + ophdr_spc,
			ticket->t_res_num);

	for (i = 0; i < ticket->t_res_num; i++) {
		uint r_type = ticket->t_res_arr[i].r_type;
		cmn_err(CE_WARN,
			    "region[%u]: %s - %u bytes\n",
			    i,
			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
				"bad-rtype" : res_type_str[r_type-1]),
			    ticket->t_res_arr[i].r_len);
	}
}
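
/*
 * Reading the summary printed above (informal note): "total reg" is the sum
 * of the region payload lengths recorded on the ticket, "ophdr space" is
 * t_res_num_ophdrs * sizeof(xlog_op_header_t), and "ophdr + reg" is roughly
 * their sum plus any overflow bytes, i.e. what has actually been charged
 * against the reservation by the writes described in the region list.
 */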

/*
 * Write some region out to in-core log
 *
 * This will be called when writing externally provided regions or when
 * writing out a commit record for a given transaction.
 *
 * General algorithm:
 *	1. Find total length of this write.  This may include adding to the
 *		lengths passed in.
 *	2. Check whether we violate the ticket's reservation.
 *	3. While writing to this iclog
 *	    A. Reserve as much space in this iclog as can get
 *	    B. If this is first write, save away start lsn
 *	    C. While writing this region:
 *		1. If first write of transaction, write start record
 *		2. Write log operation header (header per region)
 *		3. Find out if we can fit entire region into this iclog
 *		4. Potentially, verify destination memcpy ptr
 *		5. Memcpy (partial) region
 *		6. If partial copy, release iclog; otherwise, continue
 *			copying more regions into current iclog
 *	4. Mark want sync bit (in simulation mode)
 *	5. Release iclog for potential flush to on-disk log.
 *
 * ERRORS:
 * 1.	Panic if reservation is overrun.  This should never happen since
 *	reservation amounts are generated internal to the filesystem.
 * NOTES:
 * 1. Tickets are single threaded data structures.
 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
 *	syncing routine.  When a single log_write region needs to span
 *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
 *	on all log operation writes which don't contain the end of the
 *	region.  The XLOG_END_TRANS bit is used for the in-core log
 *	operation which contains the end of the continued log_write region.
 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
 *	we don't really know exactly how much space will be used.  As a result,
 *	we don't update ic_offset until the end when we know exactly how many
 *	bytes have been written out.
 */
int
xlog_write(xfs_mount_t		*mp,
	   xfs_log_iovec_t	reg[],
	   int			nentries,
	   xfs_log_ticket_t	tic,
	   xfs_lsn_t		*start_lsn,
	   xlog_in_core_t	**commit_iclog,
	   uint			flags)
{
	xlog_t		 *log    = mp->m_log;
	xlog_ticket_t	 *ticket = (xlog_ticket_t *)tic;
	xlog_op_header_t *logop_head;	 /* ptr to log operation header */
	xlog_in_core_t	 *iclog;	 /* ptr to current in-core log */
	__psint_t	 ptr;		 /* copy address into data region */
	int		 len;		 /* # xlog_write() bytes 2 still copy */
	int		 index;		 /* region index currently copying */
	int		 log_offset;	 /* offset (from 0) into data region */
	int		 start_rec_copy; /* # bytes to copy for start record */
	int		 partial_copy;	 /* did we split a region? */
	int		 partial_copy_len;/* # bytes copied if split region */
	int		 need_copy;	 /* # bytes need to memcpy this region */
	int		 copy_len;	 /* # bytes actually memcpy'ing */
	int		 copy_off;	 /* # bytes from entry start */
	int		 contwr;	 /* continued write of in-core log? */
	int		 error;
	int		 record_cnt = 0, data_cnt = 0;

	partial_copy_len = partial_copy = 0;

	/* Calculate potential maximum space.  Each region gets its own
	 * xlog_op_header_t and may need to be double word aligned.
	 */
	len = 0;
	if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
		len += sizeof(xlog_op_header_t);
		XLOG_TIC_ADD_OPHDR(ticket);
	}

	for (index = 0; index < nentries; index++) {
		len += sizeof(xlog_op_header_t);    /* each region gets >= 1 */
		XLOG_TIC_ADD_OPHDR(ticket);
		len += reg[index].i_len;
		XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type);
	}
	contwr = *start_lsn = 0;
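
	/*
	 * Illustrative example of the length just computed: a first write of
	 * a transaction with two regions of 128 and 512 bytes reserves space
	 * for a start record op header, one op header per region, and the
	 * region payloads, i.e. len = 3 * sizeof(xlog_op_header_t) + 128 + 512.
	 */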
Need to up reservation"); 1792 /* If we did not panic, shutdown the filesystem */ 1793 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 1794 #endif 1795 } else 1796 ticket->t_curr_res -= len; 1797 1798 for (index = 0; index < nentries; ) { 1799 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1800 &contwr, &log_offset))) 1801 return error; 1802 1803 ASSERT(log_offset <= iclog->ic_size - 1); 1804 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1805 1806 /* start_lsn is the first lsn written to. That's all we need. */ 1807 if (! *start_lsn) 1808 *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); 1809 1810 /* This loop writes out as many regions as can fit in the amount 1811 * of space which was allocated by xlog_state_get_iclog_space(). 1812 */ 1813 while (index < nentries) { 1814 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); 1815 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); 1816 start_rec_copy = 0; 1817 1818 /* If first write for transaction, insert start record. 1819 * We can't be trying to commit if we are inited. We can't 1820 * have any "partial_copy" if we are inited. 1821 */ 1822 if (ticket->t_flags & XLOG_TIC_INITED) { 1823 logop_head = (xlog_op_header_t *)ptr; 1824 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); 1825 logop_head->oh_clientid = ticket->t_clientid; 1826 logop_head->oh_len = 0; 1827 logop_head->oh_flags = XLOG_START_TRANS; 1828 logop_head->oh_res2 = 0; 1829 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ 1830 record_cnt++; 1831 1832 start_rec_copy = sizeof(xlog_op_header_t); 1833 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); 1834 } 1835 1836 /* Copy log operation header directly into data section */ 1837 logop_head = (xlog_op_header_t *)ptr; 1838 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); 1839 logop_head->oh_clientid = ticket->t_clientid; 1840 logop_head->oh_res2 = 0; 1841 1842 /* header copied directly */ 1843 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1844 1845 /* are we copying a commit or unmount record? */ 1846 logop_head->oh_flags = flags; 1847 1848 /* 1849 * We've seen logs corrupted with bad transaction client 1850 * ids. This makes sure that XFS doesn't generate them on. 1851 * Turn this into an EIO and shut down the filesystem. 1852 */ 1853 switch (logop_head->oh_clientid) { 1854 case XFS_TRANSACTION: 1855 case XFS_VOLUME: 1856 case XFS_LOG: 1857 break; 1858 default: 1859 xfs_fs_cmn_err(CE_WARN, mp, 1860 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1861 logop_head->oh_clientid, tic); 1862 return XFS_ERROR(EIO); 1863 } 1864 1865 /* Partial write last time? => (partial_copy != 0) 1866 * need_copy is the amount we'd like to copy if everything could 1867 * fit in the current memcpy. 
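 * For illustration (hypothetical sizes): if a 4096-byte region had its first 1024 bytes copied into the previous iclog, partial_copy_len is 1024 here, so need_copy below becomes 3072 and copy_off starts the memcpy 1024 bytes into the region.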
1868 */ 1869 need_copy = reg[index].i_len - partial_copy_len; 1870 1871 copy_off = partial_copy_len; 1872 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 1873 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy); 1874 if (partial_copy) 1875 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); 1876 partial_copy_len = partial_copy = 0; 1877 } else { /* partial write */ 1878 copy_len = iclog->ic_size - log_offset; 1879 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len); 1880 logop_head->oh_flags |= XLOG_CONTINUE_TRANS; 1881 if (partial_copy) 1882 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; 1883 partial_copy_len += copy_len; 1884 partial_copy++; 1885 len += sizeof(xlog_op_header_t); /* from splitting of region */ 1886 /* account for new log op header */ 1887 ticket->t_curr_res -= sizeof(xlog_op_header_t); 1888 XLOG_TIC_ADD_OPHDR(ticket); 1889 } 1890 xlog_verify_dest_ptr(log, ptr); 1891 1892 /* copy region */ 1893 ASSERT(copy_len >= 0); 1894 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); 1895 xlog_write_adv_cnt(ptr, len, log_offset, copy_len); 1896 1897 /* make copy_len total bytes copied, including headers */ 1898 copy_len += start_rec_copy + sizeof(xlog_op_header_t); 1899 record_cnt++; 1900 data_cnt += contwr ? copy_len : 0; 1901 if (partial_copy) { /* copied partial region */ 1902 /* already marked WANT_SYNC by xlog_state_get_iclog_space */ 1903 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1904 record_cnt = data_cnt = 0; 1905 if ((error = xlog_state_release_iclog(log, iclog))) 1906 return error; 1907 break; /* don't increment index */ 1908 } else { /* copied entire region */ 1909 index++; 1910 partial_copy_len = partial_copy = 0; 1911 1912 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 1913 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1914 record_cnt = data_cnt = 0; 1915 xlog_state_want_sync(log, iclog); 1916 if (commit_iclog) { 1917 ASSERT(flags & XLOG_COMMIT_TRANS); 1918 *commit_iclog = iclog; 1919 } else if ((error = xlog_state_release_iclog(log, iclog))) 1920 return error; 1921 if (index == nentries) 1922 return 0; /* we are done */ 1923 else 1924 break; 1925 } 1926 } /* if (partial_copy) */ 1927 } /* while (index < nentries) */ 1928 } /* for (index = 0; index < nentries; ) */ 1929 ASSERT(len == 0); 1930 1931 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1932 if (commit_iclog) { 1933 ASSERT(flags & XLOG_COMMIT_TRANS); 1934 *commit_iclog = iclog; 1935 return 0; 1936 } 1937 return xlog_state_release_iclog(log, iclog); 1938 } /* xlog_write */ 1939 1940 1941 /***************************************************************************** 1942 * 1943 * State Machine functions 1944 * 1945 ***************************************************************************** 1946 */ 1947 1948 /* Clean iclogs starting from the head. This ordering must be 1949 * maintained, so an iclog doesn't become ACTIVE beyond one that 1950 * is SYNCING. This is also required to maintain the notion that we use 1951 * a counting semaphore to hold off would be writers to the log when every 1952 * iclog is trying to sync to disk. 
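 * (The counting semaphore in question is l_flushsema: xlog_state_get_iclog_space() sleeps on it with psema() when no iclog is ACTIVE, and xlog_state_do_callback() issues one vsema() per queued waiter once the head iclog is ACTIVE again or the log has hit an I/O error.)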
1953 * 1954 * State Change: DIRTY -> ACTIVE 1955 */ 1956 STATIC void 1957 xlog_state_clean_log(xlog_t *log) 1958 { 1959 xlog_in_core_t *iclog; 1960 int changed = 0; 1961 1962 iclog = log->l_iclog; 1963 do { 1964 if (iclog->ic_state == XLOG_STATE_DIRTY) { 1965 iclog->ic_state = XLOG_STATE_ACTIVE; 1966 iclog->ic_offset = 0; 1967 iclog->ic_callback = NULL; /* don't need to free */ 1968 /* 1969 * If the number of ops in this iclog indicate it just 1970 * contains the dummy transaction, we can 1971 * change state into IDLE (the second time around). 1972 * Otherwise we should change the state into 1973 * NEED a dummy. 1974 * We don't need to cover the dummy. 1975 */ 1976 if (!changed && 1977 (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) { 1978 changed = 1; 1979 } else { 1980 /* 1981 * We have two dirty iclogs so start over 1982 * This could also be num of ops indicates 1983 * this is not the dummy going out. 1984 */ 1985 changed = 2; 1986 } 1987 iclog->ic_header.h_num_logops = 0; 1988 memset(iclog->ic_header.h_cycle_data, 0, 1989 sizeof(iclog->ic_header.h_cycle_data)); 1990 iclog->ic_header.h_lsn = 0; 1991 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) 1992 /* do nothing */; 1993 else 1994 break; /* stop cleaning */ 1995 iclog = iclog->ic_next; 1996 } while (iclog != log->l_iclog); 1997 1998 /* log is locked when we are called */ 1999 /* 2000 * Change state for the dummy log recording. 2001 * We usually go to NEED. But we go to NEED2 if the changed indicates 2002 * we are done writing the dummy record. 2003 * If we are done with the second dummy recored (DONE2), then 2004 * we go to IDLE. 2005 */ 2006 if (changed) { 2007 switch (log->l_covered_state) { 2008 case XLOG_STATE_COVER_IDLE: 2009 case XLOG_STATE_COVER_NEED: 2010 case XLOG_STATE_COVER_NEED2: 2011 log->l_covered_state = XLOG_STATE_COVER_NEED; 2012 break; 2013 2014 case XLOG_STATE_COVER_DONE: 2015 if (changed == 1) 2016 log->l_covered_state = XLOG_STATE_COVER_NEED2; 2017 else 2018 log->l_covered_state = XLOG_STATE_COVER_NEED; 2019 break; 2020 2021 case XLOG_STATE_COVER_DONE2: 2022 if (changed == 1) 2023 log->l_covered_state = XLOG_STATE_COVER_IDLE; 2024 else 2025 log->l_covered_state = XLOG_STATE_COVER_NEED; 2026 break; 2027 2028 default: 2029 ASSERT(0); 2030 } 2031 } 2032 } /* xlog_state_clean_log */ 2033 2034 STATIC xfs_lsn_t 2035 xlog_get_lowest_lsn( 2036 xlog_t *log) 2037 { 2038 xlog_in_core_t *lsn_log; 2039 xfs_lsn_t lowest_lsn, lsn; 2040 2041 lsn_log = log->l_iclog; 2042 lowest_lsn = 0; 2043 do { 2044 if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { 2045 lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT); 2046 if ((lsn && !lowest_lsn) || 2047 (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { 2048 lowest_lsn = lsn; 2049 } 2050 } 2051 lsn_log = lsn_log->ic_next; 2052 } while (lsn_log != log->l_iclog); 2053 return lowest_lsn; 2054 } 2055 2056 2057 STATIC void 2058 xlog_state_do_callback( 2059 xlog_t *log, 2060 int aborted, 2061 xlog_in_core_t *ciclog) 2062 { 2063 xlog_in_core_t *iclog; 2064 xlog_in_core_t *first_iclog; /* used to know when we've 2065 * processed all iclogs once */ 2066 xfs_log_callback_t *cb, *cb_next; 2067 int flushcnt = 0; 2068 xfs_lsn_t lowest_lsn; 2069 int ioerrors; /* counter: iclogs with errors */ 2070 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2071 int funcdidcallbacks; /* flag: function did callbacks */ 2072 int repeats; /* for issuing console warnings if 2073 * looping too many times */ 2074 SPLDECL(s); 2075 2076 s = LOG_LOCK(log); 2077 first_iclog = iclog 
= log->l_iclog; 2078 ioerrors = 0; 2079 funcdidcallbacks = 0; 2080 repeats = 0; 2081 2082 do { 2083 /* 2084 * Scan all iclogs starting with the one pointed to by the 2085 * log. Reset this starting point each time the log is 2086 * unlocked (during callbacks). 2087 * 2088 * Keep looping through iclogs until one full pass is made 2089 * without running any callbacks. 2090 */ 2091 first_iclog = log->l_iclog; 2092 iclog = log->l_iclog; 2093 loopdidcallbacks = 0; 2094 repeats++; 2095 2096 do { 2097 2098 /* skip all iclogs in the ACTIVE & DIRTY states */ 2099 if (iclog->ic_state & 2100 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { 2101 iclog = iclog->ic_next; 2102 continue; 2103 } 2104 2105 /* 2106 * Between marking a filesystem SHUTDOWN and stopping 2107 * the log, we do flush all iclogs to disk (if there 2108 * wasn't a log I/O error). So, we do want things to 2109 * go smoothly in case of just a SHUTDOWN w/o a 2110 * LOG_IO_ERROR. 2111 */ 2112 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 2113 /* 2114 * Can only perform callbacks in order. Since 2115 * this iclog is not in the DONE_SYNC/ 2116 * DO_CALLBACK state, we skip the rest and 2117 * just try to clean up. If we set our iclog 2118 * to DO_CALLBACK, we will not process it when 2119 * we retry since a previous iclog is in the 2120 * CALLBACK and the state cannot change since 2121 * we are holding the LOG_LOCK. 2122 */ 2123 if (!(iclog->ic_state & 2124 (XLOG_STATE_DONE_SYNC | 2125 XLOG_STATE_DO_CALLBACK))) { 2126 if (ciclog && (ciclog->ic_state == 2127 XLOG_STATE_DONE_SYNC)) { 2128 ciclog->ic_state = XLOG_STATE_DO_CALLBACK; 2129 } 2130 break; 2131 } 2132 /* 2133 * We now have an iclog that is in either the 2134 * DO_CALLBACK or DONE_SYNC states. The other 2135 * states (WANT_SYNC, SYNCING, or CALLBACK were 2136 * caught by the above if and are going to 2137 * clean (i.e. we aren't doing their callbacks) 2138 * see the above if. 2139 */ 2140 2141 /* 2142 * We will do one more check here to see if we 2143 * have chased our tail around. 2144 */ 2145 2146 lowest_lsn = xlog_get_lowest_lsn(log); 2147 if (lowest_lsn && ( 2148 XFS_LSN_CMP( 2149 lowest_lsn, 2150 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) 2151 )<0)) { 2152 iclog = iclog->ic_next; 2153 continue; /* Leave this iclog for 2154 * another thread */ 2155 } 2156 2157 iclog->ic_state = XLOG_STATE_CALLBACK; 2158 2159 LOG_UNLOCK(log, s); 2160 2161 /* l_last_sync_lsn field protected by 2162 * GRANT_LOCK. Don't worry about iclog's lsn. 2163 * No one else can be here except us. 2164 */ 2165 s = GRANT_LOCK(log); 2166 ASSERT(XFS_LSN_CMP( 2167 log->l_last_sync_lsn, 2168 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) 2169 )<=0); 2170 log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); 2171 GRANT_UNLOCK(log, s); 2172 2173 /* 2174 * Keep processing entries in the callback list 2175 * until we come around and it is empty. We 2176 * need to atomically see that the list is 2177 * empty and change the state to DIRTY so that 2178 * we don't miss any more callbacks being added. 
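 * (Mechanically: the list head is detached and ic_callback cleared while LOG_LOCK is held, the callbacks themselves run with the lock dropped, and the surrounding while loop re-takes the lock and re-reads ic_callback so entries appended in the meantime are still run.)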
2179 */ 2180 s = LOG_LOCK(log); 2181 } else { 2182 ioerrors++; 2183 } 2184 cb = iclog->ic_callback; 2185 2186 while (cb != 0) { 2187 iclog->ic_callback_tail = &(iclog->ic_callback); 2188 iclog->ic_callback = NULL; 2189 LOG_UNLOCK(log, s); 2190 2191 /* perform callbacks in the order given */ 2192 for (; cb != 0; cb = cb_next) { 2193 cb_next = cb->cb_next; 2194 cb->cb_func(cb->cb_arg, aborted); 2195 } 2196 s = LOG_LOCK(log); 2197 cb = iclog->ic_callback; 2198 } 2199 2200 loopdidcallbacks++; 2201 funcdidcallbacks++; 2202 2203 ASSERT(iclog->ic_callback == 0); 2204 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) 2205 iclog->ic_state = XLOG_STATE_DIRTY; 2206 2207 /* 2208 * Transition from DIRTY to ACTIVE if applicable. 2209 * NOP if STATE_IOERROR. 2210 */ 2211 xlog_state_clean_log(log); 2212 2213 /* wake up threads waiting in xfs_log_force() */ 2214 sv_broadcast(&iclog->ic_forcesema); 2215 2216 iclog = iclog->ic_next; 2217 } while (first_iclog != iclog); 2218 if (repeats && (repeats % 10) == 0) { 2219 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2220 "xlog_state_do_callback: looping %d", repeats); 2221 } 2222 } while (!ioerrors && loopdidcallbacks); 2223 2224 /* 2225 * make one last gasp attempt to see if iclogs are being left in 2226 * limbo.. 2227 */ 2228 #ifdef DEBUG 2229 if (funcdidcallbacks) { 2230 first_iclog = iclog = log->l_iclog; 2231 do { 2232 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2233 /* 2234 * Terminate the loop if iclogs are found in states 2235 * which will cause other threads to clean up iclogs. 2236 * 2237 * SYNCING - i/o completion will go through logs 2238 * DONE_SYNC - interrupt thread should be waiting for 2239 * LOG_LOCK 2240 * IOERROR - give up hope all ye who enter here 2241 */ 2242 if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2243 iclog->ic_state == XLOG_STATE_SYNCING || 2244 iclog->ic_state == XLOG_STATE_DONE_SYNC || 2245 iclog->ic_state == XLOG_STATE_IOERROR ) 2246 break; 2247 iclog = iclog->ic_next; 2248 } while (first_iclog != iclog); 2249 } 2250 #endif 2251 2252 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { 2253 flushcnt = log->l_flushcnt; 2254 log->l_flushcnt = 0; 2255 } 2256 LOG_UNLOCK(log, s); 2257 while (flushcnt--) 2258 vsema(&log->l_flushsema); 2259 } /* xlog_state_do_callback */ 2260 2261 2262 /* 2263 * Finish transitioning this iclog to the dirty state. 2264 * 2265 * Make sure that we completely execute this routine only when this is 2266 * the last call to the iclog. There is a good chance that iclog flushes, 2267 * when we reach the end of the physical log, get turned into 2 separate 2268 * calls to bwrite. Hence, one iclog flush could generate two calls to this 2269 * routine. By using the reference count bwritecnt, we guarantee that only 2270 * the second completion goes through. 2271 * 2272 * Callbacks could take time, so they are done outside the scope of the 2273 * global state machine log lock. Assume that the calls to cvsema won't 2274 * take a long time. At least we know it won't sleep. 
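 * For example, a flush that wraps the end of the physical log is issued as two bwrites with ic_bwritecnt set to 2; the first completion below merely decrements the count to 1 and returns, and only the second one advances the iclog to DONE_SYNC and kicks off the callbacks.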
2275 */ 2276 void 2277 xlog_state_done_syncing( 2278 xlog_in_core_t *iclog, 2279 int aborted) 2280 { 2281 xlog_t *log = iclog->ic_log; 2282 SPLDECL(s); 2283 2284 s = LOG_LOCK(log); 2285 2286 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || 2287 iclog->ic_state == XLOG_STATE_IOERROR); 2288 ASSERT(iclog->ic_refcnt == 0); 2289 ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); 2290 2291 2292 /* 2293 * If we got an error, either on the first buffer, or in the case of 2294 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, 2295 * and none should ever be attempted to be written to disk 2296 * again. 2297 */ 2298 if (iclog->ic_state != XLOG_STATE_IOERROR) { 2299 if (--iclog->ic_bwritecnt == 1) { 2300 LOG_UNLOCK(log, s); 2301 return; 2302 } 2303 iclog->ic_state = XLOG_STATE_DONE_SYNC; 2304 } 2305 2306 /* 2307 * Someone could be sleeping prior to writing out the next 2308 * iclog buffer, we wake them all, one will get to do the 2309 * I/O, the others get to wait for the result. 2310 */ 2311 sv_broadcast(&iclog->ic_writesema); 2312 LOG_UNLOCK(log, s); 2313 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2314 } /* xlog_state_done_syncing */ 2315 2316 2317 /* 2318 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2319 * sleep. The flush semaphore is set to the number of in-core buffers and 2320 * decremented around disk syncing. Therefore, if all buffers are syncing, 2321 * this semaphore will cause new writes to sleep until a sync completes. 2322 * Otherwise, this code just does p() followed by v(). This approximates 2323 * a sleep/wakeup except we can't race. 2324 * 2325 * The in-core logs are used in a circular fashion. They are not used 2326 * out-of-order even when an iclog past the head is free. 2327 * 2328 * return: 2329 * * log_offset where xlog_write() can start writing into the in-core 2330 * log's data space. 2331 * * in-core log pointer to which xlog_write() should write. 2332 * * boolean indicating this is a continued write to an in-core log. 2333 * If this is the last write, then the in-core log's offset field 2334 * needs to be incremented, depending on the amount of data which 2335 * is copied. 2336 */ 2337 int 2338 xlog_state_get_iclog_space(xlog_t *log, 2339 int len, 2340 xlog_in_core_t **iclogp, 2341 xlog_ticket_t *ticket, 2342 int *continued_write, 2343 int *logoffsetp) 2344 { 2345 SPLDECL(s); 2346 int log_offset; 2347 xlog_rec_header_t *head; 2348 xlog_in_core_t *iclog; 2349 int error; 2350 2351 restart: 2352 s = LOG_LOCK(log); 2353 if (XLOG_FORCED_SHUTDOWN(log)) { 2354 LOG_UNLOCK(log, s); 2355 return XFS_ERROR(EIO); 2356 } 2357 2358 iclog = log->l_iclog; 2359 if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { 2360 log->l_flushcnt++; 2361 LOG_UNLOCK(log, s); 2362 xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); 2363 XFS_STATS_INC(xs_log_noiclogs); 2364 /* Ensure that log writes happen */ 2365 psema(&log->l_flushsema, PINOD); 2366 goto restart; 2367 } 2368 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 2369 head = &iclog->ic_header; 2370 2371 iclog->ic_refcnt++; /* prevents sync */ 2372 log_offset = iclog->ic_offset; 2373 2374 /* On the 1st write to an iclog, figure out lsn. This works 2375 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are 2376 * committing to. If the offset is set, that's how many blocks 2377 * must be written. 
2378 */ 2379 if (log_offset == 0) { 2380 ticket->t_curr_res -= log->l_iclog_hsize; 2381 XLOG_TIC_ADD_REGION(ticket, 2382 log->l_iclog_hsize, 2383 XLOG_REG_TYPE_LRHEADER); 2384 INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle); 2385 ASSIGN_LSN(head->h_lsn, log); 2386 ASSERT(log->l_curr_block >= 0); 2387 } 2388 2389 /* If there is enough room to write everything, then do it. Otherwise, 2390 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC 2391 * bit is on, so this will get flushed out. Don't update ic_offset 2392 * until you know exactly how many bytes get copied. Therefore, wait 2393 * until later to update ic_offset. 2394 * 2395 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's 2396 * can fit into remaining data section. 2397 */ 2398 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2399 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2400 2401 /* If I'm the only one writing to this iclog, sync it to disk */ 2402 if (iclog->ic_refcnt == 1) { 2403 LOG_UNLOCK(log, s); 2404 if ((error = xlog_state_release_iclog(log, iclog))) 2405 return error; 2406 } else { 2407 iclog->ic_refcnt--; 2408 LOG_UNLOCK(log, s); 2409 } 2410 goto restart; 2411 } 2412 2413 /* Do we have enough room to write the full amount in the remainder 2414 * of this iclog? Or must we continue a write on the next iclog and 2415 * mark this iclog as completely taken? In the case where we switch 2416 * iclogs (to mark it taken), this particular iclog will release/sync 2417 * to disk in xlog_write(). 2418 */ 2419 if (len <= iclog->ic_size - iclog->ic_offset) { 2420 *continued_write = 0; 2421 iclog->ic_offset += len; 2422 } else { 2423 *continued_write = 1; 2424 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2425 } 2426 *iclogp = iclog; 2427 2428 ASSERT(iclog->ic_offset <= iclog->ic_size); 2429 LOG_UNLOCK(log, s); 2430 2431 *logoffsetp = log_offset; 2432 return 0; 2433 } /* xlog_state_get_iclog_space */ 2434 2435 /* 2436 * Atomically get the log space required for a log ticket. 2437 * 2438 * Once a ticket gets put onto the reserveq, it will only return after 2439 * the needed reservation is satisfied. 2440 */ 2441 STATIC int 2442 xlog_grant_log_space(xlog_t *log, 2443 xlog_ticket_t *tic) 2444 { 2445 int free_bytes; 2446 int need_bytes; 2447 SPLDECL(s); 2448 #ifdef DEBUG 2449 xfs_lsn_t tail_lsn; 2450 #endif 2451 2452 2453 #ifdef DEBUG 2454 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2455 panic("grant Recovery problem"); 2456 #endif 2457 2458 /* Is there space or do we need to sleep? */ 2459 s = GRANT_LOCK(log); 2460 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); 2461 2462 /* something is already sleeping; insert new transaction at end */ 2463 if (log->l_reserve_headq) { 2464 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2465 xlog_trace_loggrant(log, tic, 2466 "xlog_grant_log_space: sleep 1"); 2467 /* 2468 * Gotta check this before going to sleep, while we're 2469 * holding the grant lock. 2470 */ 2471 if (XLOG_FORCED_SHUTDOWN(log)) 2472 goto error_return; 2473 2474 XFS_STATS_INC(xs_sleep_logspace); 2475 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2476 /* 2477 * If we got an error, and the filesystem is shutting down, 2478 * we'll catch it down below. So just continue... 
2479 */ 2480 xlog_trace_loggrant(log, tic, 2481 "xlog_grant_log_space: wake 1"); 2482 s = GRANT_LOCK(log); 2483 } 2484 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2485 need_bytes = tic->t_unit_res*tic->t_ocnt; 2486 else 2487 need_bytes = tic->t_unit_res; 2488 2489 redo: 2490 if (XLOG_FORCED_SHUTDOWN(log)) 2491 goto error_return; 2492 2493 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2494 log->l_grant_reserve_bytes); 2495 if (free_bytes < need_bytes) { 2496 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2497 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2498 xlog_trace_loggrant(log, tic, 2499 "xlog_grant_log_space: sleep 2"); 2500 XFS_STATS_INC(xs_sleep_logspace); 2501 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2502 2503 if (XLOG_FORCED_SHUTDOWN(log)) { 2504 s = GRANT_LOCK(log); 2505 goto error_return; 2506 } 2507 2508 xlog_trace_loggrant(log, tic, 2509 "xlog_grant_log_space: wake 2"); 2510 xlog_grant_push_ail(log->l_mp, need_bytes); 2511 s = GRANT_LOCK(log); 2512 goto redo; 2513 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2514 xlog_del_ticketq(&log->l_reserve_headq, tic); 2515 2516 /* we've got enough space */ 2517 xlog_grant_add_space(log, need_bytes); 2518 #ifdef DEBUG 2519 tail_lsn = log->l_tail_lsn; 2520 /* 2521 * Check to make sure the grant write head didn't just over lap the 2522 * tail. If the cycles are the same, we can't be overlapping. 2523 * Otherwise, make sure that the cycles differ by exactly one and 2524 * check the byte count. 2525 */ 2526 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { 2527 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); 2528 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); 2529 } 2530 #endif 2531 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); 2532 xlog_verify_grant_head(log, 1); 2533 GRANT_UNLOCK(log, s); 2534 return 0; 2535 2536 error_return: 2537 if (tic->t_flags & XLOG_TIC_IN_Q) 2538 xlog_del_ticketq(&log->l_reserve_headq, tic); 2539 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); 2540 /* 2541 * If we are failing, make sure the ticket doesn't have any 2542 * current reservations. We don't want to add this back when 2543 * the ticket/transaction gets cancelled. 2544 */ 2545 tic->t_curr_res = 0; 2546 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2547 GRANT_UNLOCK(log, s); 2548 return XFS_ERROR(EIO); 2549 } /* xlog_grant_log_space */ 2550 2551 2552 /* 2553 * Replenish the byte reservation required by moving the grant write head. 2554 * 2555 * 2556 */ 2557 STATIC int 2558 xlog_regrant_write_log_space(xlog_t *log, 2559 xlog_ticket_t *tic) 2560 { 2561 SPLDECL(s); 2562 int free_bytes, need_bytes; 2563 xlog_ticket_t *ntic; 2564 #ifdef DEBUG 2565 xfs_lsn_t tail_lsn; 2566 #endif 2567 2568 tic->t_curr_res = tic->t_unit_res; 2569 XLOG_TIC_RESET_RES(tic); 2570 2571 if (tic->t_cnt > 0) 2572 return 0; 2573 2574 #ifdef DEBUG 2575 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2576 panic("regrant Recovery problem"); 2577 #endif 2578 2579 s = GRANT_LOCK(log); 2580 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); 2581 2582 if (XLOG_FORCED_SHUTDOWN(log)) 2583 goto error_return; 2584 2585 /* If there are other waiters on the queue then give them a 2586 * chance at logspace before us. Wake up the first waiters, 2587 * if we do not wake up all the waiters then go to sleep waiting 2588 * for more free space, otherwise try to get some space for 2589 * this transaction. 
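 * For illustration (hypothetical numbers): with 10KB of write-grant space free and three queued waiters each needing 4KB, the first two are signalled, the third is not, so this ticket queues up behind them and sleeps rather than jumping ahead of the queue.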
2590 */ 2591 2592 if ((ntic = log->l_write_headq)) { 2593 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2594 log->l_grant_write_bytes); 2595 do { 2596 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2597 2598 if (free_bytes < ntic->t_unit_res) 2599 break; 2600 free_bytes -= ntic->t_unit_res; 2601 sv_signal(&ntic->t_sema); 2602 ntic = ntic->t_next; 2603 } while (ntic != log->l_write_headq); 2604 2605 if (ntic != log->l_write_headq) { 2606 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2607 xlog_ins_ticketq(&log->l_write_headq, tic); 2608 2609 xlog_trace_loggrant(log, tic, 2610 "xlog_regrant_write_log_space: sleep 1"); 2611 XFS_STATS_INC(xs_sleep_logspace); 2612 sv_wait(&tic->t_sema, PINOD|PLTWAIT, 2613 &log->l_grant_lock, s); 2614 2615 /* If we're shutting down, this tic is already 2616 * off the queue */ 2617 if (XLOG_FORCED_SHUTDOWN(log)) { 2618 s = GRANT_LOCK(log); 2619 goto error_return; 2620 } 2621 2622 xlog_trace_loggrant(log, tic, 2623 "xlog_regrant_write_log_space: wake 1"); 2624 xlog_grant_push_ail(log->l_mp, tic->t_unit_res); 2625 s = GRANT_LOCK(log); 2626 } 2627 } 2628 2629 need_bytes = tic->t_unit_res; 2630 2631 redo: 2632 if (XLOG_FORCED_SHUTDOWN(log)) 2633 goto error_return; 2634 2635 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2636 log->l_grant_write_bytes); 2637 if (free_bytes < need_bytes) { 2638 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2639 xlog_ins_ticketq(&log->l_write_headq, tic); 2640 XFS_STATS_INC(xs_sleep_logspace); 2641 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2642 2643 /* If we're shutting down, this tic is already off the queue */ 2644 if (XLOG_FORCED_SHUTDOWN(log)) { 2645 s = GRANT_LOCK(log); 2646 goto error_return; 2647 } 2648 2649 xlog_trace_loggrant(log, tic, 2650 "xlog_regrant_write_log_space: wake 2"); 2651 xlog_grant_push_ail(log->l_mp, need_bytes); 2652 s = GRANT_LOCK(log); 2653 goto redo; 2654 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2655 xlog_del_ticketq(&log->l_write_headq, tic); 2656 2657 /* we've got enough space */ 2658 xlog_grant_add_space_write(log, need_bytes); 2659 #ifdef DEBUG 2660 tail_lsn = log->l_tail_lsn; 2661 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { 2662 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); 2663 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); 2664 } 2665 #endif 2666 2667 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); 2668 xlog_verify_grant_head(log, 1); 2669 GRANT_UNLOCK(log, s); 2670 return 0; 2671 2672 2673 error_return: 2674 if (tic->t_flags & XLOG_TIC_IN_Q) 2675 xlog_del_ticketq(&log->l_reserve_headq, tic); 2676 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); 2677 /* 2678 * If we are failing, make sure the ticket doesn't have any 2679 * current reservations. We don't want to add this back when 2680 * the ticket/transaction gets cancelled. 2681 */ 2682 tic->t_curr_res = 0; 2683 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2684 GRANT_UNLOCK(log, s); 2685 return XFS_ERROR(EIO); 2686 } /* xlog_regrant_write_log_space */ 2687 2688 2689 /* The first cnt-1 times through here we don't need to 2690 * move the grant write head because the permanent 2691 * reservation has reserved cnt times the unit amount. 2692 * Release part of current permanent unit reservation and 2693 * reset current reservation to be one units worth. Also 2694 * move grant reservation head forward. 
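 * For illustration: with a permanent ticket taken with cnt == 3, the first two trips through here just consume one pre-reserved count and return early after refilling t_curr_res; from the third trip on (t_cnt has reached zero) the reserve head must be moved forward by a full unit_res each time.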
2695 */ 2696 STATIC void 2697 xlog_regrant_reserve_log_space(xlog_t *log, 2698 xlog_ticket_t *ticket) 2699 { 2700 SPLDECL(s); 2701 2702 xlog_trace_loggrant(log, ticket, 2703 "xlog_regrant_reserve_log_space: enter"); 2704 if (ticket->t_cnt > 0) 2705 ticket->t_cnt--; 2706 2707 s = GRANT_LOCK(log); 2708 xlog_grant_sub_space(log, ticket->t_curr_res); 2709 ticket->t_curr_res = ticket->t_unit_res; 2710 XLOG_TIC_RESET_RES(ticket); 2711 xlog_trace_loggrant(log, ticket, 2712 "xlog_regrant_reserve_log_space: sub current res"); 2713 xlog_verify_grant_head(log, 1); 2714 2715 /* just return if we still have some of the pre-reserved space */ 2716 if (ticket->t_cnt > 0) { 2717 GRANT_UNLOCK(log, s); 2718 return; 2719 } 2720 2721 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2722 xlog_trace_loggrant(log, ticket, 2723 "xlog_regrant_reserve_log_space: exit"); 2724 xlog_verify_grant_head(log, 0); 2725 GRANT_UNLOCK(log, s); 2726 ticket->t_curr_res = ticket->t_unit_res; 2727 XLOG_TIC_RESET_RES(ticket); 2728 } /* xlog_regrant_reserve_log_space */ 2729 2730 2731 /* 2732 * Give back the space left from a reservation. 2733 * 2734 * All the information we need to make a correct determination of space left 2735 * is present. For non-permanent reservations, things are quite easy. The 2736 * count should have been decremented to zero. We only need to deal with the 2737 * space remaining in the current reservation part of the ticket. If the 2738 * ticket contains a permanent reservation, there may be left over space which 2739 * needs to be released. A count of N means that N-1 refills of the current 2740 * reservation can be done before we need to ask for more space. The first 2741 * one goes to fill up the first current reservation. Once we run out of 2742 * space, the count will stay at zero and the only space remaining will be 2743 * in the current reservation field. 2744 */ 2745 STATIC void 2746 xlog_ungrant_log_space(xlog_t *log, 2747 xlog_ticket_t *ticket) 2748 { 2749 SPLDECL(s); 2750 2751 if (ticket->t_cnt > 0) 2752 ticket->t_cnt--; 2753 2754 s = GRANT_LOCK(log); 2755 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); 2756 2757 xlog_grant_sub_space(log, ticket->t_curr_res); 2758 2759 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); 2760 2761 /* If this is a permanent reservation ticket, we may be able to free 2762 * up more space based on the remaining count. 2763 */ 2764 if (ticket->t_cnt > 0) { 2765 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2766 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2767 } 2768 2769 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); 2770 xlog_verify_grant_head(log, 1); 2771 GRANT_UNLOCK(log, s); 2772 xfs_log_move_tail(log->l_mp, 1); 2773 } /* xlog_ungrant_log_space */ 2774 2775 2776 /* 2777 * Atomically put back used ticket. 2778 */ 2779 void 2780 xlog_state_put_ticket(xlog_t *log, 2781 xlog_ticket_t *tic) 2782 { 2783 unsigned long s; 2784 2785 s = LOG_LOCK(log); 2786 xlog_ticket_put(log, tic); 2787 LOG_UNLOCK(log, s); 2788 } /* xlog_state_put_ticket */ 2789 2790 /* 2791 * Flush iclog to disk if this is the last reference to the given iclog and 2792 * the WANT_SYNC bit is set. 2793 * 2794 * When this function is entered, the iclog is not necessarily in the 2795 * WANT_SYNC state. It may be sitting around waiting to get filled. 2796 * 2797 * 2798 */ 2799 int 2800 xlog_state_release_iclog(xlog_t *log, 2801 xlog_in_core_t *iclog) 2802 { 2803 SPLDECL(s); 2804 int sync = 0; /* do we sync? 
*/ 2805 2806 xlog_assign_tail_lsn(log->l_mp); 2807 2808 s = LOG_LOCK(log); 2809 2810 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2811 LOG_UNLOCK(log, s); 2812 return XFS_ERROR(EIO); 2813 } 2814 2815 ASSERT(iclog->ic_refcnt > 0); 2816 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || 2817 iclog->ic_state == XLOG_STATE_WANT_SYNC); 2818 2819 if (--iclog->ic_refcnt == 0 && 2820 iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2821 sync++; 2822 iclog->ic_state = XLOG_STATE_SYNCING; 2823 INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn); 2824 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2825 /* cycle incremented when incrementing curr_block */ 2826 } 2827 2828 LOG_UNLOCK(log, s); 2829 2830 /* 2831 * We let the log lock go, so it's possible that we hit a log I/O 2832 * error or some other SHUTDOWN condition that marks the iclog 2833 * as XLOG_STATE_IOERROR before the bwrite. However, we know that 2834 * this iclog has consistent data, so we ignore IOERROR 2835 * flags after this point. 2836 */ 2837 if (sync) { 2838 return xlog_sync(log, iclog); 2839 } 2840 return 0; 2841 2842 } /* xlog_state_release_iclog */ 2843 2844 2845 /* 2846 * This routine will mark the current iclog in the ring as WANT_SYNC 2847 * and move the current iclog pointer to the next iclog in the ring. 2848 * When this routine is called from xlog_state_get_iclog_space(), the 2849 * exact size of the iclog has not yet been determined. All we know is 2850 * that we have run out of space in this log record. 2851 */ 2852 STATIC void 2853 xlog_state_switch_iclogs(xlog_t *log, 2854 xlog_in_core_t *iclog, 2855 int eventual_size) 2856 { 2857 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 2858 if (!eventual_size) 2859 eventual_size = iclog->ic_offset; 2860 iclog->ic_state = XLOG_STATE_WANT_SYNC; 2861 INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block); 2862 log->l_prev_block = log->l_curr_block; 2863 log->l_prev_cycle = log->l_curr_cycle; 2864 2865 /* roll log?: ic_offset changed later */ 2866 log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); 2867 2868 /* Round up to next log-sunit */ 2869 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && 2870 log->l_mp->m_sb.sb_logsunit > 1) { 2871 __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); 2872 log->l_curr_block = roundup(log->l_curr_block, sunit_bb); 2873 } 2874 2875 if (log->l_curr_block >= log->l_logBBsize) { 2876 log->l_curr_cycle++; 2877 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) 2878 log->l_curr_cycle++; 2879 log->l_curr_block -= log->l_logBBsize; 2880 ASSERT(log->l_curr_block >= 0); 2881 } 2882 ASSERT(iclog == log->l_iclog); 2883 log->l_iclog = iclog->ic_next; 2884 } /* xlog_state_switch_iclogs */ 2885 2886 2887 /* 2888 * Write out all data in the in-core log as of this exact moment in time. 2889 * 2890 * Data may be written to the in-core log during this call. However, 2891 * we don't guarantee this data will be written out. A change from past 2892 * implementation means this routine will *not* write out zero length LRs. 2893 * 2894 * Basically, we try to perform an intelligent scan of the in-core logs. 2895 * If we determine there is no flushable data, we just return. There is no 2896 * flushable data if: 2897 * 2898 * 1. the current iclog is active and has no data; the previous iclog 2899 * is in the active or dirty state. 2900 * 2. the current iclog is dirty, and the previous iclog is in the 2901 * active or dirty state. 2902 * 2903 * We may sleep (call psema) if: 2904 * 2905 * 1.
the current iclog is not in the active nor dirty state. 2906 * 2. the current iclog dirty, and the previous iclog is not in the 2907 * active nor dirty state. 2908 * 3. the current iclog is active, and there is another thread writing 2909 * to this particular iclog. 2910 * 4. a) the current iclog is active and has no other writers 2911 * b) when we return from flushing out this iclog, it is still 2912 * not in the active nor dirty state. 2913 */ 2914 STATIC int 2915 xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2916 { 2917 xlog_in_core_t *iclog; 2918 xfs_lsn_t lsn; 2919 SPLDECL(s); 2920 2921 s = LOG_LOCK(log); 2922 2923 iclog = log->l_iclog; 2924 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2925 LOG_UNLOCK(log, s); 2926 return XFS_ERROR(EIO); 2927 } 2928 2929 /* If the head iclog is not active nor dirty, we just attach 2930 * ourselves to the head and go to sleep. 2931 */ 2932 if (iclog->ic_state == XLOG_STATE_ACTIVE || 2933 iclog->ic_state == XLOG_STATE_DIRTY) { 2934 /* 2935 * If the head is dirty or (active and empty), then 2936 * we need to look at the previous iclog. If the previous 2937 * iclog is active or dirty we are done. There is nothing 2938 * to sync out. Otherwise, we attach ourselves to the 2939 * previous iclog and go to sleep. 2940 */ 2941 if (iclog->ic_state == XLOG_STATE_DIRTY || 2942 (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { 2943 iclog = iclog->ic_prev; 2944 if (iclog->ic_state == XLOG_STATE_ACTIVE || 2945 iclog->ic_state == XLOG_STATE_DIRTY) 2946 goto no_sleep; 2947 else 2948 goto maybe_sleep; 2949 } else { 2950 if (iclog->ic_refcnt == 0) { 2951 /* We are the only one with access to this 2952 * iclog. Flush it out now. There should 2953 * be a roundoff of zero to show that someone 2954 * has already taken care of the roundoff from 2955 * the previous sync. 2956 */ 2957 iclog->ic_refcnt++; 2958 lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); 2959 xlog_state_switch_iclogs(log, iclog, 0); 2960 LOG_UNLOCK(log, s); 2961 2962 if (xlog_state_release_iclog(log, iclog)) 2963 return XFS_ERROR(EIO); 2964 *log_flushed = 1; 2965 s = LOG_LOCK(log); 2966 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn && 2967 iclog->ic_state != XLOG_STATE_DIRTY) 2968 goto maybe_sleep; 2969 else 2970 goto no_sleep; 2971 } else { 2972 /* Someone else is writing to this iclog. 2973 * Use its call to flush out the data. However, 2974 * the other thread may not force out this LR, 2975 * so we mark it WANT_SYNC. 2976 */ 2977 xlog_state_switch_iclogs(log, iclog, 0); 2978 goto maybe_sleep; 2979 } 2980 } 2981 } 2982 2983 /* By the time we come around again, the iclog could've been filled 2984 * which would give it another lsn. If we have a new lsn, just 2985 * return because the relevant data has been flushed. 2986 */ 2987 maybe_sleep: 2988 if (flags & XFS_LOG_SYNC) { 2989 /* 2990 * We must check if we're shutting down here, before 2991 * we wait, while we're holding the LOG_LOCK. 2992 * Then we check again after waking up, in case our 2993 * sleep was disturbed by a bad news. 2994 */ 2995 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2996 LOG_UNLOCK(log, s); 2997 return XFS_ERROR(EIO); 2998 } 2999 XFS_STATS_INC(xs_log_force_sleep); 3000 sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); 3001 /* 3002 * No need to grab the log lock here since we're 3003 * only deciding whether or not to return EIO 3004 * and the memory read should be atomic. 
3005 */ 3006 if (iclog->ic_state & XLOG_STATE_IOERROR) 3007 return XFS_ERROR(EIO); 3008 *log_flushed = 1; 3009 3010 } else { 3011 3012 no_sleep: 3013 LOG_UNLOCK(log, s); 3014 } 3015 return 0; 3016 } /* xlog_state_sync_all */ 3017 3018 3019 /* 3020 * Used by code which implements synchronous log forces. 3021 * 3022 * Find in-core log with lsn. 3023 * If it is in the DIRTY state, just return. 3024 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC 3025 * state and go to sleep or return. 3026 * If it is in any other state, go to sleep or return. 3027 * 3028 * If filesystem activity goes to zero, the iclog will get flushed only by 3029 * bdflush(). 3030 */ 3031 int 3032 xlog_state_sync(xlog_t *log, 3033 xfs_lsn_t lsn, 3034 uint flags, 3035 int *log_flushed) 3036 { 3037 xlog_in_core_t *iclog; 3038 int already_slept = 0; 3039 SPLDECL(s); 3040 3041 3042 try_again: 3043 s = LOG_LOCK(log); 3044 iclog = log->l_iclog; 3045 3046 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3047 LOG_UNLOCK(log, s); 3048 return XFS_ERROR(EIO); 3049 } 3050 3051 do { 3052 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) { 3053 iclog = iclog->ic_next; 3054 continue; 3055 } 3056 3057 if (iclog->ic_state == XLOG_STATE_DIRTY) { 3058 LOG_UNLOCK(log, s); 3059 return 0; 3060 } 3061 3062 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3063 /* 3064 * We sleep here if we haven't already slept (e.g. 3065 * this is the first time we've looked at the correct 3066 * iclog buf) and the buffer before us is going to 3067 * be sync'ed. The reason for this is that if we 3068 * are doing sync transactions here, by waiting for 3069 * the previous I/O to complete, we can allow a few 3070 * more transactions into this iclog before we close 3071 * it down. 3072 * 3073 * Otherwise, we mark the buffer WANT_SYNC, and bump 3074 * up the refcnt so we can release the log (which drops 3075 * the ref count). The state switch keeps new transaction 3076 * commits from using this buffer. When the current commits 3077 * finish writing into the buffer, the refcount will drop to 3078 * zero and the buffer will go out then. 3079 */ 3080 if (!already_slept && 3081 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3082 XLOG_STATE_SYNCING))) { 3083 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3084 XFS_STATS_INC(xs_log_force_sleep); 3085 sv_wait(&iclog->ic_prev->ic_writesema, PSWP, 3086 &log->l_icloglock, s); 3087 *log_flushed = 1; 3088 already_slept = 1; 3089 goto try_again; 3090 } else { 3091 iclog->ic_refcnt++; 3092 xlog_state_switch_iclogs(log, iclog, 0); 3093 LOG_UNLOCK(log, s); 3094 if (xlog_state_release_iclog(log, iclog)) 3095 return XFS_ERROR(EIO); 3096 *log_flushed = 1; 3097 s = LOG_LOCK(log); 3098 } 3099 } 3100 3101 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3102 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3103 3104 /* 3105 * Don't wait on the forcesema if we know that we've 3106 * gotten a log write error. 3107 */ 3108 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3109 LOG_UNLOCK(log, s); 3110 return XFS_ERROR(EIO); 3111 } 3112 XFS_STATS_INC(xs_log_force_sleep); 3113 sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); 3114 /* 3115 * No need to grab the log lock here since we're 3116 * only deciding whether or not to return EIO 3117 * and the memory read should be atomic. 
3118 */ 3119 if (iclog->ic_state & XLOG_STATE_IOERROR) 3120 return XFS_ERROR(EIO); 3121 *log_flushed = 1; 3122 } else { /* just return */ 3123 LOG_UNLOCK(log, s); 3124 } 3125 return 0; 3126 3127 } while (iclog != log->l_iclog); 3128 3129 LOG_UNLOCK(log, s); 3130 return 0; 3131 } /* xlog_state_sync */ 3132 3133 3134 /* 3135 * Called when we want to mark the current iclog as being ready to sync to 3136 * disk. 3137 */ 3138 void 3139 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) 3140 { 3141 SPLDECL(s); 3142 3143 s = LOG_LOCK(log); 3144 3145 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3146 xlog_state_switch_iclogs(log, iclog, 0); 3147 } else { 3148 ASSERT(iclog->ic_state & 3149 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3150 } 3151 3152 LOG_UNLOCK(log, s); 3153 } /* xlog_state_want_sync */ 3154 3155 3156 3157 /***************************************************************************** 3158 * 3159 * TICKET functions 3160 * 3161 ***************************************************************************** 3162 */ 3163 3164 /* 3165 * Algorithm doesn't take into account page size. ;-( 3166 */ 3167 STATIC void 3168 xlog_state_ticket_alloc(xlog_t *log) 3169 { 3170 xlog_ticket_t *t_list; 3171 xlog_ticket_t *next; 3172 xfs_caddr_t buf; 3173 uint i = (NBPP / sizeof(xlog_ticket_t)) - 2; 3174 SPLDECL(s); 3175 3176 /* 3177 * The kmem_zalloc may sleep, so we shouldn't be holding the 3178 * global lock. XXXmiken: may want to use zone allocator. 3179 */ 3180 buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP); 3181 3182 s = LOG_LOCK(log); 3183 3184 /* Attach 1st ticket to Q, so we can keep track of allocated memory */ 3185 t_list = (xlog_ticket_t *)buf; 3186 t_list->t_next = log->l_unmount_free; 3187 log->l_unmount_free = t_list++; 3188 log->l_ticket_cnt++; 3189 log->l_ticket_tcnt++; 3190 3191 /* Next ticket becomes first ticket attached to ticket free list */ 3192 if (log->l_freelist != NULL) { 3193 ASSERT(log->l_tail != NULL); 3194 log->l_tail->t_next = t_list; 3195 } else { 3196 log->l_freelist = t_list; 3197 } 3198 log->l_ticket_cnt++; 3199 log->l_ticket_tcnt++; 3200 3201 /* Cycle through rest of alloc'ed memory, building up free Q */ 3202 for ( ; i > 0; i--) { 3203 next = t_list + 1; 3204 t_list->t_next = next; 3205 t_list = next; 3206 log->l_ticket_cnt++; 3207 log->l_ticket_tcnt++; 3208 } 3209 t_list->t_next = NULL; 3210 log->l_tail = t_list; 3211 LOG_UNLOCK(log, s); 3212 } /* xlog_state_ticket_alloc */ 3213 3214 3215 /* 3216 * Put ticket into free list 3217 * 3218 * Assumption: log lock is held around this call. 3219 */ 3220 STATIC void 3221 xlog_ticket_put(xlog_t *log, 3222 xlog_ticket_t *ticket) 3223 { 3224 sv_destroy(&ticket->t_sema); 3225 3226 /* 3227 * Don't think caching will make that much difference. It's 3228 * more important to make debug easier. 
3229 */ 3230 #if 0 3231 /* real code will want to use LIFO for caching */ 3232 ticket->t_next = log->l_freelist; 3233 log->l_freelist = ticket; 3234 /* no need to clear fields */ 3235 #else 3236 /* When we debug, it is easier if tickets are cycled */ 3237 ticket->t_next = NULL; 3238 if (log->l_tail != 0) { 3239 log->l_tail->t_next = ticket; 3240 } else { 3241 ASSERT(log->l_freelist == 0); 3242 log->l_freelist = ticket; 3243 } 3244 log->l_tail = ticket; 3245 #endif /* DEBUG */ 3246 log->l_ticket_cnt++; 3247 } /* xlog_ticket_put */ 3248 3249 3250 /* 3251 * Grab ticket off freelist or allocation some more 3252 */ 3253 xlog_ticket_t * 3254 xlog_ticket_get(xlog_t *log, 3255 int unit_bytes, 3256 int cnt, 3257 char client, 3258 uint xflags) 3259 { 3260 xlog_ticket_t *tic; 3261 uint num_headers; 3262 SPLDECL(s); 3263 3264 alloc: 3265 if (log->l_freelist == NULL) 3266 xlog_state_ticket_alloc(log); /* potentially sleep */ 3267 3268 s = LOG_LOCK(log); 3269 if (log->l_freelist == NULL) { 3270 LOG_UNLOCK(log, s); 3271 goto alloc; 3272 } 3273 tic = log->l_freelist; 3274 log->l_freelist = tic->t_next; 3275 if (log->l_freelist == NULL) 3276 log->l_tail = NULL; 3277 log->l_ticket_cnt--; 3278 LOG_UNLOCK(log, s); 3279 3280 /* 3281 * Permanent reservations have up to 'cnt'-1 active log operations 3282 * in the log. A unit in this case is the amount of space for one 3283 * of these log operations. Normal reservations have a cnt of 1 3284 * and their unit amount is the total amount of space required. 3285 * 3286 * The following lines of code account for non-transaction data 3287 * which occupy space in the on-disk log. 3288 * 3289 * Normal form of a transaction is: 3290 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> 3291 * and then there are LR hdrs, split-recs and roundoff at end of syncs. 3292 * 3293 * We need to account for all the leadup data and trailer data 3294 * around the transaction data. 3295 * And then we need to account for the worst case in terms of using 3296 * more space. 3297 * The worst case will happen if: 3298 * - the placement of the transaction happens to be such that the 3299 * roundoff is at its maximum 3300 * - the transaction data is synced before the commit record is synced 3301 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> 3302 * Therefore the commit record is in its own Log Record. 3303 * This can happen as the commit record is called with its 3304 * own region to xlog_write(). 3305 * This then means that in the worst case, roundoff can happen for 3306 * the commit-rec as well. 3307 * The commit-rec is smaller than padding in this scenario and so it is 3308 * not added separately. 
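 * For illustration (hypothetical figures): with 32KB iclogs (l_iclog_size_log == 15), a reservation that has grown to roughly 40000 bytes gives num_headers = (40000 + 32767) >> 15 = 2, so the code below adds two LR headers' worth of space plus two split-rec ophdrs, one more LR header's worth for the commit record, and 2*BBSIZE (or 2*sb_logsunit on a v2 log) of roundoff.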
3309 */ 3310 3311 /* for trans header */ 3312 unit_bytes += sizeof(xlog_op_header_t); 3313 unit_bytes += sizeof(xfs_trans_header_t); 3314 3315 /* for start-rec */ 3316 unit_bytes += sizeof(xlog_op_header_t); 3317 3318 /* for LR headers */ 3319 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3320 unit_bytes += log->l_iclog_hsize * num_headers; 3321 3322 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3323 unit_bytes += log->l_iclog_hsize; 3324 3325 /* for split-recs - ophdrs added when data split over LRs */ 3326 unit_bytes += sizeof(xlog_op_header_t) * num_headers; 3327 3328 /* for roundoff padding for transaction data and one for commit record */ 3329 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && 3330 log->l_mp->m_sb.sb_logsunit > 1) { 3331 /* log su roundoff */ 3332 unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; 3333 } else { 3334 /* BB roundoff */ 3335 unit_bytes += 2*BBSIZE; 3336 } 3337 3338 tic->t_unit_res = unit_bytes; 3339 tic->t_curr_res = unit_bytes; 3340 tic->t_cnt = cnt; 3341 tic->t_ocnt = cnt; 3342 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3343 tic->t_clientid = client; 3344 tic->t_flags = XLOG_TIC_INITED; 3345 tic->t_trans_type = 0; 3346 if (xflags & XFS_LOG_PERM_RESERV) 3347 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3348 sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); 3349 3350 XLOG_TIC_RESET_RES(tic); 3351 3352 return tic; 3353 } /* xlog_ticket_get */ 3354 3355 3356 /****************************************************************************** 3357 * 3358 * Log debug routines 3359 * 3360 ****************************************************************************** 3361 */ 3362 #if defined(DEBUG) 3363 /* 3364 * Make sure that the destination ptr is within the valid data region of 3365 * one of the iclogs. This uses backup pointers stored in a different 3366 * part of the log in case we trash the log structure. 3367 */ 3368 void 3369 xlog_verify_dest_ptr(xlog_t *log, 3370 __psint_t ptr) 3371 { 3372 int i; 3373 int good_ptr = 0; 3374 3375 for (i=0; i < log->l_iclog_bufs; i++) { 3376 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3377 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3378 good_ptr++; 3379 } 3380 if (! 
good_ptr) 3381 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3382 } /* xlog_verify_dest_ptr */ 3383 3384 STATIC void 3385 xlog_verify_grant_head(xlog_t *log, int equals) 3386 { 3387 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3388 if (equals) 3389 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3390 else 3391 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3392 } else { 3393 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3394 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3395 } 3396 } /* xlog_verify_grant_head */ 3397 3398 /* check if it will fit */ 3399 STATIC void 3400 xlog_verify_tail_lsn(xlog_t *log, 3401 xlog_in_core_t *iclog, 3402 xfs_lsn_t tail_lsn) 3403 { 3404 int blocks; 3405 3406 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { 3407 blocks = 3408 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3409 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3410 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3411 } else { 3412 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3413 3414 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3415 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3416 3417 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3418 if (blocks < BTOBB(iclog->ic_offset) + 1) 3419 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3420 } 3421 } /* xlog_verify_tail_lsn */ 3422 3423 /* 3424 * Perform a number of checks on the iclog before writing to disk. 3425 * 3426 * 1. Make sure the iclogs are still circular 3427 * 2. Make sure we have a good magic number 3428 * 3. Make sure we don't have magic numbers in the data 3429 * 4. Check fields of each log operation header for: 3430 * A. Valid client identifier 3431 * B. tid ptr value falls in valid ptr space (user space code) 3432 * C. Length in log record header is correct according to the 3433 * individual operation headers within record. 3434 * 5. When a bwrite will occur within 5 blocks of the front of the physical 3435 * log, check the preceding blocks of the physical log to make sure all 3436 * the cycle numbers agree with the current cycle number. 
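 * Note on the (field_offset & 0x1ff) tests below: while an iclog is being synced, the first four bytes of each 512-byte block are overwritten with the cycle number (the originals being stashed in h_cycle_data, or in the extended headers for larger iclogs), so a field that happens to start on such a boundary has to be read back from there instead of from the data itself.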
3437 */ 3438 STATIC void 3439 xlog_verify_iclog(xlog_t *log, 3440 xlog_in_core_t *iclog, 3441 int count, 3442 boolean_t syncing) 3443 { 3444 xlog_op_header_t *ophead; 3445 xlog_in_core_t *icptr; 3446 xlog_in_core_2_t *xhdr; 3447 xfs_caddr_t ptr; 3448 xfs_caddr_t base_ptr; 3449 __psint_t field_offset; 3450 __uint8_t clientid; 3451 int len, i, j, k, op_len; 3452 int idx; 3453 SPLDECL(s); 3454 3455 /* check validity of iclog pointers */ 3456 s = LOG_LOCK(log); 3457 icptr = log->l_iclog; 3458 for (i=0; i < log->l_iclog_bufs; i++) { 3459 if (icptr == 0) 3460 xlog_panic("xlog_verify_iclog: invalid ptr"); 3461 icptr = icptr->ic_next; 3462 } 3463 if (icptr != log->l_iclog) 3464 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3465 LOG_UNLOCK(log, s); 3466 3467 /* check log magic numbers */ 3468 ptr = (xfs_caddr_t) &(iclog->ic_header); 3469 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) 3470 xlog_panic("xlog_verify_iclog: invalid magic num"); 3471 3472 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count; 3473 ptr += BBSIZE) { 3474 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) 3475 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3476 } 3477 3478 /* check fields */ 3479 len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT); 3480 ptr = iclog->ic_datap; 3481 base_ptr = ptr; 3482 ophead = (xlog_op_header_t *)ptr; 3483 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3484 for (i = 0; i < len; i++) { 3485 ophead = (xlog_op_header_t *)ptr; 3486 3487 /* clientid is only 1 byte */ 3488 field_offset = (__psint_t) 3489 ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); 3490 if (syncing == B_FALSE || (field_offset & 0x1ff)) { 3491 clientid = ophead->oh_clientid; 3492 } else { 3493 idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); 3494 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { 3495 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3496 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3497 clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); 3498 } else { 3499 clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); 3500 } 3501 } 3502 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3503 cmn_err(CE_WARN, "xlog_verify_iclog: " 3504 "invalid clientid %d op 0x%p offset 0x%lx", 3505 clientid, ophead, (unsigned long)field_offset); 3506 3507 /* check length */ 3508 field_offset = (__psint_t) 3509 ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); 3510 if (syncing == B_FALSE || (field_offset & 0x1ff)) { 3511 op_len = INT_GET(ophead->oh_len, ARCH_CONVERT); 3512 } else { 3513 idx = BTOBBT((__psint_t)&ophead->oh_len - 3514 (__psint_t)iclog->ic_datap); 3515 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { 3516 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3517 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3518 op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); 3519 } else { 3520 op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); 3521 } 3522 } 3523 ptr += sizeof(xlog_op_header_t) + op_len; 3524 } 3525 } /* xlog_verify_iclog */ 3526 #endif 3527 3528 /* 3529 * Mark all iclogs IOERROR. LOG_LOCK is held by the caller. 3530 */ 3531 STATIC int 3532 xlog_state_ioerror( 3533 xlog_t *log) 3534 { 3535 xlog_in_core_t *iclog, *ic; 3536 3537 iclog = log->l_iclog; 3538 if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { 3539 /* 3540 * Mark all the incore logs IOERROR. 3541 * From now on, no log flushes will result. 
3542 */ 3543 ic = iclog; 3544 do { 3545 ic->ic_state = XLOG_STATE_IOERROR; 3546 ic = ic->ic_next; 3547 } while (ic != iclog); 3548 return 0; 3549 } 3550 /* 3551 * Return non-zero, if state transition has already happened. 3552 */ 3553 return 1; 3554 } 3555 3556 /* 3557 * This is called from xfs_force_shutdown, when we're forcibly 3558 * shutting down the filesystem, typically because of an IO error. 3559 * Our main objectives here are to make sure that: 3560 * a. the filesystem gets marked 'SHUTDOWN' for all interested 3561 * parties to find out, 'atomically'. 3562 * b. those who're sleeping on log reservations, pinned objects and 3563 * other resources get woken up, and be told the bad news. 3564 * c. nothing new gets queued up after (a) and (b) are done. 3565 * d. if !logerror, flush the iclogs to disk, then seal them off 3566 * for business. 3567 */ 3568 int 3569 xfs_log_force_umount( 3570 struct xfs_mount *mp, 3571 int logerror) 3572 { 3573 xlog_ticket_t *tic; 3574 xlog_t *log; 3575 int retval; 3576 int dummy; 3577 SPLDECL(s); 3578 SPLDECL(s2); 3579 3580 log = mp->m_log; 3581 3582 /* 3583 * If this happens during log recovery, don't worry about 3584 * locking; the log isn't open for business yet. 3585 */ 3586 if (!log || 3587 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3588 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3589 XFS_BUF_DONE(mp->m_sb_bp); 3590 return 0; 3591 } 3592 3593 /* 3594 * Somebody could've already done the hard work for us. 3595 * No need to get locks for this. 3596 */ 3597 if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { 3598 ASSERT(XLOG_FORCED_SHUTDOWN(log)); 3599 return 1; 3600 } 3601 retval = 0; 3602 /* 3603 * We must hold both the GRANT lock and the LOG lock, 3604 * before we mark the filesystem SHUTDOWN and wake 3605 * everybody up to tell the bad news. 3606 */ 3607 s = GRANT_LOCK(log); 3608 s2 = LOG_LOCK(log); 3609 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3610 XFS_BUF_DONE(mp->m_sb_bp); 3611 /* 3612 * This flag is sort of redundant because of the mount flag, but 3613 * it's good to maintain the separation between the log and the rest 3614 * of XFS. 3615 */ 3616 log->l_flags |= XLOG_IO_ERROR; 3617 3618 /* 3619 * If we hit a log error, we want to mark all the iclogs IOERROR 3620 * while we're still holding the loglock. 3621 */ 3622 if (logerror) 3623 retval = xlog_state_ioerror(log); 3624 LOG_UNLOCK(log, s2); 3625 3626 /* 3627 * We don't want anybody waiting for log reservations 3628 * after this. That means we have to wake up everybody 3629 * queued up on reserve_headq as well as write_headq. 3630 * In addition, we make sure in xlog_{re}grant_log_space 3631 * that we don't enqueue anything once the SHUTDOWN flag 3632 * is set, and this action is protected by the GRANTLOCK. 3633 */ 3634 if ((tic = log->l_reserve_headq)) { 3635 do { 3636 sv_signal(&tic->t_sema); 3637 tic = tic->t_next; 3638 } while (tic != log->l_reserve_headq); 3639 } 3640 3641 if ((tic = log->l_write_headq)) { 3642 do { 3643 sv_signal(&tic->t_sema); 3644 tic = tic->t_next; 3645 } while (tic != log->l_write_headq); 3646 } 3647 GRANT_UNLOCK(log, s); 3648 3649 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3650 ASSERT(!logerror); 3651 /* 3652 * Force the incore logs to disk before shutting the 3653 * log down completely. 3654 */ 3655 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3656 s2 = LOG_LOCK(log); 3657 retval = xlog_state_ioerror(log); 3658 LOG_UNLOCK(log, s2); 3659 } 3660 /* 3661 * Wake up everybody waiting on xfs_log_force. 
3662 * Callback all log item committed functions as if the 3663 * log writes were completed. 3664 */ 3665 xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); 3666 3667 #ifdef XFSERRORDEBUG 3668 { 3669 xlog_in_core_t *iclog; 3670 3671 s = LOG_LOCK(log); 3672 iclog = log->l_iclog; 3673 do { 3674 ASSERT(iclog->ic_callback == 0); 3675 iclog = iclog->ic_next; 3676 } while (iclog != log->l_iclog); 3677 LOG_UNLOCK(log, s); 3678 } 3679 #endif 3680 /* return non-zero if log IOERROR transition had already happened */ 3681 return retval; 3682 } 3683 3684 STATIC int 3685 xlog_iclogs_empty(xlog_t *log) 3686 { 3687 xlog_in_core_t *iclog; 3688 3689 iclog = log->l_iclog; 3690 do { 3691 /* endianness does not matter here, zero is zero in 3692 * any language. 3693 */ 3694 if (iclog->ic_header.h_num_logops) 3695 return 0; 3696 iclog = iclog->ic_next; 3697 } while (iclog != log->l_iclog); 3698 return 1; 3699 } 3700