/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
        struct xlog     *log)
{
        struct xlog_ticket *tic;

        tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
                                KM_SLEEP|KM_NOFS);
        tic->t_trans_type = XFS_TRANS_CHECKPOINT;

        /*
         * set the current reservation to zero so we know to steal the basic
         * transaction overhead reservation from the first transaction commit.
         */
        tic->t_curr_res = 0;
        return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push. This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
        struct xlog     *log)
{
        log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
        log->l_cilp->xc_ctx->sequence = 1;
        log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
                                                          log->l_curr_block);
}
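
/*
 * Worked example of the reservation stealing described above (illustrative
 * numbers only, not taken from real traces). The CIL ticket starts with
 * t_curr_res == 0, so the first transaction to commit into the checkpoint
 * pays the checkpoint overhead out of its own ticket in
 * xlog_cil_insert_items():
 *
 *      ctx->ticket:    t_unit_res = 1024, t_curr_res = 0
 *      trans ticket:   t_curr_res = 8192
 *
 *      after first commit:     ctx->ticket->t_curr_res = 1024
 *                              trans ticket t_curr_res = 8192 - 1024 = 7168
 *
 * Later commits only pay for the space their own log vectors add to the
 * checkpoint, plus any extra log record headers that growth requires.
 */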

/*
 * Format log item into a flat buffer
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and formats the vector for the item into the buffer.
 * The buffer is then attached to the log item, and the item is then inserted
 * into the Committed Item List for tracking until the next checkpoint is
 * written out.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer. Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied region inside the buffer we just allocated. This allows us to format
 * the regions into the iclog as though they are being formatted directly out
 * of the objects themselves.
 */
static struct xfs_log_vec *
xlog_cil_prepare_log_vecs(
        struct xfs_trans        *tp)
{
        struct xfs_log_item_desc *lidp;
        struct xfs_log_vec      *lv = NULL;
        struct xfs_log_vec      *ret_lv = NULL;

        /* Bail out if we didn't find a log item. */
        if (list_empty(&tp->t_items)) {
                ASSERT(0);
                return NULL;
        }

        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                struct xfs_log_vec *new_lv;
                void    *ptr;
                int     index;
                int     len = 0;
                uint    niovecs;
                bool    ordered = false;

                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;

                /* Skip items that do not have any vectors for writing */
                niovecs = IOP_SIZE(lidp->lid_item);
                if (!niovecs)
                        continue;

                /*
                 * Ordered items need to be tracked but we do not wish to write
                 * them. We need a logvec to track the object, but we do not
                 * need an iovec or buffer to be allocated for copying data.
                 */
                if (niovecs == XFS_LOG_VEC_ORDERED) {
                        ordered = true;
                        niovecs = 0;
                }

                new_lv = kmem_zalloc(sizeof(*new_lv) +
                                niovecs * sizeof(struct xfs_log_iovec),
                                KM_SLEEP|KM_NOFS);

                new_lv->lv_item = lidp->lid_item;
                new_lv->lv_niovecs = niovecs;
                if (ordered) {
                        /* track as an ordered logvec */
                        new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
                        goto next;
                }

                /* The allocated iovec region lies beyond the log vector. */
                new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];

                /* build the vector array and calculate its length */
                IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
                for (index = 0; index < new_lv->lv_niovecs; index++)
                        len += new_lv->lv_iovecp[index].i_len;

                new_lv->lv_buf_len = len;
                new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
                                KM_SLEEP|KM_NOFS);
                ptr = new_lv->lv_buf;

                for (index = 0; index < new_lv->lv_niovecs; index++) {
                        struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];

                        memcpy(ptr, vec->i_addr, vec->i_len);
                        vec->i_addr = ptr;
                        ptr += vec->i_len;
                }
                ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);

next:
                if (!ret_lv)
                        ret_lv = new_lv;
                else
                        lv->lv_next = new_lv;
                lv = new_lv;
        }

        return ret_lv;
}
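
/*
 * Illustrative memory layout of what the function above builds (sizes are
 * examples only). For an item with two iovecs of 128 and 64 bytes, a single
 * zeroed allocation holds the log vector and its iovec array, with the
 * region data copied into a separately allocated flat buffer:
 *
 *      new_lv -> [ struct xfs_log_vec ]
 *                [ iovec 0            ]  <- lv_iovecp = &new_lv[1]
 *                [ iovec 1            ]
 *
 *      lv_buf -> [ 128 bytes region 0 | 64 bytes region 1 ]
 *
 * After the copy loop, iovec 0's i_addr points at lv_buf and iovec 1's
 * i_addr points at lv_buf + 128, so the later formatting into the iclog can
 * treat the copies exactly as if they were the original objects.
 */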

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
        struct xlog             *log,
        struct xfs_log_vec      *lv,
        int                     *len,
        int                     *diff_iovecs)
{
        struct xfs_log_vec      *old = lv->lv_item->li_lv;

        if (old) {
                /* existing lv on log item, space used is a delta */
                ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
                        old->lv_buf_len == XFS_LOG_VEC_ORDERED);

                /*
                 * If the new item is ordered, keep the old one that is already
                 * tracking dirty or ordered regions.
                 */
                if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
                        ASSERT(!lv->lv_buf);
                        kmem_free(lv);
                        return;
                }

                *len += lv->lv_buf_len - old->lv_buf_len;
                *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
                kmem_free(old->lv_buf);
                kmem_free(old);
        } else {
                /* new lv, must pin the log item */
                ASSERT(!lv->lv_item->li_lv);

                if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
                        *len += lv->lv_buf_len;
                        *diff_iovecs += lv->lv_niovecs;
                }
                IOP_PIN(lv->lv_item);
        }

        /* attach new log vector to log item */
        lv->lv_item->li_lv = lv;

        /*
         * If this is the first time the item is being committed to the
         * CIL, store the sequence number on the log item so we can
         * tell in future commits whether this is the first checkpoint
         * the item is being committed into.
         */
        if (!lv->lv_item->li_seq)
                lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}
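
/*
 * Worked example of the delta accounting above (illustrative numbers only).
 * If an item is already in the CIL with a 2-iovec, 300 byte log vector and
 * is relogged with a 3-iovec, 450 byte vector, then:
 *
 *      *len         += 450 - 300 = 150
 *      *diff_iovecs += 3 - 2     = 1
 *
 * Only the growth is charged to the checkpoint, because the old formatted
 * buffer is freed and replaced in place; the item is not pinned a second
 * time.
 */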

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
        struct xlog             *log,
        struct xfs_log_vec      *log_vector,
        struct xlog_ticket      *ticket)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
        struct xfs_log_vec      *lv;
        int                     len = 0;
        int                     diff_iovecs = 0;
        int                     iclog_space;

        ASSERT(log_vector);

        /*
         * Do all the accounting aggregation and switching of log vectors
         * around in a separate loop to the insertion of items into the CIL.
         * Then we can do a separate loop to update the CIL within a single
         * lock/unlock pair. This reduces the number of round trips on the CIL
         * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
         * hold time for the transaction commit.
         *
         * If this is the first time the item is being placed into the CIL in
         * this context, pin it so it can't be written to disk until the CIL is
         * flushed to the iclog and the iclog written to disk.
         *
         * We can do this safely because the context can't checkpoint until we
         * are done so it doesn't matter exactly how we update the CIL.
         */
        spin_lock(&cil->xc_cil_lock);
        for (lv = log_vector; lv; ) {
                struct xfs_log_vec *next = lv->lv_next;

                ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
                lv->lv_next = NULL;

                /*
                 * xfs_cil_prepare_item() may free the lv, so move the item on
                 * the CIL first.
                 */
                list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
                xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
                lv = next;
        }

        /* account for space used by new iovec headers */
        len += diff_iovecs * sizeof(xlog_op_header_t);
        ctx->nvecs += diff_iovecs;

        /*
         * Now transfer enough transaction reservation to the context ticket
         * for the checkpoint. The context ticket is special - the unit
         * reservation has to grow as well as the current reservation as we
         * steal from tickets so we can correctly determine the space used
         * during the transaction commit.
         */
        if (ctx->ticket->t_curr_res == 0) {
                /* first commit in checkpoint, steal the header reservation */
                ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
                ticket->t_curr_res -= ctx->ticket->t_unit_res;
        }

        /* do we need space for more log record headers? */
        iclog_space = log->l_iclog_size - log->l_iclog_hsize;
        if (len > 0 && (ctx->space_used / iclog_space !=
                                (ctx->space_used + len) / iclog_space)) {
                int hdrs;

                hdrs = (len + iclog_space - 1) / iclog_space;
                /* need to take into account split region headers, too */
                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
                ctx->ticket->t_unit_res += hdrs;
                ctx->ticket->t_curr_res += hdrs;
                ticket->t_curr_res -= hdrs;
                ASSERT(ticket->t_curr_res >= len);
        }
        ticket->t_curr_res -= len;
        ctx->space_used += len;

        spin_unlock(&cil->xc_cil_lock);
}
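
/*
 * Worked example of the log record header accounting above (illustrative
 * numbers only). Assume a 32k iclog with a 512 byte header, giving
 * iclog_space = 32768 - 512 = 32256. If the checkpoint has already used
 * 30000 bytes and this commit adds len = 5000, then:
 *
 *      30000 / 32256 == 0  !=  (30000 + 5000) / 32256 == 1
 *
 * so the checkpoint now spills into another iclog and needs another record
 * header. hdrs = (5000 + 32256 - 1) / 32256 = 1, and we steal
 * 1 * (512 + sizeof(struct xlog_op_header)) bytes from the committing
 * transaction's ticket to cover the new record header and the ophdr for the
 * split region.
 */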

static void
xlog_cil_free_logvec(
        struct xfs_log_vec      *log_vector)
{
        struct xfs_log_vec      *lv;

        for (lv = log_vector; lv; ) {
                struct xfs_log_vec *next = lv->lv_next;
                kmem_free(lv->lv_buf);
                kmem_free(lv);
                lv = next;
        }
}

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
        void    *args,
        int     abort)
{
        struct xfs_cil_ctx      *ctx = args;
        struct xfs_mount        *mp = ctx->cil->xc_log->l_mp;

        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
                                        ctx->start_lsn, abort);

        xfs_extent_busy_sort(&ctx->busy_extents);
        xfs_extent_busy_clear(mp, &ctx->busy_extents,
                              (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

        spin_lock(&ctx->cil->xc_cil_lock);
        list_del(&ctx->committing);
        spin_unlock(&ctx->cil->xc_cil_lock);

        xlog_cil_free_logvec(ctx->lv_chain);

        if (!list_empty(&ctx->busy_extents)) {
                ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

                xfs_discard_extents(mp, &ctx->busy_extents);
                xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
        }

        kmem_free(ctx);
}

/*
 * Push the Committed Item List to the log. If @push_seq is zero, then it is
 * a background flush and so we can choose to ignore it. Otherwise, if the
 * current sequence is the same as @push_seq we need to do a flush. If
 * @push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * @push_seq is a value rather than a flag because that allows us to do an
 * unlocked check of the sequence number for a match. Hence we can allow log
 * forces to run racily and not issue pushes for the same sequence twice. If we
 * get a race between multiple pushes for the same sequence they will block on
 * the first one and then abort, hence avoiding needless pushes.
 */
STATIC int
xlog_cil_push(
        struct xlog     *log)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
        struct xfs_cil_ctx      *ctx;
        struct xfs_cil_ctx      *new_ctx;
        struct xlog_in_core     *commit_iclog;
        struct xlog_ticket      *tic;
        int                     num_iovecs;
        int                     error = 0;
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
        struct xfs_log_vec      lvhdr = { NULL };
        xfs_lsn_t               commit_lsn;
        xfs_lsn_t               push_seq;

        if (!cil)
                return 0;

        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);

        down_write(&cil->xc_ctx_lock);
        ctx = cil->xc_ctx;

        spin_lock(&cil->xc_cil_lock);
        push_seq = cil->xc_push_seq;
        ASSERT(push_seq <= ctx->sequence);

        /*
         * Check if we've anything to push. If there is nothing, then we don't
         * move on to a new sequence number and so we have to be able to push
         * this sequence again later.
         */
        if (list_empty(&cil->xc_cil)) {
                cil->xc_push_seq = 0;
                spin_unlock(&cil->xc_cil_lock);
                goto out_skip;
        }
        spin_unlock(&cil->xc_cil_lock);

        /* check for a previously pushed sequence */
        if (push_seq < cil->xc_ctx->sequence)
                goto out_skip;

        /*
         * pull all the log vectors off the items in the CIL, and
         * remove the items from the CIL. We don't need the CIL lock
         * here because it's only needed on the transaction commit
         * side which is currently locked out by the flush lock.
         */
        lv = NULL;
        num_iovecs = 0;
        while (!list_empty(&cil->xc_cil)) {
                struct xfs_log_item     *item;

                item = list_first_entry(&cil->xc_cil,
                                        struct xfs_log_item, li_cil);
                list_del_init(&item->li_cil);
                if (!ctx->lv_chain)
                        ctx->lv_chain = item->li_lv;
                else
                        lv->lv_next = item->li_lv;
                lv = item->li_lv;
                item->li_lv = NULL;
                num_iovecs += lv->lv_niovecs;
        }
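
        /*
         * Illustrative result of the loop above (item names are examples
         * only). For CIL items A, B and C the per-item log vectors are now
         * spliced into a single chain owned by this checkpoint context:
         *
         *      ctx->lv_chain:  lv(A) -> lv(B) -> lv(C)
         *
         * and each item's li_lv has been cleared, so the items can be
         * relogged into the next context while this chain is formatted by
         * xlog_write().
         */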

        /*
         * initialise the new context and attach it to the CIL. Then attach
         * the current context to the CIL committing list so it can be found
         * during log forces to extract the commit lsn of the sequence that
         * needs to be forced.
         */
        INIT_LIST_HEAD(&new_ctx->committing);
        INIT_LIST_HEAD(&new_ctx->busy_extents);
        new_ctx->sequence = ctx->sequence + 1;
        new_ctx->cil = cil;
        cil->xc_ctx = new_ctx;

        /*
         * mirror the new sequence into the cil structure so that we can do
         * unlocked checks against the current sequence in log forces without
         * risking dereferencing a freed context pointer.
         */
        cil->xc_current_sequence = new_ctx->sequence;

        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
         * that the commit records are correctly ordered in the log to ensure
         * that we process items during log IO completion in the correct order.
         *
         * For example, if we get an EFI in one checkpoint and the EFD in the
         * next (e.g. due to log forces), we do not want the checkpoint with
         * the EFD to be committed before the checkpoint with the EFI. Hence
         * we must strictly order the commit records of the checkpoints so
         * that: a) the checkpoint callbacks are attached to the iclogs in the
         * correct order; and b) the checkpoints are replayed in correct order
         * in log recovery.
         *
         * Hence we need to add this context to the committing context list so
         * that higher sequences will wait for us to write out a commit record
         * before they do.
         */
        spin_lock(&cil->xc_cil_lock);
        list_add(&ctx->committing, &cil->xc_committing);
        spin_unlock(&cil->xc_cil_lock);
        up_write(&cil->xc_ctx_lock);

        /*
         * Build a checkpoint transaction header and write it to the log to
         * begin the transaction. We need to account for the space used by the
         * transaction header here as it is not accounted for in xlog_write().
         *
         * The LSN we need to pass to the log items on transaction commit is
         * the LSN reported by the first log vector write. If we use the commit
         * record lsn then we can move the tail beyond the grant write head.
         */
        tic = ctx->ticket;
        thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
        thdr.th_type = XFS_TRANS_CHECKPOINT;
        thdr.th_tid = tic->t_tid;
        thdr.th_num_items = num_iovecs;
        lhdr.i_addr = &thdr;
        lhdr.i_len = sizeof(xfs_trans_header_t);
        lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
        tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

        lvhdr.lv_niovecs = 1;
        lvhdr.lv_iovecp = &lhdr;
        lvhdr.lv_next = ctx->lv_chain;

        error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
        if (error)
                goto out_abort_free_ticket;

        /*
         * now that we've written the checkpoint into the log, strictly
         * order the commit records so replay will get them in the right order.
         */
restart:
        spin_lock(&cil->xc_cil_lock);
        list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
                /*
                 * Higher sequences will wait for this one so skip them.
                 * Don't wait for our own sequence, either.
                 */
                if (new_ctx->sequence >= ctx->sequence)
                        continue;
                if (!new_ctx->commit_lsn) {
                        /*
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
        }
        spin_unlock(&cil->xc_cil_lock);

        /* xfs_log_done always frees the ticket on error. */
        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
        if (commit_lsn == -1)
                goto out_abort;

        /* attach all the transactions w/ busy extents to iclog */
        ctx->log_cb.cb_func = xlog_cil_committed;
        ctx->log_cb.cb_arg = ctx;
        error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
        if (error)
                goto out_abort;

        /*
         * now the checkpoint commit is complete and we've attached the
         * callbacks to the iclog we can assign the commit LSN to the context
         * and wake up anyone who is waiting for the commit to complete.
         */
        spin_lock(&cil->xc_cil_lock);
        ctx->commit_lsn = commit_lsn;
        wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_cil_lock);

        /* release the hounds! */
        return xfs_log_release_iclog(log->l_mp, commit_iclog);

out_skip:
        up_write(&cil->xc_ctx_lock);
        xfs_log_ticket_put(new_ctx->ticket);
        kmem_free(new_ctx);
        return 0;

out_abort_free_ticket:
        xfs_log_ticket_put(tic);
out_abort:
        xlog_cil_committed(ctx, XFS_LI_ABORTED);
        return XFS_ERROR(EIO);
}
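
/*
 * Illustrative timeline of the commit record ordering enforced above
 * (sequence numbers are examples only). Suppose checkpoints 3 and 4 are
 * pushed concurrently:
 *
 *      push(3): writes checkpoint body, ctx 3 commit_lsn still 0
 *      push(4): writes checkpoint body, scans xc_committing, finds ctx 3
 *               with commit_lsn == 0, sleeps on xc_commit_wait
 *      push(3): writes its commit record via xfs_log_done(), then sets
 *               commit_lsn and wakes xc_commit_wait
 *      push(4): restarts the scan, ctx 3 now has a commit_lsn, so push(4)
 *               goes on to write its own commit record
 *
 * Hence the commit record for sequence 3 always reaches the iclogs before
 * the record for sequence 4, which is the ordering log recovery relies on.
 */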

static void
xlog_cil_push_work(
        struct work_struct      *work)
{
        struct xfs_cil *cil = container_of(work, struct xfs_cil,
                                                        xc_push_work);
        xlog_cil_push(cil->xc_log);
}

/*
 * We need to push CIL every so often so we don't cache more than we can fit in
 * the log. The limit really is that a checkpoint can't be more than half the
 * log (the current checkpoint is not allowed to overwrite the previous
 * checkpoint), but commit latency and memory usage limit this to a smaller
 * size.
 */
static void
xlog_cil_push_background(
        struct xlog     *log)
{
        struct xfs_cil  *cil = log->l_cilp;

        /*
         * The cil won't be empty because we are called while holding the
         * context lock, so whatever we added to the CIL will still be there.
         */
        ASSERT(!list_empty(&cil->xc_cil));

        /*
         * don't do a background push if we haven't used up all the
         * space available yet.
         */
        if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
                return;

        spin_lock(&cil->xc_cil_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
                queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
        }
        spin_unlock(&cil->xc_cil_lock);
}

static void
xlog_cil_push_foreground(
        struct xlog     *log,
        xfs_lsn_t       push_seq)
{
        struct xfs_cil  *cil = log->l_cilp;

        if (!cil)
                return;

        ASSERT(push_seq && push_seq <= cil->xc_current_sequence);

        /* start on any pending background push to minimise wait time on it */
        flush_work(&cil->xc_push_work);

        /*
         * If the CIL is empty or we've already pushed the sequence then
         * there's no work we need to do.
         */
        spin_lock(&cil->xc_cil_lock);
        if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
                spin_unlock(&cil->xc_cil_lock);
                return;
        }

        cil->xc_push_seq = push_seq;
        spin_unlock(&cil->xc_cil_lock);

        /* do the push now */
        xlog_cil_push(log);
}
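
/*
 * Behavioural sketch of the two push triggers above (summary, not extra
 * mechanism). A background push is asynchronous: transaction commit calls
 * xlog_cil_push_background() and, once space_used crosses
 * XLOG_CIL_SPACE_LIMIT(), work is queued on m_cil_workqueue. A foreground
 * push is synchronous: a log force names the sequence it needs, flushes any
 * queued background work first with flush_work(), then calls
 * xlog_cil_push() directly in the caller's context.
 */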

/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * Called with the context lock already held in read mode to lock out
 * background commit, returns without it held once background commits are
 * allowed again.
 */
int
xfs_log_commit_cil(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        xfs_lsn_t               *commit_lsn,
        int                     flags)
{
        struct xlog             *log = mp->m_log;
        int                     log_flags = 0;
        struct xfs_log_vec      *log_vector;

        if (flags & XFS_TRANS_RELEASE_LOG_RES)
                log_flags = XFS_LOG_REL_PERM_RESERV;

        /*
         * Do all the hard work of formatting items (including memory
         * allocation) outside the CIL context lock. This prevents stalling CIL
         * pushes when we are low on memory and a transaction commit spends a
         * lot of time in memory reclaim.
         */
        log_vector = xlog_cil_prepare_log_vecs(tp);
        if (!log_vector)
                return ENOMEM;

        /* lock out background commit */
        down_read(&log->l_cilp->xc_ctx_lock);
        if (commit_lsn)
                *commit_lsn = log->l_cilp->xc_ctx->sequence;

        /* xlog_cil_insert_items() destroys log_vector list */
        xlog_cil_insert_items(log, log_vector, tp->t_ticket);

        /* check we didn't blow the reservation */
        if (tp->t_ticket->t_curr_res < 0)
                xlog_print_tic_res(log->l_mp, tp->t_ticket);

        /* attach the transaction to the CIL if it has any busy extents */
        if (!list_empty(&tp->t_busy)) {
                spin_lock(&log->l_cilp->xc_cil_lock);
                list_splice_init(&tp->t_busy,
                                        &log->l_cilp->xc_ctx->busy_extents);
                spin_unlock(&log->l_cilp->xc_cil_lock);
        }

        tp->t_commit_lsn = *commit_lsn;
        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
        xfs_trans_unreserve_and_mod_sb(tp);

        /*
         * Once all the items of the transaction have been copied to the CIL,
         * the items can be unlocked and freed.
         *
         * This needs to be done before we drop the CIL context lock because we
         * have to update state in the log items and unlock them before they go
         * to disk. If we don't, then the CIL checkpoint can race with us and
         * we can run checkpoint completion before we've updated and unlocked
         * the log items. This affects (at least) processing of stale buffers,
         * inodes and EFIs.
         */
        xfs_trans_free_items(tp, *commit_lsn, 0);

        xlog_cil_push_background(log);

        up_read(&log->l_cilp->xc_ctx_lock);
        return 0;
}

/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
 * number given. Hence the only time we will trigger a push here is
 * if the push sequence is the same as the current context.
 *
 * We return the current commit lsn to allow the callers to determine if an
 * iclog flush is necessary following this call.
 */
xfs_lsn_t
xlog_cil_force_lsn(
        struct xlog     *log,
        xfs_lsn_t       sequence)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;

        ASSERT(sequence <= cil->xc_current_sequence);

        /*
         * check to see if we need to force out the current context.
         * xlog_cil_push() handles racing pushes for the same sequence,
         * so no need to deal with it here.
         */
        xlog_cil_push_foreground(log, sequence);

        /*
         * See if we can find a previous sequence still committing.
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
restart:
        spin_lock(&cil->xc_cil_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
                if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
        }
        spin_unlock(&cil->xc_cil_lock);
        return commit_lsn;
}
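
/*
 * Worked example of the scan above (sequence numbers are examples only).
 * Forcing sequence 4 while contexts 2, 3 and 4 sit on xc_committing:
 *
 *      ctx 2: commit_lsn already set, not the target -> continue
 *      ctx 3: commit_lsn == 0 -> sleep on xc_commit_wait, then restart
 *      ctx 4: commit_lsn set  -> commit_lsn returned to the caller
 *
 * Hence the force cannot complete until every checkpoint at or below the
 * requested sequence has written its commit record.
 */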

/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
bool
xfs_log_item_in_current_chkpt(
        struct xfs_log_item     *lip)
{
        struct xfs_cil_ctx      *ctx;

        if (list_empty(&lip->li_cil))
                return false;

        ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;

        /*
         * li_seq is written on the first commit of a log item to record the
         * first checkpoint it is written to. Hence if it is different to the
         * current sequence, we're in a new checkpoint.
         */
        if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
                return false;
        return true;
}

/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
        struct xlog     *log)
{
        struct xfs_cil          *cil;
        struct xfs_cil_ctx      *ctx;

        cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
        if (!cil)
                return ENOMEM;

        ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
        if (!ctx) {
                kmem_free(cil);
                return ENOMEM;
        }

        INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
        INIT_LIST_HEAD(&cil->xc_cil);
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
        init_rwsem(&cil->xc_ctx_lock);
        init_waitqueue_head(&cil->xc_commit_wait);

        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
        cil->xc_current_sequence = ctx->sequence;

        cil->xc_log = log;
        log->l_cilp = cil;
        return 0;
}

void
xlog_cil_destroy(
        struct xlog     *log)
{
        if (log->l_cilp->xc_ctx) {
                if (log->l_cilp->xc_ctx->ticket)
                        xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
                kmem_free(log->l_cilp->xc_ctx);
        }

        ASSERT(list_empty(&log->l_cilp->xc_cil));
        kmem_free(log->l_cilp);
}