/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/inttypes.h>
#include <sys/atomic.h>
#include <sys/tuneable.h>

/*
 * externs
 */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
extern int ufs_trans_push_quota();

/*
 * globals
 */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 */
long logmap_maxnme_commit = 2048;
long logmap_maxnme_async = 4096;
long logmap_maxnme_sync = 6144;
long logmap_maxcfrag_commit = 4;	/* Max canceled fragments per moby */


uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
void handle_dquot(mapentry_t *);

/*
 * GENERIC MAP ROUTINES
 */

#define	CRB_FREE(crb, me) \
	kmem_free(crb->c_buf, crb->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
	kmem_free(crb, sizeof (crb_t)); \
	(me)->me_crb = NULL;

#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me) \
	} \
}
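
/*
 * For orientation, a rough sketch of the cached roll buffer (crb_t)
 * lifecycle as pieced together from logmap_add_buf() below and the macros
 * above: several mapentries describing deltas within the same MAPBLOCKSIZE
 * block may share one crb, so the buffer is reference counted.
 *
 *	me->me_crb = crbsav;
 *	crbsav->c_refcnt++;	// each mapentry that points at the crb
 *	...
 *	CRB_RELE(me);		// drops the count; when it reaches zero,
 *				// CRB_FREE() returns c_buf and the crb to
 *				// the allocator and clears me->me_crb
 */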

/*
 * Check that the old delta has an argument and a push function of
 * ufs_trans_push_quota(), then check that the old and new deltas differ.
 * If so we clean up with handle_dquot() before replacing the old delta.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if ((me->me_arg) && \
	    (me->me_func == ufs_trans_push_quota)) { \
		if (!((me->me_dt == melist->me_dt) && \
		    (me->me_arg == melist->me_arg) && \
		    (me->me_func == melist->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}

/*
 * free up all the mapentries for a map
 */
void
map_free_entries(mt_map_t *mtm)
{
	int		i;
	mapentry_t	*me;

	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		me->me_next->me_prev = me->me_prev;
		me->me_prev->me_next = me->me_next;
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	for (i = 0; i < mtm->mtm_nhash; i++)
		mtm->mtm_hash[i] = NULL;
	mtm->mtm_nme = 0;
	mtm->mtm_nmet = 0;
}

/*
 * done with map; free if necessary
 */
mt_map_t *
map_put(mt_map_t *mtm)
{
	/*
	 * free up the map's memory
	 */
	map_free_entries(mtm);
	ASSERT(map_put_debug(mtm));
	kmem_free(mtm->mtm_hash,
	    (size_t)(sizeof (mapentry_t *) * mtm->mtm_nhash));
	mutex_destroy(&mtm->mtm_mutex);
	mutex_destroy(&mtm->mtm_scan_mutex);
	cv_destroy(&mtm->mtm_to_roll_cv);
	cv_destroy(&mtm->mtm_from_roll_cv);
	rw_destroy(&mtm->mtm_rwlock);
	mutex_destroy(&mtm->mtm_lock);
	cv_destroy(&mtm->mtm_cv_commit);
	cv_destroy(&mtm->mtm_cv_next);
	cv_destroy(&mtm->mtm_cv_eot);
	cv_destroy(&mtm->mtm_cv);
	kmem_free(mtm, sizeof (mt_map_t));
	return (NULL);
}
/*
 * Allocate a map
 */
mt_map_t *
map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
{
	mt_map_t	*mtm;

	/*
	 * assume the map is not here and allocate the necessary structs
	 */
	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
	mtm->mtm_next = (mapentry_t *)mtm;
	mtm->mtm_prev = (mapentry_t *)mtm;
	mtm->mtm_hash = kmem_zalloc((size_t)(sizeof (mapentry_t *) * nh),
	    KM_SLEEP);
	mtm->mtm_nhash = nh;
	mtm->mtm_debug = ul->un_debug;
	mtm->mtm_type = maptype;

	mtm->mtm_cfrags = 0;
	mtm->mtm_cfragmax = logmap_maxcfrag_commit;

	/*
	 * for scan test
	 */
	mtm->mtm_ul = ul;

	/*
	 * Initialize locks
	 */
	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
	ASSERT(map_get_debug(ul, mtm));

	return (mtm);
}

/*
 * DELTAMAP ROUTINES
 */
/*
 * deltamap tuning constants
 */
long	deltamap_maxnme = 1024;	/* global so it can be set */

int
deltamap_need_commit(mt_map_t *mtm)
{
	return (mtm->mtm_nme > deltamap_maxnme);
}
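
/*
 * A short worked example of the per-block split used by the add, remove and
 * cancel loops below (a sketch for orientation only):
 *
 *	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
 *		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
 *		if (hnb > nb)
 *			hnb = nb;
 *		...
 *	}
 *
 * With the 8K MAPBLOCKSIZE implied by the 16-bit sector map in
 * logmap_setup_read(), a delta of nb = 2048 bytes starting at mof = 7K is
 * processed as two pieces: hnb = 8192 - 7168 = 1024 bytes up to the block
 * boundary, then the remaining 1024 bytes starting at mof = 8K.  Every
 * mapentry therefore describes data within a single map block.
 */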

/*
 * put a delta into a deltamap; may sleep on memory
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we lose
		 * the reference to the dquot structure, which will not be
		 * cleaned up by the push function me->me_func as this will
		 * never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
				if (me->me_func == ufs_trans_push_quota) {
					/*
					 * Don't remove quota entries which
					 * have incremented the ref count
					 * (those with a ufs_trans_push_quota
					 * push function).
					 * Let logmap_add[_buf] clean them up.
					 */
					continue;
				}
				break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
			    (!DATAoverlapME(mof, hnb, me)) ||
			    MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				tp->deltas_size += sizeof (struct delta);
			} else {
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * May need to drop & re-grab the mtm_mutex
		 * and then recheck for a duplicate
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		me->me_hash = *mep;
		*mep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * remove deltas within (mof, nb) and return as linked list
 */
mapentry_t *
deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	*mer;

	if (mtm == NULL)
		return (NULL);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);
	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove entries from hash and return as an aged linked list
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if (MEwithinDATA(me, mof, hnb)) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_hash = mer;
				mer = me;
				me->me_flags |= ME_LIST;
				me->me_flags &= ~ME_HASH;
				mtm->mtm_nme--;
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	return (mer);
}

/*
 * delete entries within (mof, nb)
 */
void
deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*me;
	mapentry_t	*menext;

	menext = deltamap_remove(mtm, mof, nb);
	while ((me = menext) != 0) {
		menext = me->me_hash;
		kmem_cache_free(mapentry_cache, me);
	}
}

/*
 * Call the indicated function to cause deltas to move to the logmap.
 * top_end_sync() is the only caller of this function and
 * it has waited for the completion of all threads, so there can
 * be no other activity in the deltamap. Therefore we don't need to
 * hold the deltamap lock.
 */
void
deltamap_push(ml_unit_t *ul)
{
	delta_t		dtyp;
	int		(*func)();
	ulong_t		arg;
	mapentry_t	*me;
	offset_t	mof;
	off_t		nb;
	mt_map_t	*mtm	= ul->un_deltamap;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * for every entry in the deltamap
	 */
	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		ASSERT(me->me_func);
		func = me->me_func;
		dtyp = me->me_dt;
		arg = me->me_arg;
		mof = me->me_mof;
		nb = me->me_nb;
		if ((ul->un_flags & LDL_ERROR) ||
		    (*func)(ul->un_ufsvfs, dtyp, arg))
			deltamap_del(mtm, mof, nb);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * LOGMAP ROUTINES
 */

int
logmap_need_commit(mt_map_t *mtm)
{
	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
}

int
logmap_need_roll_async(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_async);
}

int
logmap_need_roll_sync(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_sync);
}

void
logmap_start_roll(ml_unit_t *ul)
{
	mt_map_t	*logmap	= ul->un_logmap;

	logmap_settail(logmap, ul);
	ASSERT(!(ul->un_flags & LDL_NOROLL));
	mutex_enter(&logmap->mtm_mutex);
	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
		logmap->mtm_flags |= MTM_ROLL_RUNNING;
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
		    TS_RUN, minclsyspri);
	}
	mutex_exit(&logmap->mtm_mutex);
}

void
logmap_kill_roll(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;

	if (mtm == NULL)
		return;

	mutex_enter(&mtm->mtm_mutex);

	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
		mtm->mtm_flags |= MTM_ROLL_EXIT;
		cv_signal(&mtm->mtm_to_roll_cv);
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	}
	mutex_exit(&mtm->mtm_mutex);
}
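
/*
 * A brief sketch of the roll-thread handshake implemented by the routines
 * above and below (descriptive only; the roll work itself is done by
 * trans_roll(), created above):
 *
 *	logmap_start_roll()	creates the thread and sets MTM_ROLL_RUNNING
 *	logmap_kill_roll()	sets MTM_ROLL_EXIT, signals mtm_to_roll_cv
 *				and waits on mtm_from_roll_cv until
 *				MTM_ROLL_RUNNING has been cleared by the
 *				exiting roll thread
 *	logmap_forceroll*()	signal mtm_to_roll_cv to kick the thread when
 *				the logmap has grown too large, with or
 *				without waiting for a cycle to complete
 */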

/*
 * kick the roll thread if it's not doing anything
 */
void
logmap_forceroll_nowait(mt_map_t *logmap)
{
	/*
	 * Don't need to lock mtm_mutex to read mtm_flags here as we
	 * don't care in the rare case when we get a transitional value
	 * of mtm_flags. Just by signalling the thread it will wake up
	 * and notice it has too many logmap entries.
	 */
	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
		cv_signal(&logmap->mtm_to_roll_cv);
	}
}

/*
 * kick the roll thread and wait for it to finish a cycle
 */
void
logmap_forceroll(mt_map_t *mtm)
{
	mutex_enter(&mtm->mtm_mutex);
	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
		mtm->mtm_flags |= MTM_FORCE_ROLL;
		cv_signal(&mtm->mtm_to_roll_cv);
	}
	do {
		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
			goto out;
		}
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
out:
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * remove rolled deltas within (mof, nb) and free them
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by
				 * someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}

/*
 * Find the disk offset of the next delta to roll.
 * Returns 0: no more deltas to roll or a transaction is being committed
 *	   1: a delta to roll has been found and *mofp points
 *	      to the master file disk offset
 */
int
logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
{
	mapentry_t *me;

	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));

	mutex_enter(&logmap->mtm_mutex);
	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
	    me = me->me_next) {
		/* already rolled */
		if (me->me_flags & ME_ROLL) {
			continue;
		}

		/* part of currently busy transaction; stop */
		if (me->me_tid == logmap->mtm_tid) {
			break;
		}

		/* part of commit-in-progress transaction; stop */
		if (me->me_tid == logmap->mtm_committid) {
			break;
		}

		/*
		 * We shouldn't see a DT_CANCEL mapentry whose
		 * tid != mtm_committid, or != mtm_tid, since
		 * these are removed at the end of each committed
		 * transaction.
		 */
		ASSERT(!(me->me_dt == DT_CANCEL));

		*mofp = me->me_mof;
		mutex_exit(&logmap->mtm_mutex);
		return (1);
	}
	mutex_exit(&logmap->mtm_mutex);
	return (0);
}

/*
 * put mapentry on sorted age list
 */
static void
logmap_list_age(mapentry_t **age, mapentry_t *meadd)
{
	mapentry_t	*me;

	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));

	for (me = *age; me; age = &me->me_agenext, me = *age) {
		if (me->me_age > meadd->me_age)
			break;
	}
	meadd->me_agenext = me;
	meadd->me_flags |= ME_AGE;
	*age = meadd;
}

/*
 * get a list of deltas within <mof, mof+nb>
 * returns with mtm_rwlock held
 * return value says whether the entire mof range is covered by deltas
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype	= RW_READER;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;
	int		entire	= 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}
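
/*
 * A note on the retry pattern above (and in logmap_free_cancel() and
 * logmap_remove_roll()): ME_AGE marks an entry that currently sits on some
 * age-ordered list, typically because the roll thread or a reader
 * (lufs_read_strategy()) is working on it.  Rather than block with the hash
 * mutex held, the lookup unwinds whatever it has collected so far, clears
 * the ME_AGE bits it set, drops its locks, and restarts from the saved
 * (savmof, savnb) range - upgrading to RW_WRITER here, or retaking
 * mtm_rwlock as a writer in the other two routines.
 */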
/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit in the list,
		 * upgrade the lock, and try again
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
	    logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}

void
logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
}

void
logmap_list_put(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
	rw_exit(&mtm->mtm_rwlock);
}

#define	UFS_RW_BALANCE 2
int ufs_rw_balance = UFS_RW_BALANCE;

/*
 * Check if we need to read the master.
 * The master does not need to be read if the log deltas to the
 * block are for one contiguous set of full disk sectors.
 * Both cylinder group bit maps DT_CG (8K), directory entries (512B),
 * and possibly others should not require master disk reads.
 * Calculate the sector map for writing later.
 */
int
logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
{
	offset_t mof;
	crb_t *crb;
	mapentry_t *me;
	int32_t nb;
	int i;
	int start_sec, end_sec;
	int read_needed = 0;
	int all_inodes = 1;
	int first_sec = INT_MAX;
	int last_sec = -1;
	rbsecmap_t secmap = 0;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));

	for (me = age; me; me = me->me_agenext) {
		crb = me->me_crb;
		if (crb) {
			nb = crb->c_nb;
			mof = crb->c_mof;
		} else {
			nb = me->me_nb;
			mof = me->me_mof;
		}

		/*
		 * If the delta is not sector aligned then
		 * read the whole block.
		 */
		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
			read_needed = 1;
		}

		/* Set sector map used in the MAPBLOCKSIZE block. */
		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
		for (i = start_sec; i <= end_sec; i++) {
			secmap |= UINT16_C(1) << i;
		}

		if (me->me_dt != DT_INODE) {
			all_inodes = 0;
		}
		if (start_sec < first_sec) {
			first_sec = start_sec;
		}
		if (end_sec > last_sec) {
			last_sec = end_sec;
		}
	}

	ASSERT(secmap);
	ASSERT(first_sec != INT_MAX);
	ASSERT(last_sec != -1);

	if (all_inodes) {
		/*
		 * Here we have a tradeoff choice. It must be better to
		 * do 2 writes in the same MAPBLOCKSIZE chunk than a
		 * read and a write. But what about 3 or more writes, versus
		 * a read+write? Where is the cut over? It will depend on
		 * the track caching, scsi driver and other activity.
		 * An unpublished tunable is defined (ufs_rw_balance) that
		 * currently defaults to 2.
		 */
		if (!read_needed) {
			int count = 0, gap = 0;
			int sector_set; /* write needed to this sector */

			/* Count the gaps (every 1 to 0 transition) */
			for (i = first_sec + 1; i < last_sec; i++) {
				sector_set = secmap & (UINT16_C(1) << i);
				if (!gap && !sector_set) {
					gap = 1;
					count++;
					if (count > ufs_rw_balance) {
						read_needed = 1;
						break;
					}
				} else if (gap && sector_set) {
					gap = 0;
				}
			}
		}

		/*
		 * Inodes commonly make up the majority (~85%) of deltas.
		 * They cannot contain embedded user data, so it's safe to
		 * read and write them all in one IO.
		 * But for directory entries, shadow inode data, and
		 * quota record data the user data fragments can be embedded
		 * between that metadata, and so it's not safe to read, modify
		 * then write the entire range as asynchronous user data
		 * writes could get overwritten with old data.
		 * Thus we have to create a segment map of metadata that
		 * needs to get written.
		 *
		 * If user data was logged then this issue would go away.
		 */
		if (read_needed) {
			for (i = first_sec + 1; i < last_sec; i++) {
				secmap |= (UINT16_C(1) << i);
			}
		}
	}
	rbp->rb_secmap = secmap;
	return (read_needed);
}
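
/*
 * A worked example of the sector map built above (a sketch assuming the
 * usual 512-byte DEV_BSIZE, so the 16-bit secmap covers one 8K MAPBLOCKSIZE
 * block): two inode deltas, 128 bytes at block offset 0 and 256 bytes at
 * block offset 0x1100, give start_sec/end_sec of 0 and 8 respectively, so
 * secmap initially has only bits 0 and 8 set.  Neither delta covers whole
 * sectors, so read_needed is set; and because all_inodes holds, the final
 * loop ORs in bits 1 through 7 as well, making the read/modify/write cover
 * sectors 0-8 of the block as one contiguous range, which is safe since
 * inodes carry no embedded user data.
 */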

/*
 * Abort the load of a set of log map deltas.
 * ie,
 *	Clear out all mapentries on this unit's log map
 *	which have a tid (transaction id) equal to the
 *	parameter tid.  Walk the cancel list, taking everything
 *	off it, too.
 */
static void
logmap_abort(ml_unit_t *ul, uint32_t tid)
{
	struct mt_map	*mtm	= ul->un_logmap;	/* Log map */
	mapentry_t	*me,
			**mep;
	int		i;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * wait for any outstanding reads to finish; lock out future reads
	 */
	rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	mutex_enter(&mtm->mtm_mutex);
	/* Take everything off cancel list */
	while ((me = mtm->mtm_cancel) != NULL) {
		mtm->mtm_cancel = me->me_cancel;
		me->me_flags &= ~ME_CANCEL;
		me->me_cancel = NULL;
	}

	/*
	 * Now take out all mapentries with current tid, and committid
	 * as this function is called from logmap_logscan and logmap_commit
	 * When it is called from logmap_logscan mtm_tid == mtm_committid
	 * But when logmap_abort is called from logmap_commit it is
	 * because the log errored when trying to write the commit record,
	 * after the async ops have been allowed to start in top_end_sync.
	 * So we also need to remove all mapentries from the transaction whose
	 * commit failed.
	 */
	for (i = 0; i < mtm->mtm_nhash; i++) {
		mep = &mtm->mtm_hash[i];
		while ((me = *mep) != NULL) {
			if (me->me_tid == tid ||
			    me->me_tid == mtm->mtm_committid) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				if (!(me->me_flags & ME_USER)) {
					mtm->mtm_nme--;
				}
				CRB_RELE(me);
				kmem_cache_free(mapentry_cache, me);
				continue;
			}
			mep = &me->me_hash;
		}
	}

	if (!(ul->un_flags & LDL_SCAN))
		mtm->mtm_flags |= MTM_CANCELED;
	mutex_exit(&mtm->mtm_mutex);
	mtm->mtm_dirty = 0;
	mtm->mtm_nmet = 0;
	rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
{
	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	while (!ldl_has_space(ul, me)) {
		ASSERT(!(ul->un_flags & LDL_NOROLL));
		mutex_exit(&ul->un_log_mutex);
		logmap_forceroll(mtm);
		mutex_enter(&ul->un_log_mutex);
		if (ul->un_flags & LDL_ERROR)
			break;
	}

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
}

/*
 * put a list of deltas into a logmap
 * If va == NULL, don't write to the log.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t vamof,			/* Offset on master of buf start */
	mapentry_t *melist)		/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot
			 * structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}
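
/*
 * A short note contrasting the two add paths (a summary of the code rather
 * than an additional interface): logmap_add() above attaches no cached roll
 * buffer, so rolling those deltas later generally means re-reading the
 * master device (see logmap_setup_read()).  logmap_add_buf() below tries to
 * keep a private in-memory copy (crb_t) of the caller's buffer so the roll
 * thread can write without re-reading, but only while total crb memory
 * stays under ufs_crb_limit and kmem_avail() still exceeds the buffer size;
 * otherwise it just counts the failure in ufs_crb_alloc_fails and proceeds
 * without a crb.
 */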

/*
 * Add the delta(s) into the log.
 * Create one cached roll buffer logmap entry, and reference count the
 * number of mapentries referring to it.
 * Cancel previous logmap entries.
 * logmap_add is tolerant of failure to allocate a cached roll buffer.
 */
void
logmap_add_buf(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t bufmof,		/* Offset on master of buf start */
	mapentry_t *melist,		/* Entries to add */
	caddr_t buf,			/* Buffer containing delta(s) */
	uint32_t bufsz)			/* Size of buf */
{
	offset_t	mof;
	offset_t	vamof = bufmof + (va - buf);
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;
	crb_t		*crbsav = NULL;

	ASSERT((bufsz & DEV_BMASK) == 0);
	mutex_enter(&ul->un_log_mutex);
	logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot
			 * structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			crb = me->me_crb;
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					/*
					 * If this soon to be deleted mapentry
					 * has a suitable roll buffer then
					 * re-use it.
					 */
					if (crb && (--crb->c_refcnt == 0)) {
						if (crbsav ||
						    (crb->c_nb != bufsz)) {
							CRB_FREE(crb, me);
						} else {
							bcopy(buf, crb->c_buf,
							    bufsz);
							crb->c_invalid = 0;
							crb->c_mof = bufmof;
							crbsav = crb;
							me->me_crb = NULL;
						}
					}
					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}

			/*
			 * Inode deltas within the same fs block come
			 * in individually as separate calls to logmap_add().
			 * All others come in as one call. So check for an
			 * existing entry where we can re-use the crb.
			 */
			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
			    !crbsav && crb &&
			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
				ASSERT(crb->c_mof == bufmof);
				ASSERT(crb->c_nb == bufsz);
				bcopy(buf, crb->c_buf, bufsz);
				crbsav = crb;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * If we don't already have a crb then allocate one
		 * and copy the incoming buffer. Only do this once
		 * for all the incoming deltas.
		 */
		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
			/*
			 * Only use a cached roll buffer if we
			 * have enough memory, and check for failures.
			 */
			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
			    (kmem_avail() > bufsz)) {
				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
			} else {
				ufs_crb_alloc_fails++;
			}
			if (crbsav) {
				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
				if (crbsav->c_buf) {
					atomic_add_64(&ufs_crb_size,
					    (uint64_t)bufsz);
					if (ufs_crb_size > ufs_crb_max_size) {
						ufs_crb_max_size = ufs_crb_size;
					}
					bcopy(buf, crbsav->c_buf, bufsz);
					crbsav->c_nb = bufsz;
					crbsav->c_refcnt = 0;
					crbsav->c_invalid = 0;
					ASSERT((bufmof & DEV_BMASK) == 0);
					crbsav->c_mof = bufmof;
				} else {
					kmem_free(crbsav, sizeof (crb_t));
					crbsav = NULL;
				}
			}
		}

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		me->me_crb = crbsav;
		if (crbsav) {
			crbsav->c_refcnt++;
		}
		crbsav = NULL;

		ASSERT(va);
		ldl_write(ul, va, vamof, me); /* add to on-disk log */
		if (ul->un_flags & LDL_ERROR) {
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * free up any cancelled deltas
 */
void
logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
{
	int		dolock	= 0;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	/*
	 * At EOT, cancel the indicated deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_flags & MTM_CANCELED) {
		mtm->mtm_flags &= ~MTM_CANCELED;
		ASSERT(dolock == 0);
		mutex_exit(&mtm->mtm_mutex);
		return;
	}

	while ((me = *cancelhead) != NULL) {
		/*
		 * roll forward or read collision; wait and try again
		 */
		if (me->me_flags & ME_AGE) {
			ASSERT(dolock == 0);
			mutex_exit(&mtm->mtm_mutex);
			dolock = 1;
			goto again;
		}
		/*
		 * remove from cancel list
		 */
		*cancelhead = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~(ME_CANCEL);

		/*
		 * logmap_remove_roll handles ME_ROLL entries later
		 * we leave them around for logmap_iscancel
		 * XXX is this necessary?
		 */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * remove from hash (if necessary)
		 */
		if (me->me_flags & ME_HASH) {
			mep = MAP_HASH(me->me_mof, mtm);
			while (*mep) {
				if (*mep == me) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					me->me_flags &= ~(ME_HASH);
					if (!(me->me_flags & ME_USER)) {
						mtm->mtm_nme--;
					}
					break;
				} else
					mep = &(*mep)->me_hash;
			}
		}
		/*
		 * put the entry on the free list
		 */
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	mutex_exit(&mtm->mtm_mutex);
	if (dolock)
		rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}


void
logmap_commit(ml_unit_t *ul, uint32_t tid)
{
	mapentry_t	me;
	mt_map_t	*mtm	= ul->un_logmap;


	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * async'ly write a commit rec into the log
	 */
	if (mtm->mtm_dirty) {
		/*
		 * put commit record into log
		 */
		me.me_mof = mtm->mtm_tid;
		me.me_dt = DT_COMMIT;
		me.me_nb = 0;
		me.me_hash = NULL;
		logmap_wait_space(mtm, ul, &me);
		ldl_write(ul, NULL, (offset_t)0, &me);
		ldl_round_commit(ul);

		/*
		 * abort on error; else reset dirty flag
		 */
		if (ul->un_flags & LDL_ERROR)
			logmap_abort(ul, tid);
		else {
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_cfrags = 0;
		}
		/* push commit */
		ldl_push_commit(ul);
	}
}
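
/*
 * For orientation: the commit record written above is just a mapentry on
 * the stack with me_dt = DT_COMMIT, me_nb = 0 and the transaction id in
 * me_mof; ldl_write() gets a NULL buffer because a commit carries no data.
 * During log scan (logmap_logscan_add() below) encountering such a record
 * is what frees the cancel list, bumps mtm_tid/mtm_committid and records
 * the tail position, so only fully committed transactions survive replay.
 */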

void
logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	uint32_t	tid;
	mapentry_t	*me;

	/*
	 * move the head forward so the log knows how full it is
	 * Make sure to skip any mapentry whose me_lof is 0, these
	 * are just place holders for DT_CANCELED freed user blocks
	 * for the current moby.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	me = mtm->mtm_next;
	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
		me = me->me_next;
	}

	if (me == (mapentry_t *)mtm)
		lof = -1;
	else {
		lof = me->me_lof;
		tid = me->me_tid;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_sethead(ul, lof, tid);
	if (lof == -1)
		mtm->mtm_age = 0;
	mutex_exit(&ul->un_log_mutex);
}

void
logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	size_t		nb;

	/*
	 * set the tail after the logmap_abort
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_prev == (mapentry_t *)mtm)
		lof = -1;
	else {
		/*
		 * set the tail to the end of the last commit
		 */
		lof = mtm->mtm_tail_lof;
		nb = mtm->mtm_tail_nb;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_settail(ul, lof, nb);
	mutex_exit(&ul->un_log_mutex);
}

/*
 * when resetting a device, roll the log until every
 * delta has been rolled forward
 */
void
logmap_roll_dev(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	*me;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;

again:
	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
		return;

	/*
	 * look for deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
		if (me->me_flags & ME_ROLL)
			break;
		if (me->me_tid == mtm->mtm_tid)
			continue;
		if (me->me_tid == mtm->mtm_committid)
			continue;
		break;
	}

	/*
	 * found a delta; kick the roll thread
	 * but only if the thread is running... (jmh)
	 */
	if (me != (mapentry_t *)mtm) {
		mutex_exit(&mtm->mtm_mutex);
		logmap_forceroll(mtm);
		goto again;
	}

	/*
	 * no more deltas, return
	 */
	mutex_exit(&mtm->mtm_mutex);
	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
{
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	int		frags;

	/*
	 * map has been referenced and is dirty
	 */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;

	/*
	 * get a mapentry
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));

	/*
	 * initialize cancel record and put in logmap
	 */
	me->me_mof = mof;
	me->me_nb = nb;
	me->me_dt = DT_CANCEL;
	me->me_tid = mtm->mtm_tid;
	me->me_hash = NULL;

	/*
	 * Write delta to log if this delta is for metadata. If this is not
	 * metadata it is user data and we are just putting a cancel
	 * mapentry into the hash to cancel a user block deletion
	 * in which we do not want the block to be allocated
	 * within this moby. This cancel entry will prevent the block from
	 * being allocated within the moby and prevent user data corruption
	 * if we happen to crash before this moby is committed.
	 */
	mutex_enter(&ul->un_log_mutex);
	if (metadata) {
		logmap_wait_space(mtm, ul, me);
		ldl_write(ul, NULL, (offset_t)0, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			mutex_exit(&ul->un_log_mutex);
			return;
		}
	}

	/*
	 * put in hash and on cancel list
	 */
	mep = MAP_HASH(mof, mtm);
	mutex_enter(&mtm->mtm_mutex);
	me->me_age = mtm->mtm_age++;
	me->me_hash = *mep;
	*mep = me;
	me->me_next = (mapentry_t *)mtm;
	me->me_prev = mtm->mtm_prev;
	mtm->mtm_prev->me_next = me;
	mtm->mtm_prev = me;
	me->me_cancel = mtm->mtm_cancel;
	mtm->mtm_cancel = me;
	if (metadata) {
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
	} else {
		me->me_flags = ME_USER;
	}
	me->me_flags |= (ME_HASH|ME_CANCEL);
	if (!(metadata)) {
		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
		if (frags)
			mtm->mtm_cfrags += numfrags(ul->un_ufsvfs->vfs_fs,
			    frags);
	}
	mutex_exit(&mtm->mtm_mutex);

	mutex_exit(&ul->un_log_mutex);
}
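
/*
 * A worked example of the fragment accounting just above (a sketch assuming
 * a common 8K block / 1K fragment geometry): cancelling a 3K user
 * allocation gives blkoff(fs, 3072) = 3072, a partial block, so
 * numfrags(fs, 3072) = 3 fragments are added to mtm_cfrags.  Once
 * mtm_cfrags reaches mtm_cfragmax (logmap_maxcfrag_commit, default 4),
 * logmap_need_commit() asks for a commit, presumably so freed fragments do
 * not stay unallocatable behind their cancel entries for too long.
 */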

/*
 * cancel entries in a logmap (entries are freed at EOT)
 */
void
logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Find overlapping metadata entries.  Don't search through
		 * the hash chains if this is user data because it is only
		 * possible to have overlapping map entries for metadata,
		 * and the search can become expensive for large files.
		 */
		if (metadata) {
			mep = MAP_HASH(mof, mtm);
			mutex_enter(&mtm->mtm_mutex);
			for (me = *mep; me; me = me->me_hash) {
				if (!DATAoverlapME(mof, hnb, me))
					continue;

				ASSERT(MEwithinDATA(me, mof, hnb));

				if ((me->me_flags & ME_CANCEL) == 0) {
					me->me_cancel = mtm->mtm_cancel;
					mtm->mtm_cancel = me;
					me->me_flags |= ME_CANCEL;
					crb = me->me_crb;
					if (crb) {
						crb->c_invalid = 1;
					}
				}
			}
			mutex_exit(&mtm->mtm_mutex);
		}

		/*
		 * put a cancel record into the log
		 */
		logmap_cancel_delta(ul, mof, hnb, metadata);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * check for overlap w/cancel delta
 */
int
logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * search for dup entry
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (((me->me_flags & ME_ROLL) == 0) &&
			    (me->me_dt != DT_CANCEL))
				continue;
			if (DATAoverlapME(mof, hnb, me))
				break;
		}

		/*
		 * overlap detected
		 */
		if (me) {
			mutex_exit(&mtm->mtm_mutex);
			return (1);
		}
	}
	mutex_exit(&mtm->mtm_mutex);
	return (0);
}

static int
logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
{
	mapentry_t	*me;
	int		error;
	mt_map_t	*mtm	= ul->un_logmap;

	/*
	 * verify delta header; failure == mediafail
	 */
	error = 0;
	/* delta type */
	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
		error = EINVAL;
	if (dp->d_typ == DT_COMMIT) {
		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
			error = EINVAL;
	} else {
		/* length of delta */
		if ((dp->d_nb < INT32_C(0)) ||
		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
			error = EINVAL;

		/* offset on master device */
		if (dp->d_mof < INT64_C(0))
			error = EINVAL;
	}

	if (error) {
		ldl_seterror(ul, "Error processing ufs log data during scan");
		return (error);
	}

	/*
	 * process commit record
	 */
	if (dp->d_typ == DT_COMMIT) {
		if (mtm->mtm_dirty) {
			ASSERT(dp->d_nb == INT32_C(0));
			logmap_free_cancel(mtm, &mtm->mtm_cancel);
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_tid++;
			mtm->mtm_committid = mtm->mtm_tid;
			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
			    logmap_logscan_commit_debug(lof, mtm));
		}
		/*
		 * return #bytes to next sector (next delta header)
		 */
		*nbp = ldl_logscan_nbcommit(lof);
		mtm->mtm_tail_lof = lof;
		mtm->mtm_tail_nb = *nbp;
		return (0);
	}

	/*
	 * add delta to logmap
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));
	me->me_lof = lof;
	me->me_mof = dp->d_mof;
	me->me_nb = dp->d_nb;
	me->me_tid = mtm->mtm_tid;
	me->me_dt = dp->d_typ;
	me->me_hash = NULL;
	me->me_flags = (ME_LIST | ME_SCAN);
	logmap_add(ul, NULL, 0, me);
	switch (dp->d_typ) {
	case DT_CANCEL:
		me->me_flags |= ME_CANCEL;
		me->me_cancel = mtm->mtm_cancel;
		mtm->mtm_cancel = me;
		break;
	default:
		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
		    logmap_logscan_add_debug(dp, mtm));
		break;
	}

sizeofdelta:
	/*
	 * return #bytes till next delta header
	 */
	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
		*nbp = 0;
	else
		*nbp = dp->d_nb;
	return (0);
}

void
logmap_logscan(ml_unit_t *ul)
{
	size_t		nb, nbd;
	off_t		lof;
	struct delta	delta;
	mt_map_t	*logmap	= ul->un_logmap;

	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);

	/*
	 * prepare the log for a logscan
	 */
	ldl_logscan_begin(ul);

	/*
	 * prepare the logmap for a logscan
	 */
	(void) map_free_entries(logmap);
	logmap->mtm_tid = 0;
	logmap->mtm_committid = UINT32_C(0);
	logmap->mtm_age = 0;
	logmap->mtm_dirty = 0;
	logmap->mtm_ref = 0;

	/*
	 * while not at end of log
	 *	read delta header
	 *	add to logmap
	 *	seek to beginning of next delta
	 */
	lof = ul->un_head_lof;
	nbd = sizeof (delta);
	while (lof != ul->un_tail_lof) {

		/* read delta header */
		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
			break;

		/* add to logmap */
		if (logmap_logscan_add(ul, &delta, lof, &nb))
			break;

		/* seek to next header (skip data) */
		if (ldl_logscan_read(ul, &lof, nb, NULL))
			break;
	}

	/*
	 * remove the last partial transaction from the logmap
	 */
	logmap_abort(ul, logmap->mtm_tid);

	ldl_logscan_end(ul);
}

void
_init_map(void)
{
	/*
	 * Initialise the mapentry cache. No constructor or destructor
	 * is needed. Also no reclaim function is supplied as reclaiming
	 * current entries is not possible.
	 */
	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Special case when we replace an old map entry which carries quota
 * information with a newer entry which does not.
 * In that case the push function would not be called to clean up the
 * dquot structure. This would be found later by invalidatedq() causing
 * a panic when the filesystem is unmounted.
 * We clean up the dquot manually before replacing the map entry.
 */
void
handle_dquot(mapentry_t *me)
{
	int dolock = 0;
	int domutex = 0;
	struct dquot *dqp;

	dqp = (struct dquot *)me->me_arg;

	/*
	 * We need vfs_dqrwlock to call dqput()
	 */
	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
	if (dolock)
		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);

	domutex = (!MUTEX_HELD(&dqp->dq_lock));
	if (domutex)
		mutex_enter(&dqp->dq_lock);

	/*
	 * Only clean up if the dquot is referenced
	 */
	if (dqp->dq_cnt == 0) {
		if (domutex)
			mutex_exit(&dqp->dq_lock);
		if (dolock)
			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
		return;
	}

	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
	dqput(dqp);

	if (domutex)
		mutex_exit(&dqp->dq_lock);

	if (dolock)
		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);

}