/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/inttypes.h>
#include <sys/atomic.h>
#include <sys/tuneable.h>

/*
 * externs
 */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
extern int ufs_trans_push_quota();

/*
 * globals
 */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 */
long logmap_maxnme_commit = 2048;
long logmap_maxnme_async = 4096;
long logmap_maxnme_sync = 6144;
long logmap_maxcfrag_commit = 4;	/* Max canceled fragments per moby */


uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
void handle_dquot(mapentry_t *);

/*
 * GENERIC MAP ROUTINES
 */

#define	CRB_FREE(crb, me) \
	kmem_free(crb->c_buf, crb->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
	kmem_free(crb, sizeof (crb_t)); \
	(me)->me_crb = NULL;

#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me) \
	} \
}

/*
 * Check that the old delta has an argument and a push function of
 * ufs_trans_push_quota(), then check that the old and new deltas differ.
 * If so we clean up with handle_dquot() before replacing the old delta.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if ((me->me_arg) && \
	    (me->me_func == ufs_trans_push_quota)) { \
		if (!((me->me_dt == melist->me_dt) && \
		    (me->me_arg == melist->me_arg) && \
		    (me->me_func == melist->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}

/*
 * free up all the mapentries for a map
 */
void
map_free_entries(mt_map_t *mtm)
{
	int		i;
	mapentry_t	*me;

	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		me->me_next->me_prev = me->me_prev;
		me->me_prev->me_next = me->me_next;
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	for (i = 0; i < mtm->mtm_nhash; i++)
		mtm->mtm_hash[i] = NULL;
	mtm->mtm_nme = 0;
	mtm->mtm_nmet = 0;
}

/*
 * done with map; free if necessary
 */
mt_map_t *
map_put(mt_map_t *mtm)
{
	/*
	 * free up the map's memory
	 */
	map_free_entries(mtm);
	ASSERT(map_put_debug(mtm));
	kmem_free(mtm->mtm_hash,
	    (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
	mutex_destroy(&mtm->mtm_mutex);
	mutex_destroy(&mtm->mtm_scan_mutex);
	cv_destroy(&mtm->mtm_to_roll_cv);
	cv_destroy(&mtm->mtm_from_roll_cv);
	rw_destroy(&mtm->mtm_rwlock);
	mutex_destroy(&mtm->mtm_lock);
	cv_destroy(&mtm->mtm_cv_commit);
	cv_destroy(&mtm->mtm_cv_next);
	cv_destroy(&mtm->mtm_cv_eot);
	cv_destroy(&mtm->mtm_cv);
	kmem_free(mtm, sizeof (mt_map_t));
	return (NULL);
}

/*
 * Allocate a map.
 */
mt_map_t *
map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
{
	mt_map_t	*mtm;

	/*
	 * assume the map is not here and allocate the necessary structs
	 */
	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
	mtm->mtm_next = (mapentry_t *)mtm;
	mtm->mtm_prev = (mapentry_t *)mtm;
	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
	    KM_SLEEP);
	mtm->mtm_nhash = nh;
	mtm->mtm_debug = ul->un_debug;
	mtm->mtm_type = maptype;

	mtm->mtm_cfrags = 0;
	mtm->mtm_cfragmax = logmap_maxcfrag_commit;

	/*
	 * for scan test
	 */
	mtm->mtm_ul = ul;

	/*
	 * Initialize locks
	 */
	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
	ASSERT(map_get_debug(ul, mtm));

	return (mtm);
}

/*
 * DELTAMAP ROUTINES
 */
/*
 * deltamap tuning constants
 */
long	deltamap_maxnme = 1024;	/* global so it can be set */

int
deltamap_need_commit(mt_map_t *mtm)
{
	return (mtm->mtm_nme > deltamap_maxnme);
}

/*
 * put a delta into a deltamap; may sleep on memory
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
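		/*
		 * The delta is processed in MAPBLOCKSIZE-aligned chunks;
		 * hnb is the part of the delta that falls within the
		 * current MAPBLOCKSIZE block.
		 */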
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we would
		 * lose the reference to the dquot structure, which would
		 * never be cleaned up by the push function me->me_func
		 * since it would never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
				/*
				 * Don't remove quota entries which have
				 * incremented the ref count (those with a
				 * ufs_trans_push_quota push function).
				 * Let logmap_add[_buf] clean them up.
				 */
				if (me->me_func == ufs_trans_push_quota) {
					continue;
				}
				break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
			    (!DATAoverlapME(mof, hnb, me)) ||
			    MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				tp->deltas_size += sizeof (struct delta);
			} else {
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * May need to drop & re-grab the mtm_mutex
		 * and then recheck for a duplicate
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		me->me_hash = *mep;
		*mep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * remove deltas within (mof, nb) and return as linked list
 */
mapentry_t *
deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	*mer;

	if (mtm == NULL)
		return (NULL);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);
	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove entries from hash and return as an aged linked list
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if (MEwithinDATA(me, mof, hnb)) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_hash = mer;
				mer = me;
				me->me_flags |= ME_LIST;
				me->me_flags &= ~ME_HASH;
				mtm->mtm_nme--;
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	return (mer);
}

/*
 * delete entries within (mof, nb)
 */
void
deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*me;
	mapentry_t	*menext;

	menext = deltamap_remove(mtm, mof, nb);
	while ((me = menext) != 0) {
		menext = me->me_hash;
		kmem_cache_free(mapentry_cache, me);
	}
}

/*
 * Call the indicated function to cause deltas to move to the logmap.
 * top_end_sync() is the only caller of this function and
 * it has waited for the completion of all threads, so there can
 * be no other activity in the deltamap. Therefore we don't need to
 * hold the deltamap lock.
 */
void
deltamap_push(ml_unit_t *ul)
{
	delta_t		dtyp;
	int		(*func)();
	ulong_t		arg;
	mapentry_t	*me;
	offset_t	mof;
	off_t		nb;
	mt_map_t	*mtm	= ul->un_deltamap;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * for every entry in the deltamap
	 */
	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		ASSERT(me->me_func);
		func = me->me_func;
		dtyp = me->me_dt;
		arg = me->me_arg;
		mof = me->me_mof;
		nb = me->me_nb;
		if ((ul->un_flags & LDL_ERROR) ||
		    (*func)(ul->un_ufsvfs, dtyp, arg))
			deltamap_del(mtm, mof, nb);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * LOGMAP ROUTINES
 */

int
logmap_need_commit(mt_map_t *mtm)
{
	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
}

int
logmap_need_roll_async(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_async);
}

int
logmap_need_roll_sync(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_sync);
}

void
logmap_start_roll(ml_unit_t *ul)
{
	mt_map_t	*logmap	= ul->un_logmap;

	logmap_settail(logmap, ul);
	ASSERT(!(ul->un_flags & LDL_NOROLL));
	mutex_enter(&logmap->mtm_mutex);
	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
		logmap->mtm_flags |= MTM_ROLL_RUNNING;
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
		    TS_RUN, minclsyspri);
	}
	mutex_exit(&logmap->mtm_mutex);
}

void
logmap_kill_roll(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;

	if (mtm == NULL)
		return;

	mutex_enter(&mtm->mtm_mutex);

	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
		mtm->mtm_flags |= MTM_ROLL_EXIT;
		cv_signal(&mtm->mtm_to_roll_cv);
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	}
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * kick the roll thread if it's not doing anything
 */
void
logmap_forceroll_nowait(mt_map_t *logmap)
{
	/*
	 * Don't need to lock mtm_mutex to read mtm_flags here as we
	 * don't care in the rare case when we get a transitional value
	 * of mtm_flags. Just by signalling the thread it will wake up
	 * and notice it has too many logmap entries.
	 */
	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
		cv_signal(&logmap->mtm_to_roll_cv);
	}
}

/*
 * kick the roll thread and wait for it to finish a cycle
 */
void
logmap_forceroll(mt_map_t *mtm)
{
	mutex_enter(&mtm->mtm_mutex);
	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
		mtm->mtm_flags |= MTM_FORCE_ROLL;
		cv_signal(&mtm->mtm_to_roll_cv);
	}
	do {
		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
			goto out;
		}
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
out:
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * remove rolled deltas within (mof, nb) and free them
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}

/*
 * Find the disk offset of the next delta to roll.
 * Returns 0: no more deltas to roll or a transaction is being committed
 *	   1: a delta to roll has been found and *mofp points
 *	      to the master file disk offset
 */
int
logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
{
	mapentry_t	*me;

	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));

	mutex_enter(&logmap->mtm_mutex);
	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
	    me = me->me_next) {
		/* already rolled */
		if (me->me_flags & ME_ROLL) {
			continue;
		}

		/* part of currently busy transaction; stop */
		if (me->me_tid == logmap->mtm_tid) {
			break;
		}

		/* part of commit-in-progress transaction; stop */
		if (me->me_tid == logmap->mtm_committid) {
			break;
		}

		/*
		 * We shouldn't see a DT_CANCEL mapentry whose tid is
		 * neither mtm_tid nor mtm_committid, since those entries
		 * are removed at the end of each committed transaction.
		 */
		ASSERT(!(me->me_dt == DT_CANCEL));

		*mofp = me->me_mof;
		mutex_exit(&logmap->mtm_mutex);
		return (1);
	}
	mutex_exit(&logmap->mtm_mutex);
	return (0);
}

/*
 * put mapentry on sorted age list
 */
static void
logmap_list_age(mapentry_t **age, mapentry_t *meadd)
{
	mapentry_t	*me;

	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));

	for (me = *age; me; age = &me->me_agenext, me = *age) {
		if (me->me_age > meadd->me_age)
			break;
	}
	meadd->me_agenext = me;
	meadd->me_flags |= ME_AGE;
	*age = meadd;
}

/*
 * get a list of deltas within <mof, mof+nb>
 * returns with mtm_rwlock held
 * return value says whether the entire mof range is covered by deltas
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype	= RW_READER;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;
	int		entire	= 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}

/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit in the list,
		 * upgrade the lock, and try again
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
	    logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}

void
logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
}

void
logmap_list_put(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
	rw_exit(&mtm->mtm_rwlock);
}

#define	UFS_RW_BALANCE 2
int ufs_rw_balance = UFS_RW_BALANCE;

/*
 * Check if we need to read the master.
 * The master does not need to be read if the log deltas to the
 * block are for one contiguous set of full disk sectors.
 * Both cylinder group bit maps DT_CG (8K), directory entries (512B),
 * and possibly others should not require master disk reads.
 * Calculate the sector map for writing later. The sector map has one
 * bit per DEV_BSIZE sector of the MAPBLOCKSIZE block; a set bit marks
 * a sector touched by the deltas.
 */
int
logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
{
	offset_t mof;
	crb_t *crb;
	mapentry_t *me;
	int32_t nb;
	int i;
	int start_sec, end_sec;
	int read_needed = 0;
	int all_inodes = 1;
	int first_sec = INT_MAX;
	int last_sec = -1;
	rbsecmap_t secmap = 0;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));

	for (me = age; me; me = me->me_agenext) {
		crb = me->me_crb;
		if (crb) {
			nb = crb->c_nb;
			mof = crb->c_mof;
		} else {
			nb = me->me_nb;
			mof = me->me_mof;
		}

		/*
		 * If the delta is not sector aligned then
		 * read the whole block.
		 */
		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
			read_needed = 1;
		}

		/* Set sector map used in the MAPBLOCKSIZE block. */
		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
		for (i = start_sec; i <= end_sec; i++) {
			secmap |= UINT16_C(1) << i;
		}

		if (me->me_dt != DT_INODE) {
			all_inodes = 0;
		}
		if (start_sec < first_sec) {
			first_sec = start_sec;
		}
		if (end_sec > last_sec) {
			last_sec = end_sec;
		}
	}

	ASSERT(secmap);
	ASSERT(first_sec != INT_MAX);
	ASSERT(last_sec != -1);

	if (all_inodes) {
		/*
		 * Here we have a tradeoff choice. It must be better to
		 * do 2 writes in the same MAPBLOCKSIZE chunk than a
		 * read and a write. But what about 3 or more writes, versus
		 * a read+write? Where is the cut-over? It will depend on
		 * the track caching, scsi driver and other activity.
		 * An unpublished tunable is defined (ufs_rw_balance) that
		 * currently defaults to 2.
		 */
		if (!read_needed) {
			int count = 0, gap = 0;
			int sector_set; /* write needed to this sector */

			/* Count the gaps (every 1 to 0 transition) */
			for (i = first_sec + 1; i < last_sec; i++) {
				sector_set = secmap & (UINT16_C(1) << i);
				if (!gap && !sector_set) {
					gap = 1;
					count++;
					if (count > ufs_rw_balance) {
						read_needed = 1;
						break;
					}
				} else if (gap && sector_set) {
					gap = 0;
				}
			}
		}

		/*
		 * Inodes commonly make up the majority (~85%) of deltas.
		 * They cannot contain embedded user data, so it's safe to
		 * read and write them all in one IO.
		 * But for directory entries, shadow inode data, and
		 * quota record data the user data fragments can be embedded
		 * between those metadata, and so it's not safe to read,
		 * modify and then write the entire range, as asynchronous
		 * user data writes could get overwritten with old data.
		 * Thus we have to create a segment map of the metadata
		 * that needs to get written.
		 *
		 * If user data was logged then this issue would go away.
		 */
		if (read_needed) {
			for (i = first_sec + 1; i < last_sec; i++) {
				secmap |= (UINT16_C(1) << i);
			}
		}
	}
	rbp->rb_secmap = secmap;
	return (read_needed);
}

/*
 * Abort the load of a set of log map deltas.
 * ie,
 *	Clear out all mapentries on this unit's log map
 *	which have a tid (transaction id) equal to the
 *	parameter tid.  Walk the cancel list, taking everything
 *	off it, too.
 */
static void
logmap_abort(ml_unit_t *ul, uint32_t tid)
{
	struct mt_map	*mtm	= ul->un_logmap;	/* Log map */
	mapentry_t	*me, **mep;
	int		i;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * wait for any outstanding reads to finish; lock out future reads
	 */
	rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	mutex_enter(&mtm->mtm_mutex);
	/* Take everything off cancel list */
	while ((me = mtm->mtm_cancel) != NULL) {
		mtm->mtm_cancel = me->me_cancel;
		me->me_flags &= ~ME_CANCEL;
		me->me_cancel = NULL;
	}

	/*
	 * Now take out all mapentries with current tid, and committid,
	 * as this function is called from logmap_logscan and logmap_commit.
	 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
	 * But when logmap_abort is called from logmap_commit it is
	 * because the log errored when trying to write the commit record,
	 * after the async ops have been allowed to start in top_end_sync.
	 * So we also need to remove all mapentries from the transaction whose
	 * commit failed.
	 */
	for (i = 0; i < mtm->mtm_nhash; i++) {
		mep = &mtm->mtm_hash[i];
		while ((me = *mep) != NULL) {
			if (me->me_tid == tid ||
			    me->me_tid == mtm->mtm_committid) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				if (!(me->me_flags & ME_USER)) {
					mtm->mtm_nme--;
				}
				CRB_RELE(me);
				kmem_cache_free(mapentry_cache, me);
				continue;
			}
			mep = &me->me_hash;
		}
	}

	if (!(ul->un_flags & LDL_SCAN))
		mtm->mtm_flags |= MTM_CANCELED;
	mutex_exit(&mtm->mtm_mutex);
	mtm->mtm_dirty = 0;
	mtm->mtm_nmet = 0;
	rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
{
	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	while (!ldl_has_space(ul, me)) {
		ASSERT(!(ul->un_flags & LDL_NOROLL));
		mutex_exit(&ul->un_log_mutex);
		logmap_forceroll(mtm);
		mutex_enter(&ul->un_log_mutex);
		if (ul->un_flags & LDL_ERROR)
			break;
	}

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
}

/*
 * put a list of deltas into a logmap
 * If va == NULL, don't write to the log.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t vamof,			/* Offset on master of buf start */
	mapentry_t *melist)		/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not.
			 * In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Add the delta(s) into the log.
 * Create one cached roll buffer logmap entry, and reference count the
 * number of mapentries referring to it.
 * Cancel previous logmap entries.
 * logmap_add_buf is tolerant of failure to allocate a cached roll buffer.
 */
void
logmap_add_buf(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t bufmof,		/* Offset on master of buf start */
	mapentry_t *melist,		/* Entries to add */
	caddr_t buf,			/* Buffer containing delta(s) */
	uint32_t bufsz)			/* Size of buf */
{
	offset_t	mof;
	offset_t	vamof	= bufmof + (va - buf);
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;
	crb_t		*crbsav = NULL;

	ASSERT((bufsz & DEV_BMASK) == 0);
	mutex_enter(&ul->un_log_mutex);
	logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			crb = me->me_crb;
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					/*
					 * If this soon to be deleted mapentry
					 * has a suitable roll buffer then
					 * re-use it.
					 */
					if (crb && (--crb->c_refcnt == 0)) {
						if (crbsav ||
						    (crb->c_nb != bufsz)) {
							CRB_FREE(crb, me);
						} else {
							bcopy(buf, crb->c_buf,
							    bufsz);
							crb->c_invalid = 0;
							crb->c_mof = bufmof;
							crbsav = crb;
							me->me_crb = NULL;
						}
					}
					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}

			/*
			 * Inode deltas within the same fs block come
			 * in individually as separate calls to logmap_add().
			 * All others come in as one call. So check for an
			 * existing entry where we can re-use the crb.
			 */
			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
			    !crbsav && crb &&
			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
				ASSERT(crb->c_mof == bufmof);
				ASSERT(crb->c_nb == bufsz);
				bcopy(buf, crb->c_buf, bufsz);
				crbsav = crb;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * If we don't already have a crb then allocate one
		 * and copy the incoming buffer. Only do this once
		 * for all the incoming deltas.
		 */
		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
			/*
			 * Only use a cached roll buffer if we
			 * have enough memory, and check for failures.
			 * Allocation failure is tolerated; the delta is
			 * still written to the on-disk log below.
			 */
			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
			    (kmem_avail() > bufsz)) {
				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
			} else {
				ufs_crb_alloc_fails++;
			}
			if (crbsav) {
				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
				if (crbsav->c_buf) {
					atomic_add_64(&ufs_crb_size,
					    (uint64_t)bufsz);
					if (ufs_crb_size > ufs_crb_max_size) {
						ufs_crb_max_size = ufs_crb_size;
					}
					bcopy(buf, crbsav->c_buf, bufsz);
					crbsav->c_nb = bufsz;
					crbsav->c_refcnt = 0;
					crbsav->c_invalid = 0;
					ASSERT((bufmof & DEV_BMASK) == 0);
					crbsav->c_mof = bufmof;
				} else {
					kmem_free(crbsav, sizeof (crb_t));
					crbsav = NULL;
				}
			}
		}

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		me->me_crb = crbsav;
		if (crbsav) {
			crbsav->c_refcnt++;
		}
		crbsav = NULL;

		ASSERT(va);
		ldl_write(ul, va, vamof, me); /* add to on-disk log */
		if (ul->un_flags & LDL_ERROR) {
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * free up any cancelled deltas
 */
void
logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
{
	int		dolock	= 0;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	/*
	 * At EOT, cancel the indicated deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_flags & MTM_CANCELED) {
		mtm->mtm_flags &= ~MTM_CANCELED;
		ASSERT(dolock == 0);
		mutex_exit(&mtm->mtm_mutex);
		return;
	}

	while ((me = *cancelhead) != NULL) {
		/*
		 * roll forward or read collision; wait and try again
		 */
		if (me->me_flags & ME_AGE) {
			ASSERT(dolock == 0);
			mutex_exit(&mtm->mtm_mutex);
			dolock = 1;
			goto again;
		}
		/*
		 * remove from cancel list
		 */
		*cancelhead = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~(ME_CANCEL);

		/*
		 * logmap_remove_roll handles ME_ROLL entries later
		 * we leave them around for logmap_iscancel
		 * XXX is this necessary?
		 */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * remove from hash (if necessary)
		 */
		if (me->me_flags & ME_HASH) {
			mep = MAP_HASH(me->me_mof, mtm);
			while (*mep) {
				if (*mep == me) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					me->me_flags &= ~(ME_HASH);
					if (!(me->me_flags & ME_USER)) {
						mtm->mtm_nme--;
					}
					break;
				} else
					mep = &(*mep)->me_hash;
			}
		}
		/*
		 * put the entry on the free list
		 */
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	mutex_exit(&mtm->mtm_mutex);
	if (dolock)
		rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}


void
logmap_commit(ml_unit_t *ul, uint32_t tid)
{
	mapentry_t	me;
	mt_map_t	*mtm	= ul->un_logmap;


	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * async'ly write a commit rec into the log
	 */
	if (mtm->mtm_dirty) {
		/*
		 * put commit record into log
		 */
		me.me_mof = mtm->mtm_tid;
		me.me_dt = DT_COMMIT;
		me.me_nb = 0;
		me.me_hash = NULL;
		logmap_wait_space(mtm, ul, &me);
		ldl_write(ul, NULL, (offset_t)0, &me);
		ldl_round_commit(ul);

		/*
		 * abort on error; else reset dirty flag
		 */
		if (ul->un_flags & LDL_ERROR)
			logmap_abort(ul, tid);
		else {
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_cfrags = 0;
		}
		/* push commit */
		ldl_push_commit(ul);
	}
}

void
logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	uint32_t	tid;
	mapentry_t	*me;

	/*
	 * move the head forward so the log knows how full it is.
	 * Make sure to skip any mapentry whose me_lof is 0; these
	 * are just placeholders for DT_CANCELED freed user blocks
	 * for the current moby.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	me = mtm->mtm_next;
	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
		me = me->me_next;
	}

	if (me == (mapentry_t *)mtm)
		lof = -1;
	else {
		lof = me->me_lof;
		tid = me->me_tid;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_sethead(ul, lof, tid);
	if (lof == -1)
		mtm->mtm_age = 0;
	mutex_exit(&ul->un_log_mutex);
}

void
logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	size_t		nb;

	/*
	 * set the tail after the logmap_abort
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_prev == (mapentry_t *)mtm)
		lof = -1;
	else {
		/*
		 * set the tail to the end of the last commit
		 */
		lof = mtm->mtm_tail_lof;
		nb = mtm->mtm_tail_nb;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_settail(ul, lof, nb);
	mutex_exit(&ul->un_log_mutex);
}

/*
 * when resetting a device, roll the log until every
 * delta has been rolled forward
 */
void
logmap_roll_dev(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	*me;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;

again:
	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
		return;

	/*
	 * look for deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
		if (me->me_flags & ME_ROLL)
			break;
		if (me->me_tid == mtm->mtm_tid)
			continue;
		if (me->me_tid == mtm->mtm_committid)
			continue;
		break;
	}

	/*
	 * found a delta; kick the roll thread
	 * but only if the thread is running... (jmh)
	 */
	if (me != (mapentry_t *)mtm) {
		mutex_exit(&mtm->mtm_mutex);
		logmap_forceroll(mtm);
		goto again;
	}

	/*
	 * no more deltas, return
	 */
	mutex_exit(&mtm->mtm_mutex);
	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
{
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	int		frags;

	/*
	 * map has been referenced and is dirty
	 */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;

	/*
	 * get a mapentry
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));

	/*
	 * initialize cancel record and put in logmap
	 */
	me->me_mof = mof;
	me->me_nb = nb;
	me->me_dt = DT_CANCEL;
	me->me_tid = mtm->mtm_tid;
	me->me_hash = NULL;

	/*
	 * Write delta to log if this delta is for metadata. If this is not
	 * metadata it is user data and we are just putting a cancel
	 * mapentry into the hash to cancel a user block deletion
	 * in which we do not want the block to be allocated
	 * within this moby. This cancel entry will prevent the block from
	 * being allocated within the moby and prevent user data corruption
	 * if we happen to crash before this moby is committed.
	 */
	mutex_enter(&ul->un_log_mutex);
	if (metadata) {
		logmap_wait_space(mtm, ul, me);
		ldl_write(ul, NULL, (offset_t)0, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			mutex_exit(&ul->un_log_mutex);
			return;
		}
	}

	/*
	 * put in hash and on cancel list
	 */
	mep = MAP_HASH(mof, mtm);
	mutex_enter(&mtm->mtm_mutex);
	me->me_age = mtm->mtm_age++;
	me->me_hash = *mep;
	*mep = me;
	me->me_next = (mapentry_t *)mtm;
	me->me_prev = mtm->mtm_prev;
	mtm->mtm_prev->me_next = me;
	mtm->mtm_prev = me;
	me->me_cancel = mtm->mtm_cancel;
	mtm->mtm_cancel = me;
	if (metadata) {
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
	} else {
		me->me_flags = ME_USER;
	}
	me->me_flags |= (ME_HASH|ME_CANCEL);
	if (!(metadata)) {
		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
		if (frags)
			mtm->mtm_cfrags +=
			    numfrags(ul->un_ufsvfs->vfs_fs, frags);
	}
	mutex_exit(&mtm->mtm_mutex);

	mutex_exit(&ul->un_log_mutex);
}

/*
 * cancel entries in a logmap (entries are freed at EOT)
 */
void
logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Find overlapping metadata entries. Don't search through
		 * the hash chains if this is user data because it is only
		 * possible to have overlapping map entries for metadata,
		 * and the search can become expensive for large files.
		 */
		if (metadata) {
			mep = MAP_HASH(mof, mtm);
			mutex_enter(&mtm->mtm_mutex);
			for (me = *mep; me; me = me->me_hash) {
				if (!DATAoverlapME(mof, hnb, me))
					continue;

				ASSERT(MEwithinDATA(me, mof, hnb));

				if ((me->me_flags & ME_CANCEL) == 0) {
					me->me_cancel = mtm->mtm_cancel;
					mtm->mtm_cancel = me;
					me->me_flags |= ME_CANCEL;
					crb = me->me_crb;
					if (crb) {
						crb->c_invalid = 1;
					}
				}
			}
			mutex_exit(&mtm->mtm_mutex);
		}

		/*
		 * put a cancel record into the log
		 */
		logmap_cancel_delta(ul, mof, hnb, metadata);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * check for overlap w/cancel delta
 */
int
logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * search for dup entry
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (((me->me_flags & ME_ROLL) == 0) &&
			    (me->me_dt != DT_CANCEL))
				continue;
			if (DATAoverlapME(mof, hnb, me))
				break;
		}

		/*
		 * overlap detected
		 */
		if (me) {
			mutex_exit(&mtm->mtm_mutex);
			return (1);
		}
	}
	mutex_exit(&mtm->mtm_mutex);
	return (0);
}

static int
logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
{
	mapentry_t	*me;
	int		error;
	mt_map_t	*mtm	= ul->un_logmap;

	/*
	 * verify delta header; failure == mediafail
	 */
	error = 0;
	/* delta type */
	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
		error = EINVAL;
	if (dp->d_typ == DT_COMMIT) {
		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
			error = EINVAL;
	} else {
		/* length of delta */
		if ((dp->d_nb < INT32_C(0)) ||
		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
			error = EINVAL;

		/* offset on master device */
		if (dp->d_mof < INT64_C(0))
			error = EINVAL;
	}

	if (error) {
		ldl_seterror(ul, "Error processing ufs log data during scan");
		return (error);
	}

	/*
	 * process commit record
	 */
	if (dp->d_typ == DT_COMMIT) {
		if (mtm->mtm_dirty) {
			ASSERT(dp->d_nb == INT32_C(0));
			logmap_free_cancel(mtm, &mtm->mtm_cancel);
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_tid++;
			mtm->mtm_committid = mtm->mtm_tid;
			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
			    logmap_logscan_commit_debug(lof, mtm));
		}
		/*
		 * return #bytes to next sector (next delta header)
		 */
		*nbp = ldl_logscan_nbcommit(lof);
		mtm->mtm_tail_lof = lof;
		mtm->mtm_tail_nb = *nbp;
		return (0);
	}

	/*
	 * add delta to logmap
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));
	me->me_lof = lof;
	me->me_mof = dp->d_mof;
	me->me_nb = dp->d_nb;
	me->me_tid = mtm->mtm_tid;
	me->me_dt = dp->d_typ;
	me->me_hash = NULL;
	me->me_flags = (ME_LIST | ME_SCAN);
	logmap_add(ul, NULL, 0, me);
	switch (dp->d_typ) {
	case DT_CANCEL:
		me->me_flags |= ME_CANCEL;
		me->me_cancel = mtm->mtm_cancel;
		mtm->mtm_cancel = me;
		break;
	default:
		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
		    logmap_logscan_add_debug(dp, mtm));
		break;
	}

sizeofdelta:
	/*
	 * return #bytes till next delta header
	 */
	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
		*nbp = 0;
	else
		*nbp = dp->d_nb;
	return (0);
}

void
logmap_logscan(ml_unit_t *ul)
{
	size_t		nb, nbd;
	off_t		lof;
	struct delta	delta;
	mt_map_t	*logmap	= ul->un_logmap;

	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);

	/*
	 * prepare the log for a logscan
	 */
	ldl_logscan_begin(ul);

	/*
	 * prepare the logmap for a logscan
	 */
	(void) map_free_entries(logmap);
	logmap->mtm_tid = 0;
	logmap->mtm_committid = UINT32_C(0);
	logmap->mtm_age = 0;
	logmap->mtm_dirty = 0;
	logmap->mtm_ref = 0;

	/*
	 * while not at end of log
	 *	read delta header
	 *	add to logmap
	 *	seek to beginning of next delta
	 */
	lof = ul->un_head_lof;
	nbd = sizeof (delta);
	while (lof != ul->un_tail_lof) {

		/* read delta header */
		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
			break;

		/* add to logmap */
		if (logmap_logscan_add(ul, &delta, lof, &nb))
			break;

		/* seek to next header (skip data) */
		if (ldl_logscan_read(ul, &lof, nb, NULL))
			break;
	}

	/*
	 * remove the last partial transaction from the logmap
	 */
	logmap_abort(ul, logmap->mtm_tid);

	ldl_logscan_end(ul);
}

void
_init_map(void)
{
	/*
	 * Initialize the mapentry cache. No constructor or destructor
	 * is needed. Also no reclaim function is supplied as reclaiming
	 * current entries is not possible.
	 */
	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Special case when we replace an old map entry which carries quota
 * information with a newer entry which does not.
 * In that case the push function would not be called to clean up the
 * dquot structure. This would be found later by invalidatedq() causing
 * a panic when the filesystem is unmounted.
 * We clean up the dquot manually before replacing the map entry.
 */
void
handle_dquot(mapentry_t *me)
{
	int dolock = 0;
	int domutex = 0;
	struct dquot *dqp;

	dqp = (struct dquot *)me->me_arg;

	/*
	 * We need vfs_dqrwlock to call dqput()
	 */
	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
	if (dolock)
		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);

	domutex = (!MUTEX_HELD(&dqp->dq_lock));
	if (domutex)
		mutex_enter(&dqp->dq_lock);

	/*
	 * Only clean up if the dquot is referenced
	 */
	if (dqp->dq_cnt == 0) {
		if (domutex)
			mutex_exit(&dqp->dq_lock);
		if (dolock)
			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
		return;
	}

	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
	dqput(dqp);

	if (domutex)
		mutex_exit(&dqp->dq_lock);

	if (dolock)
		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);

}