/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/inttypes.h>
#include <sys/atomic.h>
#include <sys/tuneable.h>

/*
 * externs
 */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
extern int ufs_trans_push_quota();

/*
 * globals
 */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 */
long	logmap_maxnme_commit	= 2048;
long	logmap_maxnme_async	= 4096;
long	logmap_maxnme_sync	= 6144;
long	logmap_maxcfrag_commit	= 4;	/* Max canceled fragments per moby */


uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
void handle_dquot(mapentry_t *);

/*
 * GENERIC MAP ROUTINES
 */

#define	CRB_FREE(crb, me) \
	kmem_free(crb->c_buf, crb->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
	kmem_free(crb, sizeof (crb_t)); \
	(me)->me_crb = NULL;

#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me) \
	} \
}

/*
 * Check that the old delta has an argument and a push function of
 * ufs_trans_push_quota(), then check that the old and new deltas differ.
 * If so we clean up with handle_dquot() before replacing the old delta.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if ((me->me_arg) && \
	    (me->me_func == ufs_trans_push_quota)) { \
		if (!((me->me_dt == melist->me_dt) && \
		    (me->me_arg == melist->me_arg) && \
		    (me->me_func == melist->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}

/*
 * free up all the mapentries for a map
 */
void
map_free_entries(mt_map_t *mtm)
{
	int		i;
	mapentry_t	*me;

	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		me->me_next->me_prev = me->me_prev;
		me->me_prev->me_next = me->me_next;
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	for (i = 0; i < mtm->mtm_nhash; i++)
		mtm->mtm_hash[i] = NULL;
	mtm->mtm_nme = 0;
	mtm->mtm_nmet = 0;
}

/*
 * done with map; free if necessary
 */
mt_map_t *
map_put(mt_map_t *mtm)
{
	/*
	 * free up the map's memory
	 */
	map_free_entries(mtm);
	ASSERT(map_put_debug(mtm));
	kmem_free(mtm->mtm_hash,
	    (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
	mutex_destroy(&mtm->mtm_mutex);
	mutex_destroy(&mtm->mtm_scan_mutex);
	cv_destroy(&mtm->mtm_to_roll_cv);
	cv_destroy(&mtm->mtm_from_roll_cv);
	rw_destroy(&mtm->mtm_rwlock);
	mutex_destroy(&mtm->mtm_lock);
	cv_destroy(&mtm->mtm_cv_commit);
	cv_destroy(&mtm->mtm_cv_next);
	cv_destroy(&mtm->mtm_cv_eot);
	cv_destroy(&mtm->mtm_cv);
	kmem_free(mtm, sizeof (mt_map_t));
	return (NULL);
}
/*
 * Allocate a map;
 */
mt_map_t *
map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
{
	mt_map_t	*mtm;

	/*
	 * assume the map is not here and allocate the necessary structs
	 */
	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
	mtm->mtm_next = (mapentry_t *)mtm;
	mtm->mtm_prev = (mapentry_t *)mtm;
	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
	    KM_SLEEP);
	mtm->mtm_nhash = nh;
	mtm->mtm_debug = ul->un_debug;
	mtm->mtm_type = maptype;

	mtm->mtm_cfrags = 0;
	mtm->mtm_cfragmax = logmap_maxcfrag_commit;

	/*
	 * for scan test
	 */
	mtm->mtm_ul = ul;

	/*
	 * Initialize locks
	 */
	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
	ASSERT(map_get_debug(ul, mtm));

	return (mtm);
}

/*
 * DELTAMAP ROUTINES
 */
/*
 * deltamap tuning constants
 */
long	deltamap_maxnme = 1024;	/* global so it can be set */

int
deltamap_need_commit(mt_map_t *mtm)
{
	return (mtm->mtm_nme > deltamap_maxnme);
}

/*
 * put a delta into a deltamap; may sleep on memory
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
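		/*
		 * Walk the range one MAPBLOCKSIZE-aligned chunk at a
		 * time; hnb is the byte count handled on this pass.
		 */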
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we lose
		 * the reference to the dquot structure which will not be
		 * cleaned up by the push function me->me_func as this will
		 * never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
				if (me->me_func == ufs_trans_push_quota) {
					/*
					 * Don't remove quota entries which have
					 * incremented the ref count (those with a
					 * ufs_trans_push_quota push function).
					 * Let logmap_add[_buf] clean them up.
					 */
					continue;
				}
				break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
			    (!DATAoverlapME(mof, hnb, me)) ||
			    MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				tp->deltas_size += sizeof (struct delta);
			} else {
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * May need to drop & re-grab the mtm_mutex
		 * and then recheck for a duplicate
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		me->me_hash = *mep;
		*mep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * remove deltas within (mof, nb) and return as linked list
 */
mapentry_t *
deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	*mer;

	if (mtm == NULL)
		return (NULL);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);
	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove entries from hash and return as an aged linked list
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if (MEwithinDATA(me, mof, hnb)) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_hash = mer;
				mer = me;
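				/* pushed onto the aged list to be returned */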
				me->me_flags |= ME_LIST;
				me->me_flags &= ~ME_HASH;
				mtm->mtm_nme--;
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	return (mer);
}

/*
 * delete entries within (mof, nb)
 */
void
deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*me;
	mapentry_t	*menext;

	menext = deltamap_remove(mtm, mof, nb);
	while ((me = menext) != 0) {
		menext = me->me_hash;
		kmem_cache_free(mapentry_cache, me);
	}
}

/*
 * Call the indicated function to cause deltas to move to the logmap.
 * top_end_sync() is the only caller of this function and
 * it has waited for the completion of all threads, so there can
 * be no other activity in the deltamap. Therefore we don't need to
 * hold the deltamap lock.
 */
void
deltamap_push(ml_unit_t *ul)
{
	delta_t		dtyp;
	int		(*func)();
	ulong_t		arg;
	mapentry_t	*me;
	offset_t	mof;
	off_t		nb;
	mt_map_t	*mtm = ul->un_deltamap;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * for every entry in the deltamap
	 */
	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		ASSERT(me->me_func);
		func = me->me_func;
		dtyp = me->me_dt;
		arg = me->me_arg;
		mof = me->me_mof;
		nb = me->me_nb;
		if ((ul->un_flags & LDL_ERROR) ||
		    (*func)(ul->un_ufsvfs, dtyp, arg))
			deltamap_del(mtm, mof, nb);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * LOGMAP ROUTINES
 */

int
logmap_need_commit(mt_map_t *mtm)
{
	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
}

int
logmap_need_roll_async(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_async);
}

int
logmap_need_roll_sync(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_sync);
}

void
logmap_start_roll(ml_unit_t *ul)
{
	mt_map_t	*logmap = ul->un_logmap;

	logmap_settail(logmap, ul);
	ASSERT(!(ul->un_flags & LDL_NOROLL));
	mutex_enter(&logmap->mtm_mutex);
	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
		logmap->mtm_flags |= MTM_ROLL_RUNNING;
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
		    TS_RUN, minclsyspri);
	}
	mutex_exit(&logmap->mtm_mutex);
}

void
logmap_kill_roll(ml_unit_t *ul)
{
	mt_map_t	*mtm = ul->un_logmap;

	if (mtm == NULL)
		return;

	mutex_enter(&mtm->mtm_mutex);

	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
		mtm->mtm_flags |= MTM_ROLL_EXIT;
		cv_signal(&mtm->mtm_to_roll_cv);
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	}
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * kick the roll thread if it's not doing anything
 */
void
logmap_forceroll_nowait(mt_map_t *logmap)
{
	/*
	 * Don't need to lock mtm_mutex to read mtm_flags here as we
	 * don't care in the rare case when we get a transitional value
	 * of mtm_flags. Just by signalling the thread it will wake up
	 * and notice it has too many logmap entries.
	 */
	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
		cv_signal(&logmap->mtm_to_roll_cv);
	}
}

/*
 * kick the roll thread and wait for it to finish a cycle
 */
void
logmap_forceroll(mt_map_t *mtm)
{
	mutex_enter(&mtm->mtm_mutex);
	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
		mtm->mtm_flags |= MTM_FORCE_ROLL;
		cv_signal(&mtm->mtm_to_roll_cv);
	}
	do {
		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
			goto out;
		}
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
out:
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * remove rolled deltas within (mof, nb) and free them
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof = mof;
	off_t		savnb = nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}

/*
 * Find the disk offset of the next delta to roll.
 * Returns 0: no more deltas to roll or a transaction is being committed
 *	   1: a delta to roll has been found and *mofp points
 *	      to the master file disk offset
 */
int
logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
{
	mapentry_t *me;

	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));

	mutex_enter(&logmap->mtm_mutex);
	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
	    me = me->me_next) {
		/* already rolled */
		if (me->me_flags & ME_ROLL) {
			continue;
		}

		/* part of currently busy transaction; stop */
		if (me->me_tid == logmap->mtm_tid) {
			break;
		}

		/* part of commit-in-progress transaction; stop */
		if (me->me_tid == logmap->mtm_committid) {
			break;
		}

		/*
		 * We shouldn't see a DT_CANCEL mapentry whose
		 * tid != mtm_committid, or != mtm_tid since
		 * these are removed at the end of each committed
		 * transaction.
		 */
		ASSERT(!(me->me_dt == DT_CANCEL));

		*mofp = me->me_mof;
		mutex_exit(&logmap->mtm_mutex);
		return (1);
	}
	mutex_exit(&logmap->mtm_mutex);
	return (0);
}

/*
 * put mapentry on sorted age list
 */
static void
logmap_list_age(mapentry_t **age, mapentry_t *meadd)
{
	mapentry_t	*me;

	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));

	for (me = *age; me; age = &me->me_agenext, me = *age) {
		if (me->me_age > meadd->me_age)
			break;
	}
	meadd->me_agenext = me;
	meadd->me_flags |= ME_AGE;
	*age = meadd;
}

/*
 * get a list of deltas within <mof, mof+nb>
 *	returns with mtm_rwlock held
 *	return value says whether the entire mof range is covered by deltas
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype = RW_READER;
	offset_t	savmof = mof;
	off_t		savnb = nb;
	int		entire = 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}

/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
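 * Returns 0 on success; 1 on failure (a map entry is in use by a reader).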
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit in the list,
		 * upgrade the lock, and try again
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
	    logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}

void
logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
}

void
logmap_list_put(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
	rw_exit(&mtm->mtm_rwlock);
}

#define	UFS_RW_BALANCE 2
int ufs_rw_balance = UFS_RW_BALANCE;

/*
 * Check if we need to read the master.
 * The master does not need to be read if the log deltas to the
 * block are for one contiguous set of full disk sectors.
 * Cylinder group bit maps DT_CG (8K), directory entries (512B),
 * and possibly others should not require master disk reads.
 * Calculate the sector map for writing later.
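 * Returns non-zero if the master block must be read before rolling.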
 */
int
logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
{
	offset_t mof;
	crb_t *crb;
	mapentry_t *me;
	int32_t nb;
	int i;
	int start_sec, end_sec;
	int read_needed = 0;
	int all_inodes = 1;
	int first_sec = INT_MAX;
	int last_sec = -1;
	rbsecmap_t secmap = 0;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));

	for (me = age; me; me = me->me_agenext) {
		crb = me->me_crb;
		if (crb) {
			nb = crb->c_nb;
			mof = crb->c_mof;
		} else {
			nb = me->me_nb;
			mof = me->me_mof;
		}

		/*
		 * If the delta is not sector aligned then
		 * read the whole block.
		 */
		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
			read_needed = 1;
		}

		/* Set sector map used in the MAPBLOCKSIZE block. */
		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
		for (i = start_sec; i <= end_sec; i++) {
			secmap |= UINT16_C(1) << i;
		}

		if (me->me_dt != DT_INODE) {
			all_inodes = 0;
		}
		if (start_sec < first_sec) {
			first_sec = start_sec;
		}
		if (end_sec > last_sec) {
			last_sec = end_sec;
		}
	}

	ASSERT(secmap);
	ASSERT(first_sec != INT_MAX);
	ASSERT(last_sec != -1);

	if (all_inodes) {
		/*
		 * Here we have a tradeoff choice. It must be better to
		 * do 2 writes in the same MAPBLOCKSIZE chunk, than a
		 * read and a write. But what about 3 or more writes, versus
		 * a read+write? Where is the cut over? It will depend on
		 * the track caching, scsi driver and other activity.
		 * An unpublished tunable is defined (ufs_rw_balance) that
		 * currently defaults to 2.
		 */
		if (!read_needed) {
			int count = 0, gap = 0;
			int sector_set; /* write needed to this sector */

			/* Count the gaps (every 1 to 0 transition) */
			for (i = first_sec + 1; i < last_sec; i++) {
				sector_set = secmap & (UINT16_C(1) << i);
				if (!gap && !sector_set) {
					gap = 1;
					count++;
					if (count > ufs_rw_balance) {
						read_needed = 1;
						break;
					}
				} else if (gap && sector_set) {
					gap = 0;
				}
			}
		}

		/*
		 * Inodes commonly make up the majority (~85%) of deltas.
		 * They cannot contain embedded user data, so it's safe to
		 * read and write them all in one IO.
		 * But for directory entries, shadow inode data, and
		 * quota record data the user data fragments can be embedded
		 * between those metadata, and so it's not safe to read,
		 * modify then write the entire range as asynchronous user
		 * data writes could get overwritten with old data.
		 * Thus we have to create a segment map of metadata that
		 * needs to get written.
		 *
		 * If user data was logged then this issue would go away.
		 */
		if (read_needed) {
			for (i = first_sec + 1; i < last_sec; i++) {
				secmap |= (UINT16_C(1) << i);
			}
		}
	}
	rbp->rb_secmap = secmap;
	return (read_needed);
}

/*
 * Abort the load of a set of log map deltas.
 * ie,
 *	Clear out all mapentries on this unit's log map
 *	which have a tid (transaction id) equal to the
 *	parameter tid. Walk the cancel list, taking everything
 *	off it, too.
 */
static void
logmap_abort(ml_unit_t *ul, uint32_t tid)
{
	struct mt_map	*mtm = ul->un_logmap;	/* Log map */
	mapentry_t	*me,
			**mep;
	int		i;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * wait for any outstanding reads to finish; lock out future reads
	 */
	rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	mutex_enter(&mtm->mtm_mutex);
	/* Take everything off cancel list */
	while ((me = mtm->mtm_cancel) != NULL) {
		mtm->mtm_cancel = me->me_cancel;
		me->me_flags &= ~ME_CANCEL;
		me->me_cancel = NULL;
	}

	/*
	 * Now take out all mapentries with current tid, and committid,
	 * as this function is called from logmap_logscan and logmap_commit.
	 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
	 * But when logmap_abort is called from logmap_commit it is
	 * because the log errored when trying to write the commit record,
	 * after the async ops have been allowed to start in top_end_sync.
	 * So we also need to remove all mapentries from the transaction whose
	 * commit failed.
	 */
	for (i = 0; i < mtm->mtm_nhash; i++) {
		mep = &mtm->mtm_hash[i];
		while ((me = *mep) != NULL) {
			if (me->me_tid == tid ||
			    me->me_tid == mtm->mtm_committid) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				if (!(me->me_flags & ME_USER)) {
					mtm->mtm_nme--;
				}
				CRB_RELE(me);
				kmem_cache_free(mapentry_cache, me);
				continue;
			}
			mep = &me->me_hash;
		}
	}

	if (!(ul->un_flags & LDL_SCAN))
		mtm->mtm_flags |= MTM_CANCELED;
	mutex_exit(&mtm->mtm_mutex);
	mtm->mtm_dirty = 0;
	mtm->mtm_nmet = 0;
	rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
{
	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	while (!ldl_has_space(ul, me)) {
		ASSERT(!(ul->un_flags & LDL_NOROLL));
		mutex_exit(&ul->un_log_mutex);
		logmap_forceroll(mtm);
		mutex_enter(&ul->un_log_mutex);
		if (ul->un_flags & LDL_ERROR)
			break;
	}

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
}

/*
 * put a list of deltas into a logmap
 * If va == NULL, don't write to the log.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,		/* Ptr to buf w/deltas & data */
	offset_t vamof,		/* Offset on master of buf start */
	mapentry_t *melist)	/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm = ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Add the delta(s) into the log.
 * Create one cached roll buffer logmap entry, and reference count the
 * number of mapentries referring to it.
 * Cancel previous logmap entries.
 * logmap_add is tolerant of failure to allocate a cached roll buffer.
 */
void
logmap_add_buf(
	ml_unit_t *ul,
	char *va,		/* Ptr to buf w/deltas & data */
	offset_t bufmof,	/* Offset on master of buf start */
	mapentry_t *melist,	/* Entries to add */
	caddr_t buf,		/* Buffer containing delta(s) */
	uint32_t bufsz)		/* Size of buf */
{
	offset_t	mof;
	offset_t	vamof = bufmof + (va - buf);
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm = ul->un_logmap;
	crb_t		*crb;
	crb_t		*crbsav = NULL;

	ASSERT((bufsz & DEV_BMASK) == 0);
	mutex_enter(&ul->un_log_mutex);
	logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			crb = me->me_crb;
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					/*
					 * If this soon to be deleted mapentry
					 * has a suitable roll buffer then
					 * re-use it.
					 */
					if (crb && (--crb->c_refcnt == 0)) {
						if (crbsav ||
						    (crb->c_nb != bufsz)) {
							CRB_FREE(crb, me);
						} else {
							bcopy(buf, crb->c_buf,
							    bufsz);
							crb->c_invalid = 0;
							crb->c_mof = bufmof;
							crbsav = crb;
							me->me_crb = NULL;
						}
					}
					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}

			/*
			 * Inode deltas within the same fs block come
			 * in individually as separate calls to logmap_add().
			 * All others come in as one call. So check for an
			 * existing entry where we can re-use the crb.
			 */
			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
			    !crbsav && crb &&
			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
				ASSERT(crb->c_mof == bufmof);
				ASSERT(crb->c_nb == bufsz);
				bcopy(buf, crb->c_buf, bufsz);
				crbsav = crb;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * If we don't already have a crb then allocate one
		 * and copy the incoming buffer. Only do this once
		 * for all the incoming deltas.
		 */
		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
			/*
			 * Only use a cached roll buffer if we
			 * have enough memory, and check for failures.
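			 * The allocation is gated by ufs_crb_limit and
			 * kmem_avail(); failures are counted in
			 * ufs_crb_alloc_fails.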
			 */
			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
			    (kmem_avail() > bufsz)) {
				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
			} else {
				ufs_crb_alloc_fails++;
			}
			if (crbsav) {
				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
				if (crbsav->c_buf) {
					atomic_add_64(&ufs_crb_size,
					    (uint64_t)bufsz);
					if (ufs_crb_size > ufs_crb_max_size) {
						ufs_crb_max_size = ufs_crb_size;
					}
					bcopy(buf, crbsav->c_buf, bufsz);
					crbsav->c_nb = bufsz;
					crbsav->c_refcnt = 0;
					crbsav->c_invalid = 0;
					ASSERT((bufmof & DEV_BMASK) == 0);
					crbsav->c_mof = bufmof;
				} else {
					kmem_free(crbsav, sizeof (crb_t));
					crbsav = NULL;
				}
			}
		}

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		me->me_crb = crbsav;
		if (crbsav) {
			crbsav->c_refcnt++;
		}
		crbsav = NULL;

		ASSERT(va);
		ldl_write(ul, va, vamof, me); /* add to on-disk log */
		if (ul->un_flags & LDL_ERROR) {
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * free up any cancelled deltas
 */
void
logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
{
	int		dolock = 0;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	/*
	 * At EOT, cancel the indicated deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_flags & MTM_CANCELED) {
		mtm->mtm_flags &= ~MTM_CANCELED;
		ASSERT(dolock == 0);
		mutex_exit(&mtm->mtm_mutex);
		return;
	}

	while ((me = *cancelhead) != NULL) {
		/*
		 * roll forward or read collision; wait and try again
		 */
		if (me->me_flags & ME_AGE) {
			ASSERT(dolock == 0);
			mutex_exit(&mtm->mtm_mutex);
			dolock = 1;
			goto again;
		}
		/*
		 * remove from cancel list
		 */
		*cancelhead = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~(ME_CANCEL);

		/*
		 * logmap_remove_roll handles ME_ROLL entries later
		 *	we leave them around for logmap_iscancel
		 *	XXX is this necessary?
		 */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * remove from hash (if necessary)
		 */
		if (me->me_flags & ME_HASH) {
			mep = MAP_HASH(me->me_mof, mtm);
			while (*mep) {
				if (*mep == me) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					me->me_flags &= ~(ME_HASH);
					if (!(me->me_flags & ME_USER)) {
						mtm->mtm_nme--;
					}
					break;
				} else
					mep = &(*mep)->me_hash;
			}
		}
		/*
		 * put the entry on the free list
		 */
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	mutex_exit(&mtm->mtm_mutex);
	if (dolock)
		rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}


void
logmap_commit(ml_unit_t *ul, uint32_t tid)
{
	mapentry_t	me;
	mt_map_t	*mtm = ul->un_logmap;


	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * async'ly write a commit rec into the log
	 */
	if (mtm->mtm_dirty) {
		/*
		 * put commit record into log
		 */
		me.me_mof = mtm->mtm_tid;
		me.me_dt = DT_COMMIT;
		me.me_nb = 0;
		me.me_hash = NULL;
		logmap_wait_space(mtm, ul, &me);
		ldl_write(ul, NULL, (offset_t)0, &me);
		ldl_round_commit(ul);

		/*
		 * abort on error; else reset dirty flag
		 */
		if (ul->un_flags & LDL_ERROR)
			logmap_abort(ul, tid);
		else {
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_cfrags = 0;
		}
		/* push commit */
		ldl_push_commit(ul);
	}
}

void
logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	uint32_t	tid;
	mapentry_t	*me;

	/*
	 * move the head forward so the log knows how full it is.
	 * Make sure to skip any mapentry whose me_lof is 0; these
	 * are just placeholders for DT_CANCELED freed user blocks
	 * for the current moby.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	me = mtm->mtm_next;
	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
		me = me->me_next;
	}

	if (me == (mapentry_t *)mtm)
		lof = -1;
	else {
		lof = me->me_lof;
		tid = me->me_tid;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_sethead(ul, lof, tid);
	if (lof == -1)
		mtm->mtm_age = 0;
	mutex_exit(&ul->un_log_mutex);
}

void
logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	size_t		nb;

	/*
	 * set the tail after the logmap_abort
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_prev == (mapentry_t *)mtm)
		lof = -1;
	else {
		/*
		 * set the tail to the end of the last commit
		 */
		lof = mtm->mtm_tail_lof;
		nb = mtm->mtm_tail_nb;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_settail(ul, lof, nb);
	mutex_exit(&ul->un_log_mutex);
}

/*
 * when resetting a device, roll the log until every
 * delta has been rolled forward
 */
void
logmap_roll_dev(ml_unit_t *ul)
{
	mt_map_t	*mtm = ul->un_logmap;
	mapentry_t	*me;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;

again:
	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
		return;

	/*
	 * look for deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
		if (me->me_flags & ME_ROLL)
			break;
		if (me->me_tid == mtm->mtm_tid)
			continue;
		if (me->me_tid == mtm->mtm_committid)
			continue;
		break;
	}

	/*
	 * found a delta; kick the roll thread
	 * but only if the thread is running... (jmh)
	 */
	if (me != (mapentry_t *)mtm) {
		mutex_exit(&mtm->mtm_mutex);
		logmap_forceroll(mtm);
		goto again;
	}

	/*
	 * no more deltas, return
	 */
	mutex_exit(&mtm->mtm_mutex);
	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
{
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm = ul->un_logmap;
	int		frags;

	/*
	 * map has been referenced and is dirty
	 */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;

	/*
	 * get a mapentry
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));

	/*
	 * initialize cancel record and put in logmap
	 */
	me->me_mof = mof;
	me->me_nb = nb;
	me->me_dt = DT_CANCEL;
	me->me_tid = mtm->mtm_tid;
	me->me_hash = NULL;

	/*
	 * Write delta to log if this delta is for metadata. If this is not
	 * metadata it is user data and we are just putting a cancel
	 * mapentry into the hash to cancel a user block deletion,
	 * in which case we do not want the block to be allocated
	 * within this moby. This cancel entry will prevent the block from
	 * being allocated within the moby and prevent user data corruption
	 * if we happen to crash before this moby is committed.
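	 * Cancel entries for user data are flagged ME_USER and do not
	 * count toward mtm_nme.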
	 */
	mutex_enter(&ul->un_log_mutex);
	if (metadata) {
		logmap_wait_space(mtm, ul, me);
		ldl_write(ul, NULL, (offset_t)0, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			mutex_exit(&ul->un_log_mutex);
			return;
		}
	}

	/*
	 * put in hash and on cancel list
	 */
	mep = MAP_HASH(mof, mtm);
	mutex_enter(&mtm->mtm_mutex);
	me->me_age = mtm->mtm_age++;
	me->me_hash = *mep;
	*mep = me;
	me->me_next = (mapentry_t *)mtm;
	me->me_prev = mtm->mtm_prev;
	mtm->mtm_prev->me_next = me;
	mtm->mtm_prev = me;
	me->me_cancel = mtm->mtm_cancel;
	mtm->mtm_cancel = me;
	if (metadata) {
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
	} else {
		me->me_flags = ME_USER;
	}
	me->me_flags |= (ME_HASH|ME_CANCEL);
	if (!(metadata)) {
		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
		if (frags)
			mtm->mtm_cfrags += numfrags(ul->un_ufsvfs->vfs_fs,
			    frags);
	}
	mutex_exit(&mtm->mtm_mutex);

	mutex_exit(&ul->un_log_mutex);
}

/*
 * cancel entries in a logmap (entries are freed at EOT)
 */
void
logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm = ul->un_logmap;
	crb_t		*crb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		for (me = *mep; me; me = me->me_hash) {
			if (!DATAoverlapME(mof, hnb, me))
				continue;

			ASSERT(MEwithinDATA(me, mof, hnb));

			if ((me->me_flags & ME_CANCEL) == 0) {
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
				crb = me->me_crb;
				if (crb) {
					crb->c_invalid = 1;
				}
			}
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * put a cancel record into the log
		 */
		logmap_cancel_delta(ul, mof, hnb, metadata);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * check for overlap w/cancel delta
 */
int
logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * search for dup entry
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (((me->me_flags & ME_ROLL) == 0) &&
			    (me->me_dt != DT_CANCEL))
				continue;
			if (DATAoverlapME(mof, hnb, me))
				break;
		}

		/*
		 * overlap detected
		 */
		if (me) {
			mutex_exit(&mtm->mtm_mutex);
			return (1);
		}
	}
	mutex_exit(&mtm->mtm_mutex);
	return (0);
}

static int
logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
{
	mapentry_t	*me;
	int		error;
	mt_map_t	*mtm = ul->un_logmap;

	/*
	 * verify delta header; failure == mediafail
	 */
	error = 0;
	/* delta type */
	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
		error = EINVAL;
	if (dp->d_typ == DT_COMMIT) {
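		/* a commit record must carry a d_nb of 0 or -1 */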
		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
			error = EINVAL;
	} else {
		/* length of delta */
		if ((dp->d_nb < INT32_C(0)) ||
		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
			error = EINVAL;

		/* offset on master device */
		if (dp->d_mof < INT64_C(0))
			error = EINVAL;
	}

	if (error) {
		ldl_seterror(ul, "Error processing ufs log data during scan");
		return (error);
	}

	/*
	 * process commit record
	 */
	if (dp->d_typ == DT_COMMIT) {
		if (mtm->mtm_dirty) {
			ASSERT(dp->d_nb == INT32_C(0));
			logmap_free_cancel(mtm, &mtm->mtm_cancel);
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_tid++;
			mtm->mtm_committid = mtm->mtm_tid;
			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
			    logmap_logscan_commit_debug(lof, mtm));
		}
		/*
		 * return #bytes to next sector (next delta header)
		 */
		*nbp = ldl_logscan_nbcommit(lof);
		mtm->mtm_tail_lof = lof;
		mtm->mtm_tail_nb = *nbp;
		return (0);
	}

	/*
	 * add delta to logmap
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));
	me->me_lof = lof;
	me->me_mof = dp->d_mof;
	me->me_nb = dp->d_nb;
	me->me_tid = mtm->mtm_tid;
	me->me_dt = dp->d_typ;
	me->me_hash = NULL;
	me->me_flags = (ME_LIST | ME_SCAN);
	logmap_add(ul, NULL, 0, me);
	switch (dp->d_typ) {
	case DT_CANCEL:
		me->me_flags |= ME_CANCEL;
		me->me_cancel = mtm->mtm_cancel;
		mtm->mtm_cancel = me;
		break;
	default:
		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
		    logmap_logscan_add_debug(dp, mtm));
		break;
	}

sizeofdelta:
	/*
	 * return #bytes till next delta header
	 */
	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
		*nbp = 0;
	else
		*nbp = dp->d_nb;
	return (0);
}

void
logmap_logscan(ml_unit_t *ul)
{
	size_t		nb, nbd;
	off_t		lof;
	struct delta	delta;
	mt_map_t	*logmap = ul->un_logmap;

	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);

	/*
	 * prepare the log for a logscan
	 */
	ldl_logscan_begin(ul);

	/*
	 * prepare the logmap for a logscan
	 */
	(void) map_free_entries(logmap);
	logmap->mtm_tid = 0;
	logmap->mtm_committid = UINT32_C(0);
	logmap->mtm_age = 0;
	logmap->mtm_dirty = 0;
	logmap->mtm_ref = 0;

	/*
	 * while not at end of log
	 *	read delta header
	 *	add to logmap
	 *	seek to beginning of next delta
	 */
	lof = ul->un_head_lof;
	nbd = sizeof (delta);
	while (lof != ul->un_tail_lof) {

		/* read delta header */
		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
			break;

		/* add to logmap */
		if (logmap_logscan_add(ul, &delta, lof, &nb))
			break;

		/* seek to next header (skip data) */
		if (ldl_logscan_read(ul, &lof, nb, NULL))
			break;
	}

	/*
	 * remove the last partial transaction from the logmap
	 */
	logmap_abort(ul, logmap->mtm_tid);

	ldl_logscan_end(ul);
}

void
_init_map(void)
{
	/*
	 * Initialise the mapentry cache. No constructor or destructor
	 * is needed. Also no reclaim function is supplied as reclaiming
	 * current entries is not possible.
	 */
	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Special case when we replace an old map entry which carries quota
 * information with a newer entry which does not.
 * In that case the push function would not be called to clean up the
 * dquot structure. This would be found later by invalidatedq() causing
 * a panic when the filesystem is unmounted.
 * We clean up the dquot manually before replacing the map entry.
 */
void
handle_dquot(mapentry_t *me)
{
	int dolock = 0;
	int domutex = 0;
	struct dquot *dqp;

	dqp = (struct dquot *)me->me_arg;

	/*
	 * We need vfs_dqrwlock to call dqput()
	 */
	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
	if (dolock)
		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);

	domutex = (!MUTEX_HELD(&dqp->dq_lock));
	if (domutex)
		mutex_enter(&dqp->dq_lock);

	/*
	 * Only clean up if the dquot is referenced
	 */
	if (dqp->dq_cnt == 0) {
		if (domutex)
			mutex_exit(&dqp->dq_lock);
		if (dolock)
			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
		return;
	}

	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
	dqput(dqp);

	if (domutex)
		mutex_exit(&dqp->dq_lock);

	if (dolock)
		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);

}