/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Milan Jurik. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/inttypes.h>
#include <sys/atomic.h>
#include <sys/tuneable.h>

/*
 * externs
 */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
extern int ufs_trans_push_quota(ufsvfs_t *, delta_t, struct dquot *);

/*
 * globals
 */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 */
long logmap_maxnme_commit = 2048;
long logmap_maxnme_async = 4096;
long logmap_maxnme_sync = 6144;
long logmap_maxcfrag_commit = 4;	/* Max canceled fragments per moby */


uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
void handle_dquot(mapentry_t *);

/*
 * GENERIC MAP ROUTINES
 */

#define	CRB_FREE(crb, me) \
	kmem_free(crb->c_buf, crb->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
	kmem_free(crb, sizeof (crb_t)); \
	(me)->me_crb = NULL;

#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me) \
	} \
}

/*
 * Check that the old delta has an argument and a push function of
 * ufs_trans_push_quota(), then check that the old and new deltas differ.
 * If so we clean up with handle_dquot() before replacing the old delta.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if ((me->me_arg) && \
	    (me->me_func == ufs_trans_push_quota)) { \
		if (!((me->me_dt == melist->me_dt) && \
		    (me->me_arg == melist->me_arg) && \
		    (me->me_func == melist->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}

/*
 * free up all the mapentries for a map
 */
void
map_free_entries(mt_map_t *mtm)
{
	int		i;
	mapentry_t	*me;

	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		me->me_next->me_prev = me->me_prev;
		me->me_prev->me_next = me->me_next;
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	for (i = 0; i < mtm->mtm_nhash; i++)
		mtm->mtm_hash[i] = NULL;
	mtm->mtm_nme = 0;
	mtm->mtm_nmet = 0;
}

/*
 * done with map; free if necessary
 */
mt_map_t *
map_put(mt_map_t *mtm)
{
	/*
	 * free up the map's memory
	 */
	map_free_entries(mtm);
	ASSERT(map_put_debug(mtm));
	kmem_free(mtm->mtm_hash,
	    (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
	mutex_destroy(&mtm->mtm_mutex);
	mutex_destroy(&mtm->mtm_scan_mutex);
	cv_destroy(&mtm->mtm_to_roll_cv);
	cv_destroy(&mtm->mtm_from_roll_cv);
	rw_destroy(&mtm->mtm_rwlock);
	mutex_destroy(&mtm->mtm_lock);
	cv_destroy(&mtm->mtm_cv_commit);
	cv_destroy(&mtm->mtm_cv_next);
	cv_destroy(&mtm->mtm_cv_eot);
	cv_destroy(&mtm->mtm_cv);
	kmem_free(mtm, sizeof (mt_map_t));
	return (NULL);
}
/*
 * Allocate a map;
 */
mt_map_t *
map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
{
	mt_map_t	*mtm;

	/*
	 * assume the map is not here and allocate the necessary structs
	 */
	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
	mtm->mtm_next = (mapentry_t *)mtm;
	mtm->mtm_prev = (mapentry_t *)mtm;
	mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
	    KM_SLEEP);
	mtm->mtm_nhash = nh;
	mtm->mtm_debug = ul->un_debug;
	mtm->mtm_type = maptype;

	mtm->mtm_cfrags = 0;
	mtm->mtm_cfragmax = logmap_maxcfrag_commit;

	/*
	 * for scan test
	 */
	mtm->mtm_ul = ul;

	/*
	 * Initialize locks
	 */
	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
	ASSERT(map_get_debug(ul, mtm));

	return (mtm);
}

/*
 * DELTAMAP ROUTINES
 */
/*
 * deltamap tuning constants
 */
long	deltamap_maxnme = 1024;	/* global so it can be set */

int
deltamap_need_commit(mt_map_t *mtm)
{
	return (mtm->mtm_nme > deltamap_maxnme);
}

/*
 * put a delta into a deltamap; may sleep on memory
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we lose
		 * the reference to the dquot structure which will not be
		 * cleaned up by the push function me->me_func as this will
		 * never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
				/*
				 * Don't remove quota entries which have
				 * incremented the ref count (those with a
				 * ufs_trans_push_quota push function).
				 * Let logmap_add[_buf] clean them up.
				 */
				if (me->me_func == ufs_trans_push_quota) {
					continue;
				}
				break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
			    (!DATAoverlapME(mof, hnb, me)) ||
			    MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				tp->deltas_size += sizeof (struct delta);
			} else {
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * May need to drop & re-grab the mtm_mutex
		 * and then recheck for a duplicate
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		me->me_hash = *mep;
		*mep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * remove deltas within (mof, nb) and return as linked list
 */
mapentry_t *
deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	*mer;

	if (mtm == NULL)
		return (NULL);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);
	for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove entries from hash and return as an aged linked list
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if (MEwithinDATA(me, mof, hnb)) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_hash = mer;
				mer = me;
				me->me_flags |= ME_LIST;
				me->me_flags &= ~ME_HASH;
				mtm->mtm_nme--;
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	return (mer);
}

/*
 * delete entries within (mof, nb)
 */
void
deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*me;
	mapentry_t	*menext;

	menext = deltamap_remove(mtm, mof, nb);
	while ((me = menext) != 0) {
		menext = me->me_hash;
		kmem_cache_free(mapentry_cache, me);
	}
}

/*
 * Call the indicated function to cause deltas to move to the logmap.
 * top_end_sync() is the only caller of this function and
 * it has waited for the completion of all threads, so there can
 * be no other activity in the deltamap. Therefore we don't need to
 * hold the deltamap lock.
 */
void
deltamap_push(ml_unit_t *ul)
{
	delta_t		dtyp;
	int		(*func)();
	ulong_t		arg;
	mapentry_t	*me;
	offset_t	mof;
	off_t		nb;
	mt_map_t	*mtm	= ul->un_deltamap;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * for every entry in the deltamap
	 */
	while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
		ASSERT(me->me_func);
		func = me->me_func;
		dtyp = me->me_dt;
		arg = me->me_arg;
		mof = me->me_mof;
		nb = me->me_nb;
		if ((ul->un_flags & LDL_ERROR) ||
		    (*func)(ul->un_ufsvfs, dtyp, arg))
			deltamap_del(mtm, mof, nb);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * LOGMAP ROUTINES
 */

int
logmap_need_commit(mt_map_t *mtm)
{
	return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
	    (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
}

int
logmap_need_roll_async(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_async);
}

int
logmap_need_roll_sync(mt_map_t *mtm)
{
	return (mtm->mtm_nme > logmap_maxnme_sync);
}

/*
 * start the roll thread for this log if it is not already running
 */
void
logmap_start_roll(ml_unit_t *ul)
{
	mt_map_t	*logmap	= ul->un_logmap;

	logmap_settail(logmap, ul);
	ASSERT(!(ul->un_flags & LDL_NOROLL));
	mutex_enter(&logmap->mtm_mutex);
	if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
		logmap->mtm_flags |= MTM_ROLL_RUNNING;
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
		(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
		    TS_RUN, minclsyspri);
	}
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * ask the roll thread to exit and wait until it has done so
 */
void
logmap_kill_roll(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;

	if (mtm == NULL)
		return;

	mutex_enter(&mtm->mtm_mutex);

	while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
		mtm->mtm_flags |= MTM_ROLL_EXIT;
		cv_signal(&mtm->mtm_to_roll_cv);
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	}
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * kick the roll thread if it's not doing anything
 */
void
logmap_forceroll_nowait(mt_map_t *logmap)
{
	/*
	 * Don't need to lock mtm_mutex to read mtm_flags here as we
	 * don't care in the rare case when we get a transitional value
	 * of mtm_flags. Just by signalling the thread it will wake up
	 * and notice it has too many logmap entries.
	 */
	ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
	if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
		cv_signal(&logmap->mtm_to_roll_cv);
	}
}

/*
 * kick the roll thread and wait for it to finish a cycle
 */
void
logmap_forceroll(mt_map_t *mtm)
{
	mutex_enter(&mtm->mtm_mutex);
	if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
		mtm->mtm_flags |= MTM_FORCE_ROLL;
		cv_signal(&mtm->mtm_to_roll_cv);
	}
	do {
		if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
			goto out;
		}
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
	} while (mtm->mtm_flags & MTM_FORCE_ROLL);
out:
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * remove rolled deltas within (mof, nb) and free them
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}

/*
 * Find the disk offset of the next delta to roll.
 * Returns 0: no more deltas to roll or a transaction is being committed
 *	   1: a delta to roll has been found and *mofp points
 *	      to the master file disk offset
 */
int
logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
{
	mapentry_t *me;

	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));

	mutex_enter(&logmap->mtm_mutex);
	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
	    me = me->me_next) {
		/* already rolled */
		if (me->me_flags & ME_ROLL) {
			continue;
		}

		/* part of currently busy transaction; stop */
		if (me->me_tid == logmap->mtm_tid) {
			break;
		}

		/* part of commit-in-progress transaction; stop */
		if (me->me_tid == logmap->mtm_committid) {
			break;
		}

		/*
		 * We shouldn't see a DT_CANCEL mapentry whose
		 * tid != mtm_committid, or != mtm_tid since
		 * these are removed at the end of each committed
		 * transaction.
		 */
		ASSERT(!(me->me_dt == DT_CANCEL));

		*mofp = me->me_mof;
		mutex_exit(&logmap->mtm_mutex);
		return (1);
	}
	mutex_exit(&logmap->mtm_mutex);
	return (0);
}

/*
 * put mapentry on sorted age list
 */
static void
logmap_list_age(mapentry_t **age, mapentry_t *meadd)
{
	mapentry_t	*me;

	ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));

	for (me = *age; me; age = &me->me_agenext, me = *age) {
		if (me->me_age > meadd->me_age)
			break;
	}
	meadd->me_agenext = me;
	meadd->me_flags |= ME_AGE;
	*age = meadd;
}

/*
 * get a list of deltas within <mof, mof+nb>
 * returns with mtm_rwlock held
 * return value says whether the entire mof range is covered by deltas
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype	= RW_READER;
	offset_t	savmof	= mof;
	off_t		savnb	= nb;
	int		entire	= 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}

/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit in the list,
		 * upgrade the lock, and try again
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
	    logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}

void
logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
}

void
logmap_list_put(mt_map_t *mtm, mapentry_t *age)
{
	mapentry_t	*me;

	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	mutex_enter(&mtm->mtm_mutex);
	for (me = age; me; me = age) {
		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
	rw_exit(&mtm->mtm_rwlock);
}

#define	UFS_RW_BALANCE 2
int ufs_rw_balance = UFS_RW_BALANCE;

/*
 * Check if we need to read the master.
 * The master does not need to be read if the log deltas to the
 * block are for one contiguous set of full disk sectors.
 * Both cylinder group bit maps DT_CG (8K); directory entries (512B);
 * and possibly others should not require master disk reads.
 * Calculate the sector map for writing later.
 */
int
logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
{
	offset_t mof;
	crb_t *crb;
	mapentry_t *me;
	int32_t nb;
	int i;
	int start_sec, end_sec;
	int read_needed = 0;
	int all_inodes = 1;
	int first_sec = INT_MAX;
	int last_sec = -1;
	rbsecmap_t secmap = 0;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));

	for (me = age; me; me = me->me_agenext) {
		crb = me->me_crb;
		if (crb) {
			nb = crb->c_nb;
			mof = crb->c_mof;
		} else {
			nb = me->me_nb;
			mof = me->me_mof;
		}

		/*
		 * If the delta is not sector aligned then
		 * read the whole block.
		 */
		if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
			read_needed = 1;
		}

		/* Set sector map used in the MAPBLOCKSIZE block. */
		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
		for (i = start_sec; i <= end_sec; i++) {
			secmap |= UINT16_C(1) << i;
		}

		if (me->me_dt != DT_INODE) {
			all_inodes = 0;
		}
		if (start_sec < first_sec) {
			first_sec = start_sec;
		}
		if (end_sec > last_sec) {
			last_sec = end_sec;
		}
	}

	ASSERT(secmap);
	ASSERT(first_sec != INT_MAX);
	ASSERT(last_sec != -1);

	if (all_inodes) {
		/*
		 * Here we have a tradeoff choice. It must be better to
		 * do 2 writes in the same MAPBLOCKSIZE chunk than a
		 * read and a write. But what about 3 or more writes, versus
		 * a read+write? Where is the cut over? It will depend on
		 * the track caching, scsi driver and other activity.
		 * An unpublished tunable is defined (ufs_rw_balance) that
		 * currently defaults to 2.
		 */
		if (!read_needed) {
			int count = 0, gap = 0;
			int sector_set; /* write needed to this sector */

			/* Count the gaps (every 1 to 0 transition) */
			for (i = first_sec + 1; i < last_sec; i++) {
				sector_set = secmap & (UINT16_C(1) << i);
				if (!gap && !sector_set) {
					gap = 1;
					count++;
					if (count > ufs_rw_balance) {
						read_needed = 1;
						break;
					}
				} else if (gap && sector_set) {
					gap = 0;
				}
			}
		}

		/*
		 * Inodes commonly make up the majority (~85%) of deltas.
		 * They cannot contain embedded user data, so it's safe to
		 * read and write them all in one IO.
		 * But for directory entries, shadow inode data, and
		 * quota record data the user data fragments can be embedded
		 * between those metadata, and so it's not safe to read,
		 * modify and then write the entire range, as asynchronous
		 * user data writes could get overwritten with old data.
		 * Thus we have to create a segment map of the metadata that
		 * needs to get written.
		 *
		 * If user data was logged then this issue would go away.
		 */
		if (read_needed) {
			for (i = first_sec + 1; i < last_sec; i++) {
				secmap |= (UINT16_C(1) << i);
			}
		}
	}
	rbp->rb_secmap = secmap;
	return (read_needed);
}

/*
 * Abort the load of a set of log map deltas.
 * i.e.,
 * Clear out all mapentries on this unit's log map
 * which have a tid (transaction id) equal to the
 * parameter tid. Walk the cancel list, taking everything
 * off it, too.
 */
static void
logmap_abort(ml_unit_t *ul, uint32_t tid)
{
	struct mt_map	*mtm	= ul->un_logmap;	/* Log map */
	mapentry_t	*me, **mep;
	int		i;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	/*
	 * wait for any outstanding reads to finish; lock out future reads
	 */
	rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	mutex_enter(&mtm->mtm_mutex);
	/* Take everything off cancel list */
	while ((me = mtm->mtm_cancel) != NULL) {
		mtm->mtm_cancel = me->me_cancel;
		me->me_flags &= ~ME_CANCEL;
		me->me_cancel = NULL;
	}

	/*
	 * Now take out all mapentries with current tid, and committid,
	 * as this function is called from logmap_logscan and logmap_commit.
	 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
	 * But when logmap_abort is called from logmap_commit it is
	 * because the log errored when trying to write the commit record,
	 * after the async ops have been allowed to start in top_end_sync.
	 * So we also need to remove all mapentries from the transaction whose
	 * commit failed.
	 */
	for (i = 0; i < mtm->mtm_nhash; i++) {
		mep = &mtm->mtm_hash[i];
		while ((me = *mep) != NULL) {
			if (me->me_tid == tid ||
			    me->me_tid == mtm->mtm_committid) {
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				if (!(me->me_flags & ME_USER)) {
					mtm->mtm_nme--;
				}
				CRB_RELE(me);
				kmem_cache_free(mapentry_cache, me);
				continue;
			}
			mep = &me->me_hash;
		}
	}

	if (!(ul->un_flags & LDL_SCAN))
		mtm->mtm_flags |= MTM_CANCELED;
	mutex_exit(&mtm->mtm_mutex);
	mtm->mtm_dirty = 0;
	mtm->mtm_nmet = 0;
	rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
{
	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	while (!ldl_has_space(ul, me)) {
		ASSERT(!(ul->un_flags & LDL_NOROLL));
		mutex_exit(&ul->un_log_mutex);
		logmap_forceroll(mtm);
		mutex_enter(&ul->un_log_mutex);
		if (ul->un_flags & LDL_ERROR)
			break;
	}

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
}

/*
 * put a list of deltas into a logmap
 * If va == NULL, don't write to the log.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t vamof,			/* Offset on master of buf start */
	mapentry_t *melist)		/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not.
			 * In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
		    ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Add the delta(s) into the log.
 * Create one cached roll buffer logmap entry, and reference count the
 * number of mapentries referring to it.
 * Cancel previous logmap entries.
 * logmap_add is tolerant of failure to allocate a cached roll buffer.
 */
void
logmap_add_buf(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t bufmof,		/* Offset on master of buf start */
	mapentry_t *melist,		/* Entries to add */
	caddr_t buf,			/* Buffer containing delta(s) */
	uint32_t bufsz)			/* Size of buf */
{
	offset_t	mof;
	offset_t	vamof	= bufmof + (va - buf);
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;
	crb_t		*crbsav = NULL;

	ASSERT((bufsz & DEV_BMASK) == 0);
	mutex_enter(&ul->un_log_mutex);
	logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb = melist->me_nb;

		/*
		 * search for overlapping entries
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			crb = me->me_crb;
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					/*
					 * If this soon to be deleted mapentry
					 * has a suitable roll buffer then
					 * re-use it.
					 */
					if (crb && (--crb->c_refcnt == 0)) {
						if (crbsav ||
						    (crb->c_nb != bufsz)) {
							CRB_FREE(crb, me);
						} else {
							bcopy(buf, crb->c_buf,
							    bufsz);
							crb->c_invalid = 0;
							crb->c_mof = bufmof;
							crbsav = crb;
							me->me_crb = NULL;
						}
					}
					kmem_cache_free(mapentry_cache, me);
					continue;
				}
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}

			/*
			 * Inode deltas within the same fs block come
			 * in individually as separate calls to logmap_add().
			 * All others come in as one call. So check for an
			 * existing entry where we can re-use the crb.
			 */
			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
			    !crbsav && crb &&
			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
				ASSERT(crb->c_mof == bufmof);
				ASSERT(crb->c_nb == bufsz);
				bcopy(buf, crb->c_buf, bufsz);
				crbsav = crb;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * If we don't already have a crb then allocate one
		 * and copy the incoming buffer. Only do this once
		 * for all the incoming deltas.
		 */
		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
			/*
			 * Only use a cached roll buffer if we
			 * have enough memory, and check for failures.
			 */
			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
			    (kmem_avail() > bufsz)) {
				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
			} else {
				ufs_crb_alloc_fails++;
			}
			if (crbsav) {
				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
				if (crbsav->c_buf) {
					atomic_add_64(&ufs_crb_size,
					    (uint64_t)bufsz);
					if (ufs_crb_size > ufs_crb_max_size) {
						ufs_crb_max_size = ufs_crb_size;
					}
					bcopy(buf, crbsav->c_buf, bufsz);
					crbsav->c_nb = bufsz;
					crbsav->c_refcnt = 0;
					crbsav->c_invalid = 0;
					ASSERT((bufmof & DEV_BMASK) == 0);
					crbsav->c_mof = bufmof;
				} else {
					kmem_free(crbsav, sizeof (crb_t));
					crbsav = NULL;
				}
			}
		}

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		me->me_crb = crbsav;
		if (crbsav) {
			crbsav->c_refcnt++;
		}
		crbsav = NULL;

		ASSERT(va);
		ldl_write(ul, va, vamof, me); /* add to on-disk log */
		if (ul->un_flags & LDL_ERROR) {
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
		    map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * free up any cancelled deltas
 */
void
logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
{
	int		dolock	= 0;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	/*
	 * At EOT, cancel the indicated deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_flags & MTM_CANCELED) {
		mtm->mtm_flags &= ~MTM_CANCELED;
		ASSERT(dolock == 0);
		mutex_exit(&mtm->mtm_mutex);
		return;
	}

	while ((me = *cancelhead) != NULL) {
		/*
		 * roll forward or read collision; wait and try again
		 */
		if (me->me_flags & ME_AGE) {
			ASSERT(dolock == 0);
			mutex_exit(&mtm->mtm_mutex);
			dolock = 1;
			goto again;
		}
		/*
		 * remove from cancel list
		 */
		*cancelhead = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~(ME_CANCEL);

		/*
		 * logmap_remove_roll handles ME_ROLL entries later
		 * we leave them around for logmap_iscancel
		 * XXX is this necessary?
		 */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * remove from hash (if necessary)
		 */
		if (me->me_flags & ME_HASH) {
			mep = MAP_HASH(me->me_mof, mtm);
			while (*mep) {
				if (*mep == me) {
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					me->me_flags &= ~(ME_HASH);
					if (!(me->me_flags & ME_USER)) {
						mtm->mtm_nme--;
					}
					break;
				} else
					mep = &(*mep)->me_hash;
			}
		}
		/*
		 * put the entry on the free list
		 */
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	mutex_exit(&mtm->mtm_mutex);
	if (dolock)
		rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}


void
logmap_commit(ml_unit_t *ul, uint32_t tid)
{
	mapentry_t	me;
	mt_map_t	*mtm	= ul->un_logmap;


	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * async'ly write a commit rec into the log
	 */
	if (mtm->mtm_dirty) {
		/*
		 * put commit record into log
		 */
		me.me_mof = mtm->mtm_tid;
		me.me_dt = DT_COMMIT;
		me.me_nb = 0;
		me.me_hash = NULL;
		logmap_wait_space(mtm, ul, &me);
		ldl_write(ul, NULL, (offset_t)0, &me);
		ldl_round_commit(ul);

		/*
		 * abort on error; else reset dirty flag
		 */
		if (ul->un_flags & LDL_ERROR)
			logmap_abort(ul, tid);
		else {
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_cfrags = 0;
		}
		/* push commit */
		ldl_push_commit(ul);
	}
}

void
logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	uint32_t	tid;
	mapentry_t	*me;

	/*
	 * move the head forward so the log knows how full it is
	 * Make sure to skip any mapentry whose me_lof is 0, these
	 * are just place holders for DT_CANCELED freed user blocks
	 * for the current moby.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	me = mtm->mtm_next;
	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
		me = me->me_next;
	}

	if (me == (mapentry_t *)mtm)
		lof = -1;
	else {
		lof = me->me_lof;
		tid = me->me_tid;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_sethead(ul, lof, tid);
	if (lof == -1)
		mtm->mtm_age = 0;
	mutex_exit(&ul->un_log_mutex);
}

void
logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	size_t		nb;

	/*
	 * set the tail after the logmap_abort
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_prev == (mapentry_t *)mtm)
		lof = -1;
	else {
		/*
		 * set the tail to the end of the last commit
		 */
		lof = mtm->mtm_tail_lof;
		nb = mtm->mtm_tail_nb;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_settail(ul, lof, nb);
	mutex_exit(&ul->un_log_mutex);
}

/*
 * when resetting a device; roll the log until every
 * delta has been rolled forward
 */
void
logmap_roll_dev(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	*me;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;

again:
	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
	if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
		return;

	/*
	 * look for deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
		if (me->me_flags & ME_ROLL)
			break;
		if (me->me_tid == mtm->mtm_tid)
			continue;
		if (me->me_tid == mtm->mtm_committid)
			continue;
		break;
	}

	/*
	 * found a delta; kick the roll thread
	 * but only if the thread is running... (jmh)
	 */
	if (me != (mapentry_t *)mtm) {
		mutex_exit(&mtm->mtm_mutex);
		logmap_forceroll(mtm);
		goto again;
	}

	/*
	 * no more deltas, return
	 */
	mutex_exit(&mtm->mtm_mutex);
	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

static void
logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
{
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	int		frags;

	/*
	 * map has been referenced and is dirty
	 */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;

	/*
	 * get a mapentry
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));

	/*
	 * initialize cancel record and put in logmap
	 */
	me->me_mof = mof;
	me->me_nb = nb;
	me->me_dt = DT_CANCEL;
	me->me_tid = mtm->mtm_tid;
	me->me_hash = NULL;

	/*
	 * Write delta to log if this delta is for metadata. If this is not
	 * metadata, it is user data and we are just putting a cancel
	 * mapentry into the hash to cancel a user block deletion, in which
	 * case we do not want the block to be allocated
	 * within this moby. This cancel entry will prevent the block from
	 * being allocated within the moby and prevent user data corruption
	 * if we happen to crash before this moby is committed.
	 */
	mutex_enter(&ul->un_log_mutex);
	if (metadata) {
		logmap_wait_space(mtm, ul, me);
		ldl_write(ul, NULL, (offset_t)0, me);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			mutex_exit(&ul->un_log_mutex);
			return;
		}
	}

	/*
	 * put in hash and on cancel list
	 */
	mep = MAP_HASH(mof, mtm);
	mutex_enter(&mtm->mtm_mutex);
	me->me_age = mtm->mtm_age++;
	me->me_hash = *mep;
	*mep = me;
	me->me_next = (mapentry_t *)mtm;
	me->me_prev = mtm->mtm_prev;
	mtm->mtm_prev->me_next = me;
	mtm->mtm_prev = me;
	me->me_cancel = mtm->mtm_cancel;
	mtm->mtm_cancel = me;
	if (metadata) {
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
	} else {
		me->me_flags = ME_USER;
	}
	me->me_flags |= (ME_HASH|ME_CANCEL);
	if (!(metadata)) {
		frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
		if (frags)
			mtm->mtm_cfrags +=
			    numfrags(ul->un_ufsvfs->vfs_fs, frags);
	}
	mutex_exit(&mtm->mtm_mutex);

	mutex_exit(&ul->un_log_mutex);
}

/*
 * cancel entries in a logmap (entries are freed at EOT)
 */
void
logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Find overlapping metadata entries. Don't search through
		 * the hash chains if this is user data because it is only
		 * possible to have overlapping map entries for metadata,
		 * and the search can become expensive for large files.
		 */
		if (metadata) {
			mep = MAP_HASH(mof, mtm);
			mutex_enter(&mtm->mtm_mutex);
			for (me = *mep; me; me = me->me_hash) {
				if (!DATAoverlapME(mof, hnb, me))
					continue;

				ASSERT(MEwithinDATA(me, mof, hnb));

				if ((me->me_flags & ME_CANCEL) == 0) {
					me->me_cancel = mtm->mtm_cancel;
					mtm->mtm_cancel = me;
					me->me_flags |= ME_CANCEL;
					crb = me->me_crb;
					if (crb) {
						crb->c_invalid = 1;
					}
				}
			}
			mutex_exit(&mtm->mtm_mutex);
		}

		/*
		 * put a cancel record into the log
		 */
		logmap_cancel_delta(ul, mof, hnb, metadata);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
	    map_check_linkage(mtm));
}

/*
 * check for overlap w/cancel delta
 */
int
logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * search for dup entry
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (((me->me_flags & ME_ROLL) == 0) &&
			    (me->me_dt != DT_CANCEL))
				continue;
			if (DATAoverlapME(mof, hnb, me))
				break;
		}

		/*
		 * overlap detected
		 */
		if (me) {
			mutex_exit(&mtm->mtm_mutex);
			return (1);
		}
	}
	mutex_exit(&mtm->mtm_mutex);
	return (0);
}

static int
logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
{
	mapentry_t	*me;
	int		error;
	mt_map_t	*mtm	= ul->un_logmap;

	/*
	 * verify delta header; failure == mediafail
	 */
	error = 0;
	/* delta type */
	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
		error = EINVAL;
	if (dp->d_typ == DT_COMMIT) {
		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
			error = EINVAL;
	} else {
		/* length of delta */
		if ((dp->d_nb < INT32_C(0)) ||
		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
			error = EINVAL;

		/* offset on master device */
		if (dp->d_mof < INT64_C(0))
			error = EINVAL;
	}

	if (error) {
		ldl_seterror(ul, "Error processing ufs log data during scan");
		return (error);
	}

	/*
	 * process commit record
	 */
	if (dp->d_typ == DT_COMMIT) {
		if (mtm->mtm_dirty) {
			ASSERT(dp->d_nb == INT32_C(0));
			logmap_free_cancel(mtm, &mtm->mtm_cancel);
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			mtm->mtm_tid++;
			mtm->mtm_committid = mtm->mtm_tid;
			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
			    logmap_logscan_commit_debug(lof, mtm));
		}
		/*
		 * return #bytes to next sector (next delta header)
		 */
		*nbp = ldl_logscan_nbcommit(lof);
		mtm->mtm_tail_lof = lof;
		mtm->mtm_tail_nb = *nbp;
		return (0);
	}

	/*
	 * add delta to logmap
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));
	me->me_lof = lof;
	me->me_mof = dp->d_mof;
	me->me_nb = dp->d_nb;
	me->me_tid = mtm->mtm_tid;
	me->me_dt = dp->d_typ;
	me->me_hash = NULL;
	me->me_flags = (ME_LIST | ME_SCAN);
	logmap_add(ul, NULL, 0, me);
	switch (dp->d_typ) {
	case DT_CANCEL:
		me->me_flags |= ME_CANCEL;
		me->me_cancel = mtm->mtm_cancel;
		mtm->mtm_cancel = me;
		break;
	default:
		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
		    logmap_logscan_add_debug(dp, mtm));
		break;
	}

	/*
	 * return #bytes till next delta header
	 */
	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
		*nbp = 0;
	else
		*nbp = dp->d_nb;
	return (0);
}

void
logmap_logscan(ml_unit_t *ul)
{
	size_t		nb, nbd;
	off_t		lof;
	struct delta	delta;
	mt_map_t	*logmap	= ul->un_logmap;

	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);

	/*
	 * prepare the log for a logscan
	 */
	ldl_logscan_begin(ul);

	/*
	 * prepare the logmap for a logscan
	 */
	(void) map_free_entries(logmap);
	logmap->mtm_tid = 0;
	logmap->mtm_committid = UINT32_C(0);
	logmap->mtm_age = 0;
	logmap->mtm_dirty = 0;
	logmap->mtm_ref = 0;

	/*
	 * while not at end of log
	 *	read delta header
	 *	add to logmap
	 *	seek to beginning of next delta
	 */
	lof = ul->un_head_lof;
	nbd = sizeof (delta);
	while (lof != ul->un_tail_lof) {

		/* read delta header */
		if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
			break;

		/* add to logmap */
		if (logmap_logscan_add(ul, &delta, lof, &nb))
			break;

		/* seek to next header (skip data) */
		if (ldl_logscan_read(ul, &lof, nb, NULL))
			break;
	}

	/*
	 * remove the last partial transaction from the logmap
	 */
	logmap_abort(ul, logmap->mtm_tid);

	ldl_logscan_end(ul);
}

void
_init_map(void)
{
	/*
	 * Initialise the mapentry cache. No constructor or destructor
	 * is needed. Also no reclaim function is supplied as reclaiming
	 * current entries is not possible.
	 */
	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Special case when we replace an old map entry which carries quota
 * information with a newer entry which does not.
 * In that case the push function would not be called to clean up the
 * dquot structure. This would be found later by invalidatedq() causing
 * a panic when the filesystem is unmounted.
 * We clean up the dquot manually before replacing the map entry.
 */
void
handle_dquot(mapentry_t *me)
{
	int dolock = 0;
	int domutex = 0;
	struct dquot *dqp;

	dqp = (struct dquot *)me->me_arg;

	/*
	 * We need vfs_dqrwlock to call dqput()
	 */
	dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
	if (dolock)
		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);

	domutex = (!MUTEX_HELD(&dqp->dq_lock));
	if (domutex)
		mutex_enter(&dqp->dq_lock);

	/*
	 * Only clean up if the dquot is referenced
	 */
	if (dqp->dq_cnt == 0) {
		if (domutex)
			mutex_exit(&dqp->dq_lock);
		if (dolock)
			rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
		return;
	}

	dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
	dqput(dqp);

	if (domutex)
		mutex_exit(&dqp->dq_lock);

	if (dolock)
		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);

}