1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 29 /* 30 * This file contains the top half of the zfs directory structure 31 * implementation. The bottom half is in zap_leaf.c. 32 * 33 * The zdir is an extendable hash data structure. There is a table of 34 * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are 35 * each a constant size and hold a variable number of directory entries. 36 * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. 37 * 38 * The pointer table holds a power of 2 number of pointers. 39 * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to 40 * by the pointer at index i in the table holds entries whose hash value 41 * has a zd_prefix_len - bit prefix 42 */ 43 44 #include <sys/spa.h> 45 #include <sys/dmu.h> 46 #include <sys/zfs_context.h> 47 #include <sys/zap.h> 48 #include <sys/refcount.h> 49 #include <sys/zap_impl.h> 50 #include <sys/zap_leaf.h> 51 52 int fzap_default_block_shift = 14; /* 16k blocksize */ 53 54 static void zap_leaf_pageout(dmu_buf_t *db, void *vl); 55 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); 56 57 58 void 59 fzap_byteswap(void *vbuf, size_t size) 60 { 61 uint64_t block_type; 62 63 block_type = *(uint64_t *)vbuf; 64 65 switch (block_type) { 66 case ZBT_LEAF: 67 case BSWAP_64(ZBT_LEAF): 68 zap_leaf_byteswap(vbuf, size); 69 return; 70 case ZBT_HEADER: 71 case BSWAP_64(ZBT_HEADER): 72 default: 73 /* it's a ptrtbl block */ 74 byteswap_uint64_array(vbuf, size); 75 return; 76 } 77 } 78 79 void 80 fzap_upgrade(zap_t *zap, dmu_tx_t *tx) 81 { 82 dmu_buf_t *db; 83 zap_leaf_t *l; 84 int i; 85 zap_phys_t *zp; 86 87 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 88 zap->zap_ismicro = FALSE; 89 90 (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, 91 &zap->zap_f.zap_phys, zap_pageout); 92 93 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); 94 zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; 95 96 zp = zap->zap_f.zap_phys; 97 /* 98 * explicitly zero it since it might be coming from an 99 * initialized microzap 100 */ 101 bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); 102 zp->zap_block_type = ZBT_HEADER; 103 zp->zap_magic = ZAP_MAGIC; 104 105 zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); 106 107 zp->zap_freeblk = 2; /* block 1 will be the first leaf */ 108 zp->zap_num_leafs = 1; 109 zp->zap_num_entries = 0; 110 zp->zap_salt = zap->zap_salt; 111 112 /* block 1 will be the first leaf */ 113 for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) 114 ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; 115 116 /* 117 * set up block 1 - the first leaf 118 */ 119 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, 120 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); 121 dmu_buf_will_dirty(db, tx); 122 123 l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); 124 l->l_dbuf = db; 125 l->l_phys = db->db_data; 126 127 zap_leaf_init(l); 128 129 kmem_free(l, sizeof (zap_leaf_t)); 130 dmu_buf_rele(db, FTAG); 131 } 132 133 static int 134 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) 135 { 136 if (RW_WRITE_HELD(&zap->zap_rwlock)) 137 return (1); 138 if (rw_tryupgrade(&zap->zap_rwlock)) { 139 dmu_buf_will_dirty(zap->zap_dbuf, tx); 140 return (1); 141 } 142 return (0); 143 } 144 145 /* 146 * Generic routines for dealing with the pointer & cookie tables. 147 */ 148 149 static int 150 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, 151 void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), 152 dmu_tx_t *tx) 153 { 154 uint64_t b, newblk; 155 dmu_buf_t *db_old, *db_new; 156 int err; 157 int bs = FZAP_BLOCK_SHIFT(zap); 158 int hepb = 1<<(bs-4); 159 /* hepb = half the number of entries in a block */ 160 161 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 162 ASSERT(tbl->zt_blk != 0); 163 ASSERT(tbl->zt_numblks > 0); 164 165 if (tbl->zt_nextblk != 0) { 166 newblk = tbl->zt_nextblk; 167 } else { 168 newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); 169 tbl->zt_nextblk = newblk; 170 ASSERT3U(tbl->zt_blks_copied, ==, 0); 171 dmu_prefetch(zap->zap_objset, zap->zap_object, 172 tbl->zt_blk << bs, tbl->zt_numblks << bs); 173 } 174 175 /* 176 * Copy the ptrtbl from the old to new location. 177 */ 178 179 b = tbl->zt_blks_copied; 180 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 181 (tbl->zt_blk + b) << bs, FTAG, &db_old); 182 if (err) 183 return (err); 184 185 /* first half of entries in old[b] go to new[2*b+0] */ 186 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, 187 (newblk + 2*b+0) << bs, FTAG, &db_new)); 188 dmu_buf_will_dirty(db_new, tx); 189 transfer_func(db_old->db_data, db_new->db_data, hepb); 190 dmu_buf_rele(db_new, FTAG); 191 192 /* second half of entries in old[b] go to new[2*b+1] */ 193 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, 194 (newblk + 2*b+1) << bs, FTAG, &db_new)); 195 dmu_buf_will_dirty(db_new, tx); 196 transfer_func((uint64_t *)db_old->db_data + hepb, 197 db_new->db_data, hepb); 198 dmu_buf_rele(db_new, FTAG); 199 200 dmu_buf_rele(db_old, FTAG); 201 202 tbl->zt_blks_copied++; 203 204 dprintf("copied block %llu of %llu\n", 205 tbl->zt_blks_copied, tbl->zt_numblks); 206 207 if (tbl->zt_blks_copied == tbl->zt_numblks) { 208 (void) dmu_free_range(zap->zap_objset, zap->zap_object, 209 tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); 210 211 tbl->zt_blk = newblk; 212 tbl->zt_numblks *= 2; 213 tbl->zt_shift++; 214 tbl->zt_nextblk = 0; 215 tbl->zt_blks_copied = 0; 216 217 dprintf("finished; numblocks now %llu (%lluk entries)\n", 218 tbl->zt_numblks, 1<<(tbl->zt_shift-10)); 219 } 220 221 return (0); 222 } 223 224 static int 225 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, 226 dmu_tx_t *tx) 227 { 228 int err; 229 uint64_t blk, off; 230 int bs = FZAP_BLOCK_SHIFT(zap); 231 dmu_buf_t *db; 232 233 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 234 ASSERT(tbl->zt_blk != 0); 235 236 dprintf("storing %llx at index %llx\n", val, idx); 237 238 blk = idx >> (bs-3); 239 off = idx & ((1<<(bs-3))-1); 240 241 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 242 (tbl->zt_blk + blk) << bs, FTAG, &db); 243 if (err) 244 return (err); 245 dmu_buf_will_dirty(db, tx); 246 247 if (tbl->zt_nextblk != 0) { 248 uint64_t idx2 = idx * 2; 249 uint64_t blk2 = idx2 >> (bs-3); 250 uint64_t off2 = idx2 & ((1<<(bs-3))-1); 251 dmu_buf_t *db2; 252 253 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 254 (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); 255 if (err) { 256 dmu_buf_rele(db, FTAG); 257 return (err); 258 } 259 dmu_buf_will_dirty(db2, tx); 260 ((uint64_t *)db2->db_data)[off2] = val; 261 ((uint64_t *)db2->db_data)[off2+1] = val; 262 dmu_buf_rele(db2, FTAG); 263 } 264 265 ((uint64_t *)db->db_data)[off] = val; 266 dmu_buf_rele(db, FTAG); 267 268 return (0); 269 } 270 271 static int 272 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) 273 { 274 uint64_t blk, off; 275 int err; 276 dmu_buf_t *db; 277 int bs = FZAP_BLOCK_SHIFT(zap); 278 279 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 280 281 blk = idx >> (bs-3); 282 off = idx & ((1<<(bs-3))-1); 283 284 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 285 (tbl->zt_blk + blk) << bs, FTAG, &db); 286 if (err) 287 return (err); 288 *valp = ((uint64_t *)db->db_data)[off]; 289 dmu_buf_rele(db, FTAG); 290 291 if (tbl->zt_nextblk != 0) { 292 /* 293 * read the nextblk for the sake of i/o error checking, 294 * so that zap_table_load() will catch errors for 295 * zap_table_store. 296 */ 297 blk = (idx*2) >> (bs-3); 298 299 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 300 (tbl->zt_nextblk + blk) << bs, FTAG, &db); 301 dmu_buf_rele(db, FTAG); 302 } 303 return (err); 304 } 305 306 /* 307 * Routines for growing the ptrtbl. 308 */ 309 310 static void 311 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) 312 { 313 int i; 314 for (i = 0; i < n; i++) { 315 uint64_t lb = src[i]; 316 dst[2*i+0] = lb; 317 dst[2*i+1] = lb; 318 } 319 } 320 321 static int 322 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) 323 { 324 /* In case things go horribly wrong. */ 325 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) 326 return (ENOSPC); 327 328 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { 329 /* 330 * We are outgrowing the "embedded" ptrtbl (the one 331 * stored in the header block). Give it its own entire 332 * block, which will double the size of the ptrtbl. 333 */ 334 uint64_t newblk; 335 dmu_buf_t *db_new; 336 int err; 337 338 ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, 339 ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); 340 ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); 341 342 newblk = zap_allocate_blocks(zap, 1); 343 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 344 newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); 345 if (err) 346 return (err); 347 dmu_buf_will_dirty(db_new, tx); 348 zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 349 db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); 350 dmu_buf_rele(db_new, FTAG); 351 352 zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; 353 zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; 354 zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; 355 356 ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, 357 zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << 358 (FZAP_BLOCK_SHIFT(zap)-3)); 359 360 return (0); 361 } else { 362 return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, 363 zap_ptrtbl_transfer, tx)); 364 } 365 } 366 367 static void 368 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) 369 { 370 dmu_buf_will_dirty(zap->zap_dbuf, tx); 371 mutex_enter(&zap->zap_f.zap_num_entries_mtx); 372 ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); 373 zap->zap_f.zap_phys->zap_num_entries += delta; 374 mutex_exit(&zap->zap_f.zap_num_entries_mtx); 375 } 376 377 static uint64_t 378 zap_allocate_blocks(zap_t *zap, int nblocks) 379 { 380 uint64_t newblk; 381 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 382 newblk = zap->zap_f.zap_phys->zap_freeblk; 383 zap->zap_f.zap_phys->zap_freeblk += nblocks; 384 return (newblk); 385 } 386 387 static zap_leaf_t * 388 zap_create_leaf(zap_t *zap, dmu_tx_t *tx) 389 { 390 void *winner; 391 zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); 392 393 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 394 395 rw_init(&l->l_rwlock, 0, 0, 0); 396 rw_enter(&l->l_rwlock, RW_WRITER); 397 l->l_blkid = zap_allocate_blocks(zap, 1); 398 l->l_dbuf = NULL; 399 l->l_phys = NULL; 400 401 VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, 402 l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); 403 winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); 404 ASSERT(winner == NULL); 405 dmu_buf_will_dirty(l->l_dbuf, tx); 406 407 zap_leaf_init(l); 408 409 zap->zap_f.zap_phys->zap_num_leafs++; 410 411 return (l); 412 } 413 414 int 415 fzap_count(zap_t *zap, uint64_t *count) 416 { 417 ASSERT(!zap->zap_ismicro); 418 mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ 419 *count = zap->zap_f.zap_phys->zap_num_entries; 420 mutex_exit(&zap->zap_f.zap_num_entries_mtx); 421 return (0); 422 } 423 424 /* 425 * Routines for obtaining zap_leaf_t's 426 */ 427 428 void 429 zap_put_leaf(zap_leaf_t *l) 430 { 431 rw_exit(&l->l_rwlock); 432 dmu_buf_rele(l->l_dbuf, NULL); 433 } 434 435 _NOTE(ARGSUSED(0)) 436 static void 437 zap_leaf_pageout(dmu_buf_t *db, void *vl) 438 { 439 zap_leaf_t *l = vl; 440 441 rw_destroy(&l->l_rwlock); 442 kmem_free(l, sizeof (zap_leaf_t)); 443 } 444 445 static zap_leaf_t * 446 zap_open_leaf(uint64_t blkid, dmu_buf_t *db) 447 { 448 zap_leaf_t *l, *winner; 449 450 ASSERT(blkid != 0); 451 452 l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); 453 rw_init(&l->l_rwlock, 0, 0, 0); 454 rw_enter(&l->l_rwlock, RW_WRITER); 455 l->l_blkid = blkid; 456 l->l_bs = highbit(db->db_size)-1; 457 l->l_dbuf = db; 458 l->l_phys = NULL; 459 460 winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); 461 462 rw_exit(&l->l_rwlock); 463 if (winner != NULL) { 464 /* someone else set it first */ 465 zap_leaf_pageout(NULL, l); 466 l = winner; 467 } 468 469 /* 470 * lhr_pad was previously used for the next leaf in the leaf 471 * chain. There should be no chained leafs (as we have removed 472 * support for them). 473 */ 474 ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); 475 476 /* 477 * There should be more hash entries than there can be 478 * chunks to put in the hash table 479 */ 480 ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); 481 482 /* The chunks should begin at the end of the hash table */ 483 ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, 484 &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); 485 486 /* The chunks should end at the end of the block */ 487 ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - 488 (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); 489 490 return (l); 491 } 492 493 static int 494 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, 495 zap_leaf_t **lp) 496 { 497 dmu_buf_t *db; 498 zap_leaf_t *l; 499 int bs = FZAP_BLOCK_SHIFT(zap); 500 int err; 501 502 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 503 504 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 505 blkid << bs, NULL, &db); 506 if (err) 507 return (err); 508 509 ASSERT3U(db->db_object, ==, zap->zap_object); 510 ASSERT3U(db->db_offset, ==, blkid << bs); 511 ASSERT3U(db->db_size, ==, 1 << bs); 512 ASSERT(blkid != 0); 513 514 l = dmu_buf_get_user(db); 515 516 if (l == NULL) 517 l = zap_open_leaf(blkid, db); 518 519 rw_enter(&l->l_rwlock, lt); 520 /* 521 * Must lock before dirtying, otherwise l->l_phys could change, 522 * causing ASSERT below to fail. 523 */ 524 if (lt == RW_WRITER) 525 dmu_buf_will_dirty(db, tx); 526 ASSERT3U(l->l_blkid, ==, blkid); 527 ASSERT3P(l->l_dbuf, ==, db); 528 ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); 529 ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); 530 ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); 531 532 *lp = l; 533 return (0); 534 } 535 536 static int 537 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) 538 { 539 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 540 541 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { 542 ASSERT3U(idx, <, 543 (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); 544 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); 545 return (0); 546 } else { 547 return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, 548 idx, valp)); 549 } 550 } 551 552 static int 553 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) 554 { 555 ASSERT(tx != NULL); 556 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 557 558 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { 559 ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; 560 return (0); 561 } else { 562 return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, 563 idx, blk, tx)); 564 } 565 } 566 567 static int 568 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) 569 { 570 uint64_t idx, blk; 571 int err; 572 573 ASSERT(zap->zap_dbuf == NULL || 574 zap->zap_f.zap_phys == zap->zap_dbuf->db_data); 575 ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); 576 idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); 577 err = zap_idx_to_blk(zap, idx, &blk); 578 if (err != 0) 579 return (err); 580 err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); 581 582 ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == 583 (*lp)->l_phys->l_hdr.lh_prefix); 584 return (err); 585 } 586 587 static int 588 zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, 589 zap_leaf_t **lp) 590 { 591 zap_leaf_t *nl; 592 int prefix_diff, i, err; 593 uint64_t sibling; 594 int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; 595 596 ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); 597 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 598 599 ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, 600 l->l_phys->l_hdr.lh_prefix); 601 602 if (zap_tryupgradedir(zap, tx) == 0 || 603 old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { 604 /* We failed to upgrade, or need to grow the pointer table */ 605 objset_t *os = zap->zap_objset; 606 uint64_t object = zap->zap_object; 607 608 zap_put_leaf(l); 609 zap_unlockdir(zap); 610 err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); 611 if (err) 612 return (err); 613 ASSERT(!zap->zap_ismicro); 614 615 while (old_prefix_len == 616 zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { 617 err = zap_grow_ptrtbl(zap, tx); 618 if (err) 619 return (err); 620 } 621 622 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); 623 if (err) 624 return (err); 625 626 if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { 627 /* it split while our locks were down */ 628 *lp = l; 629 return (0); 630 } 631 } 632 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); 633 ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); 634 ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, 635 l->l_phys->l_hdr.lh_prefix); 636 637 prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - 638 (old_prefix_len + 1); 639 sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; 640 641 /* check for i/o errors before doing zap_leaf_split */ 642 for (i = 0; i < (1ULL<<prefix_diff); i++) { 643 uint64_t blk; 644 err = zap_idx_to_blk(zap, sibling+i, &blk); 645 if (err) 646 return (err); 647 ASSERT3U(blk, ==, l->l_blkid); 648 } 649 650 nl = zap_create_leaf(zap, tx); 651 zap_leaf_split(l, nl); 652 653 /* set sibling pointers */ 654 for (i = 0; i < (1ULL<<prefix_diff); i++) { 655 err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); 656 ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ 657 } 658 659 if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { 660 /* we want the sibling */ 661 zap_put_leaf(l); 662 *lp = nl; 663 } else { 664 zap_put_leaf(nl); 665 *lp = l; 666 } 667 668 return (0); 669 } 670 671 static void 672 zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) 673 { 674 int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; 675 int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && 676 l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); 677 678 zap_put_leaf(l); 679 680 if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { 681 int err; 682 683 /* 684 * We are in the middle of growing the pointer table, or 685 * this leaf will soon make us grow it. 686 */ 687 if (zap_tryupgradedir(zap, tx) == 0) { 688 objset_t *os = zap->zap_objset; 689 uint64_t zapobj = zap->zap_object; 690 691 zap_unlockdir(zap); 692 err = zap_lockdir(os, zapobj, tx, 693 RW_WRITER, FALSE, &zap); 694 if (err) 695 return; 696 } 697 698 /* could have finished growing while our locks were down */ 699 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) 700 (void) zap_grow_ptrtbl(zap, tx); 701 } 702 } 703 704 705 static int 706 fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) 707 { 708 if (name && strlen(name) > ZAP_MAXNAMELEN) 709 return (E2BIG); 710 711 /* Only integer sizes supported by C */ 712 switch (integer_size) { 713 case 1: 714 case 2: 715 case 4: 716 case 8: 717 break; 718 default: 719 return (EINVAL); 720 } 721 722 if (integer_size * num_integers > ZAP_MAXVALUELEN) 723 return (E2BIG); 724 725 return (0); 726 } 727 728 /* 729 * Routines for maniplulating attributes. 730 */ 731 int 732 fzap_lookup(zap_t *zap, const char *name, 733 uint64_t integer_size, uint64_t num_integers, void *buf) 734 { 735 zap_leaf_t *l; 736 int err; 737 uint64_t hash; 738 zap_entry_handle_t zeh; 739 740 err = fzap_checksize(name, integer_size, num_integers); 741 if (err != 0) 742 return (err); 743 744 hash = zap_hash(zap, name); 745 err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); 746 if (err != 0) 747 return (err); 748 err = zap_leaf_lookup(l, name, hash, &zeh); 749 if (err == 0) 750 err = zap_entry_read(&zeh, integer_size, num_integers, buf); 751 752 zap_put_leaf(l); 753 return (err); 754 } 755 756 int 757 fzap_add_cd(zap_t *zap, const char *name, 758 uint64_t integer_size, uint64_t num_integers, 759 const void *val, uint32_t cd, dmu_tx_t *tx) 760 { 761 zap_leaf_t *l; 762 uint64_t hash; 763 int err; 764 zap_entry_handle_t zeh; 765 766 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 767 ASSERT(!zap->zap_ismicro); 768 ASSERT(fzap_checksize(name, integer_size, num_integers) == 0); 769 770 hash = zap_hash(zap, name); 771 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); 772 if (err != 0) 773 return (err); 774 retry: 775 err = zap_leaf_lookup(l, name, hash, &zeh); 776 if (err == 0) { 777 err = EEXIST; 778 goto out; 779 } 780 if (err != ENOENT) 781 goto out; 782 783 err = zap_entry_create(l, name, hash, cd, 784 integer_size, num_integers, val, &zeh); 785 786 if (err == 0) { 787 zap_increment_num_entries(zap, 1, tx); 788 } else if (err == EAGAIN) { 789 err = zap_expand_leaf(zap, l, hash, tx, &l); 790 if (err == 0) 791 goto retry; 792 } 793 794 out: 795 zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); 796 return (err); 797 } 798 799 int 800 fzap_add(zap_t *zap, const char *name, 801 uint64_t integer_size, uint64_t num_integers, 802 const void *val, dmu_tx_t *tx) 803 { 804 int err = fzap_checksize(name, integer_size, num_integers); 805 if (err != 0) 806 return (err); 807 808 return (fzap_add_cd(zap, name, integer_size, num_integers, 809 val, ZAP_MAXCD, tx)); 810 } 811 812 int 813 fzap_update(zap_t *zap, const char *name, 814 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) 815 { 816 zap_leaf_t *l; 817 uint64_t hash; 818 int err, create; 819 zap_entry_handle_t zeh; 820 821 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 822 err = fzap_checksize(name, integer_size, num_integers); 823 if (err != 0) 824 return (err); 825 826 hash = zap_hash(zap, name); 827 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); 828 if (err != 0) 829 return (err); 830 retry: 831 err = zap_leaf_lookup(l, name, hash, &zeh); 832 create = (err == ENOENT); 833 ASSERT(err == 0 || err == ENOENT); 834 835 /* XXX If this leaf is chained, split it if we can. */ 836 837 if (create) { 838 err = zap_entry_create(l, name, hash, ZAP_MAXCD, 839 integer_size, num_integers, val, &zeh); 840 if (err == 0) 841 zap_increment_num_entries(zap, 1, tx); 842 } else { 843 err = zap_entry_update(&zeh, integer_size, num_integers, val); 844 } 845 846 if (err == EAGAIN) { 847 err = zap_expand_leaf(zap, l, hash, tx, &l); 848 if (err == 0) 849 goto retry; 850 } 851 852 out: 853 zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); 854 return (err); 855 } 856 857 int 858 fzap_length(zap_t *zap, const char *name, 859 uint64_t *integer_size, uint64_t *num_integers) 860 { 861 zap_leaf_t *l; 862 int err; 863 uint64_t hash; 864 zap_entry_handle_t zeh; 865 866 hash = zap_hash(zap, name); 867 err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); 868 if (err != 0) 869 return (err); 870 err = zap_leaf_lookup(l, name, hash, &zeh); 871 if (err != 0) 872 goto out; 873 874 if (integer_size) 875 *integer_size = zeh.zeh_integer_size; 876 if (num_integers) 877 *num_integers = zeh.zeh_num_integers; 878 out: 879 zap_put_leaf(l); 880 return (err); 881 } 882 883 int 884 fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) 885 { 886 zap_leaf_t *l; 887 uint64_t hash; 888 int err; 889 zap_entry_handle_t zeh; 890 891 hash = zap_hash(zap, name); 892 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); 893 if (err != 0) 894 return (err); 895 err = zap_leaf_lookup(l, name, hash, &zeh); 896 if (err == 0) { 897 zap_entry_remove(&zeh); 898 zap_increment_num_entries(zap, -1, tx); 899 } 900 zap_put_leaf(l); 901 dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n", 902 zap->zap_objset, zap->zap_object, name, err); 903 return (err); 904 } 905 906 int 907 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name) 908 { 909 zap_cursor_t zc; 910 zap_attribute_t *za; 911 int err; 912 913 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 914 for (zap_cursor_init(&zc, os, zapobj); 915 (err = zap_cursor_retrieve(&zc, za)) == 0; 916 zap_cursor_advance(&zc)) { 917 if (za->za_first_integer == value) { 918 (void) strcpy(name, za->za_name); 919 break; 920 } 921 } 922 zap_cursor_fini(&zc); 923 kmem_free(za, sizeof (zap_attribute_t)); 924 return (err); 925 } 926 927 928 /* 929 * Routines for iterating over the attributes. 930 */ 931 932 int 933 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) 934 { 935 int err = ENOENT; 936 zap_entry_handle_t zeh; 937 zap_leaf_t *l; 938 939 /* retrieve the next entry at or after zc_hash/zc_cd */ 940 /* if no entry, return ENOENT */ 941 942 if (zc->zc_leaf && 943 (ZAP_HASH_IDX(zc->zc_hash, 944 zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != 945 zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { 946 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); 947 zap_put_leaf(zc->zc_leaf); 948 zc->zc_leaf = NULL; 949 } 950 951 again: 952 if (zc->zc_leaf == NULL) { 953 err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, 954 &zc->zc_leaf); 955 if (err != 0) 956 return (err); 957 } else { 958 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); 959 } 960 l = zc->zc_leaf; 961 962 err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); 963 964 if (err == ENOENT) { 965 uint64_t nocare = 966 (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; 967 zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; 968 zc->zc_cd = 0; 969 if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { 970 zc->zc_hash = -1ULL; 971 } else { 972 zap_put_leaf(zc->zc_leaf); 973 zc->zc_leaf = NULL; 974 goto again; 975 } 976 } 977 978 if (err == 0) { 979 zc->zc_hash = zeh.zeh_hash; 980 zc->zc_cd = zeh.zeh_cd; 981 za->za_integer_length = zeh.zeh_integer_size; 982 za->za_num_integers = zeh.zeh_num_integers; 983 if (zeh.zeh_num_integers == 0) { 984 za->za_first_integer = 0; 985 } else { 986 err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); 987 ASSERT(err == 0 || err == EOVERFLOW); 988 } 989 err = zap_entry_read_name(&zeh, 990 sizeof (za->za_name), za->za_name); 991 ASSERT(err == 0); 992 } 993 rw_exit(&zc->zc_leaf->l_rwlock); 994 return (err); 995 } 996 997 998 static void 999 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) 1000 { 1001 int i, err; 1002 uint64_t lastblk = 0; 1003 1004 /* 1005 * NB: if a leaf has more pointers than an entire ptrtbl block 1006 * can hold, then it'll be accounted for more than once, since 1007 * we won't have lastblk. 1008 */ 1009 for (i = 0; i < len; i++) { 1010 zap_leaf_t *l; 1011 1012 if (tbl[i] == lastblk) 1013 continue; 1014 lastblk = tbl[i]; 1015 1016 err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); 1017 if (err == 0) { 1018 zap_leaf_stats(zap, l, zs); 1019 zap_put_leaf(l); 1020 } 1021 } 1022 } 1023 1024 void 1025 fzap_get_stats(zap_t *zap, zap_stats_t *zs) 1026 { 1027 int bs = FZAP_BLOCK_SHIFT(zap); 1028 zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; 1029 zs->zs_blocksize = 1ULL << bs; 1030 zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; 1031 zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; 1032 zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; 1033 1034 if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { 1035 /* the ptrtbl is entirely in the header block. */ 1036 zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1037 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); 1038 } else { 1039 int b; 1040 1041 dmu_prefetch(zap->zap_objset, zap->zap_object, 1042 zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, 1043 zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); 1044 1045 for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; 1046 b++) { 1047 dmu_buf_t *db; 1048 int err; 1049 1050 err = dmu_buf_hold(zap->zap_objset, zap->zap_object, 1051 (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, 1052 FTAG, &db); 1053 if (err == 0) { 1054 zap_stats_ptrtbl(zap, db->db_data, 1055 1<<(bs-3), zs); 1056 dmu_buf_rele(db, FTAG); 1057 } 1058 } 1059 } 1060 } 1061