153089ab7Seschrock /* 253089ab7Seschrock * CDDL HEADER START 353089ab7Seschrock * 453089ab7Seschrock * The contents of this file are subject to the terms of the 553089ab7Seschrock * Common Development and Distribution License (the "License"). 653089ab7Seschrock * You may not use this file except in compliance with the License. 753089ab7Seschrock * 853089ab7Seschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 953089ab7Seschrock * or http://www.opensolaris.org/os/licensing. 1053089ab7Seschrock * See the License for the specific language governing permissions 1153089ab7Seschrock * and limitations under the License. 1253089ab7Seschrock * 1353089ab7Seschrock * When distributing Covered Code, include this CDDL HEADER in each 1453089ab7Seschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1553089ab7Seschrock * If applicable, add the following below this CDDL HEADER, with the 1653089ab7Seschrock * fields enclosed by brackets "[]" replaced with your own identifying 1753089ab7Seschrock * information: Portions Copyright [yyyy] [name of copyright owner] 1853089ab7Seschrock * 1953089ab7Seschrock * CDDL HEADER END 2053089ab7Seschrock */ 2153089ab7Seschrock 2253089ab7Seschrock /* 237802d7bfSMatthew Ahrens * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24*c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com] 2553089ab7Seschrock */ 2653089ab7Seschrock 2753089ab7Seschrock #include <sys/arc.h> 2853089ab7Seschrock #include <sys/bptree.h> 2953089ab7Seschrock #include <sys/dmu.h> 3053089ab7Seschrock #include <sys/dmu_objset.h> 3153089ab7Seschrock #include <sys/dmu_tx.h> 3253089ab7Seschrock #include <sys/dmu_traverse.h> 3353089ab7Seschrock #include <sys/dsl_dataset.h> 3453089ab7Seschrock #include <sys/dsl_dir.h> 3553089ab7Seschrock #include <sys/dsl_pool.h> 3653089ab7Seschrock #include <sys/dnode.h> 3753089ab7Seschrock #include <sys/refcount.h> 3853089ab7Seschrock #include <sys/spa.h> 3953089ab7Seschrock 4053089ab7Seschrock /* 4153089ab7Seschrock * A bptree is a queue of root block pointers from destroyed datasets. When a 4253089ab7Seschrock * dataset is destroyed its root block pointer is put on the end of the pool's 4353089ab7Seschrock * bptree queue so the dataset's blocks can be freed asynchronously by 4453089ab7Seschrock * dsl_scan_sync. This allows the delete operation to finish without traversing 4553089ab7Seschrock * all the dataset's blocks. 4653089ab7Seschrock * 47f7170741SWill Andrews * Note that while bt_begin and bt_end are only ever incremented in this code, 4853089ab7Seschrock * they are effectively reset to 0 every time the entire bptree is freed because 4953089ab7Seschrock * the bptree's object is destroyed and re-created. 5053089ab7Seschrock */ 5153089ab7Seschrock 5253089ab7Seschrock struct bptree_args { 5353089ab7Seschrock bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ 5453089ab7Seschrock boolean_t ba_free; /* true if freeing during traversal */ 5553089ab7Seschrock 5653089ab7Seschrock bptree_itor_t *ba_func; /* function to call for each blockpointer */ 5753089ab7Seschrock void *ba_arg; /* caller supplied argument to ba_func */ 5853089ab7Seschrock dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ 5953089ab7Seschrock } bptree_args_t; 6053089ab7Seschrock 6153089ab7Seschrock uint64_t 6253089ab7Seschrock bptree_alloc(objset_t *os, dmu_tx_t *tx) 6353089ab7Seschrock { 6453089ab7Seschrock uint64_t obj; 6553089ab7Seschrock dmu_buf_t *db; 6653089ab7Seschrock bptree_phys_t *bt; 6753089ab7Seschrock 6853089ab7Seschrock obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, 69b5152584SMatthew Ahrens SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, 7053089ab7Seschrock sizeof (bptree_phys_t), tx); 7153089ab7Seschrock 7253089ab7Seschrock /* 7353089ab7Seschrock * Bonus buffer contents are already initialized to 0, but for 7453089ab7Seschrock * readability we make it explicit. 7553089ab7Seschrock */ 76b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 7753089ab7Seschrock dmu_buf_will_dirty(db, tx); 7853089ab7Seschrock bt = db->db_data; 7953089ab7Seschrock bt->bt_begin = 0; 8053089ab7Seschrock bt->bt_end = 0; 8153089ab7Seschrock bt->bt_bytes = 0; 8253089ab7Seschrock bt->bt_comp = 0; 8353089ab7Seschrock bt->bt_uncomp = 0; 8453089ab7Seschrock dmu_buf_rele(db, FTAG); 8553089ab7Seschrock 8653089ab7Seschrock return (obj); 8753089ab7Seschrock } 8853089ab7Seschrock 8953089ab7Seschrock int 9053089ab7Seschrock bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 9153089ab7Seschrock { 9253089ab7Seschrock dmu_buf_t *db; 9353089ab7Seschrock bptree_phys_t *bt; 9453089ab7Seschrock 95b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 9653089ab7Seschrock bt = db->db_data; 9753089ab7Seschrock ASSERT3U(bt->bt_begin, ==, bt->bt_end); 98fb09f5aaSMadhav Suresh ASSERT0(bt->bt_bytes); 99fb09f5aaSMadhav Suresh ASSERT0(bt->bt_comp); 100fb09f5aaSMadhav Suresh ASSERT0(bt->bt_uncomp); 10153089ab7Seschrock dmu_buf_rele(db, FTAG); 10253089ab7Seschrock 10353089ab7Seschrock return (dmu_object_free(os, obj, tx)); 10453089ab7Seschrock } 10553089ab7Seschrock 1067fd05ac4SMatthew Ahrens boolean_t 1077fd05ac4SMatthew Ahrens bptree_is_empty(objset_t *os, uint64_t obj) 1087fd05ac4SMatthew Ahrens { 1097fd05ac4SMatthew Ahrens dmu_buf_t *db; 1107fd05ac4SMatthew Ahrens bptree_phys_t *bt; 1117fd05ac4SMatthew Ahrens boolean_t rv; 1127fd05ac4SMatthew Ahrens 1137fd05ac4SMatthew Ahrens VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); 1147fd05ac4SMatthew Ahrens bt = db->db_data; 1157fd05ac4SMatthew Ahrens rv = (bt->bt_begin == bt->bt_end); 1167fd05ac4SMatthew Ahrens dmu_buf_rele(db, FTAG); 1177fd05ac4SMatthew Ahrens return (rv); 1187fd05ac4SMatthew Ahrens } 1197fd05ac4SMatthew Ahrens 12053089ab7Seschrock void 12153089ab7Seschrock bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, 12253089ab7Seschrock uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) 12353089ab7Seschrock { 12453089ab7Seschrock dmu_buf_t *db; 12553089ab7Seschrock bptree_phys_t *bt; 1267fd05ac4SMatthew Ahrens bptree_entry_phys_t bte = { 0 }; 12753089ab7Seschrock 12853089ab7Seschrock /* 12953089ab7Seschrock * bptree objects are in the pool mos, therefore they can only be 13053089ab7Seschrock * modified in syncing context. Furthermore, this is only modified 13153089ab7Seschrock * by the sync thread, so no locking is necessary. 13253089ab7Seschrock */ 13353089ab7Seschrock ASSERT(dmu_tx_is_syncing(tx)); 13453089ab7Seschrock 135b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 13653089ab7Seschrock bt = db->db_data; 13753089ab7Seschrock 13853089ab7Seschrock bte.be_birth_txg = birth_txg; 13953089ab7Seschrock bte.be_bp = *bp; 14053089ab7Seschrock dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); 14153089ab7Seschrock 14253089ab7Seschrock dmu_buf_will_dirty(db, tx); 14353089ab7Seschrock bt->bt_end++; 14453089ab7Seschrock bt->bt_bytes += bytes; 14553089ab7Seschrock bt->bt_comp += comp; 14653089ab7Seschrock bt->bt_uncomp += uncomp; 14753089ab7Seschrock dmu_buf_rele(db, FTAG); 14853089ab7Seschrock } 14953089ab7Seschrock 15053089ab7Seschrock /* ARGSUSED */ 15153089ab7Seschrock static int 1521b912ec7SGeorge Wilson bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1537802d7bfSMatthew Ahrens const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 15453089ab7Seschrock { 15553089ab7Seschrock int err; 15653089ab7Seschrock struct bptree_args *ba = arg; 15753089ab7Seschrock 158a2cdcdd2SPaul Dagnelie if (bp == NULL || BP_IS_HOLE(bp)) 15953089ab7Seschrock return (0); 16053089ab7Seschrock 16153089ab7Seschrock err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); 16253089ab7Seschrock if (err == 0 && ba->ba_free) { 16353089ab7Seschrock ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); 16453089ab7Seschrock ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); 16553089ab7Seschrock ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); 16653089ab7Seschrock } 16753089ab7Seschrock return (err); 16853089ab7Seschrock } 16953089ab7Seschrock 1707fd05ac4SMatthew Ahrens /* 1717fd05ac4SMatthew Ahrens * If "free" is set: 1727fd05ac4SMatthew Ahrens * - It is assumed that "func" will be freeing the block pointers. 1737fd05ac4SMatthew Ahrens * - If "func" returns nonzero, the bookmark will be remembered and 1747fd05ac4SMatthew Ahrens * iteration will be restarted from this point on next invocation. 1757fd05ac4SMatthew Ahrens * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), 1767fd05ac4SMatthew Ahrens * bptree_iterate will remember the bookmark, continue traversing 1777fd05ac4SMatthew Ahrens * any additional entries, and return 0. 1787fd05ac4SMatthew Ahrens * 1797fd05ac4SMatthew Ahrens * If "free" is not set, traversal will stop and return an error if 1807fd05ac4SMatthew Ahrens * an i/o error is encountered. 1817fd05ac4SMatthew Ahrens * 1827fd05ac4SMatthew Ahrens * In either case, if zfs_free_leak_on_eio is set, i/o errors will be 1837fd05ac4SMatthew Ahrens * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to 1847fd05ac4SMatthew Ahrens * traverse_dataset_destroyed()). 1857fd05ac4SMatthew Ahrens */ 18653089ab7Seschrock int 18753089ab7Seschrock bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, 18853089ab7Seschrock void *arg, dmu_tx_t *tx) 18953089ab7Seschrock { 1907fd05ac4SMatthew Ahrens boolean_t ioerr = B_FALSE; 19153089ab7Seschrock int err; 19253089ab7Seschrock uint64_t i; 19353089ab7Seschrock dmu_buf_t *db; 19453089ab7Seschrock struct bptree_args ba; 19553089ab7Seschrock 19653089ab7Seschrock ASSERT(!free || dmu_tx_is_syncing(tx)); 19753089ab7Seschrock 19853089ab7Seschrock err = dmu_bonus_hold(os, obj, FTAG, &db); 19953089ab7Seschrock if (err != 0) 20053089ab7Seschrock return (err); 20153089ab7Seschrock 20253089ab7Seschrock if (free) 20353089ab7Seschrock dmu_buf_will_dirty(db, tx); 20453089ab7Seschrock 20553089ab7Seschrock ba.ba_phys = db->db_data; 20653089ab7Seschrock ba.ba_free = free; 20753089ab7Seschrock ba.ba_func = func; 20853089ab7Seschrock ba.ba_arg = arg; 20953089ab7Seschrock ba.ba_tx = tx; 21053089ab7Seschrock 21153089ab7Seschrock err = 0; 21253089ab7Seschrock for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { 21353089ab7Seschrock bptree_entry_phys_t bte; 2148b36997aSMatthew Ahrens int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; 21553089ab7Seschrock 21653089ab7Seschrock err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), 21753089ab7Seschrock &bte, DMU_READ_NO_PREFETCH); 21853089ab7Seschrock if (err != 0) 21953089ab7Seschrock break; 22053089ab7Seschrock 2217fd05ac4SMatthew Ahrens if (zfs_free_leak_on_eio) 2228b36997aSMatthew Ahrens flags |= TRAVERSE_HARD; 2237fd05ac4SMatthew Ahrens zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld " 2247fd05ac4SMatthew Ahrens "bookmark %lld/%lld/%lld/%lld", 2257fd05ac4SMatthew Ahrens i, (longlong_t)bte.be_birth_txg, 2267fd05ac4SMatthew Ahrens (longlong_t)bte.be_zb.zb_objset, 2277fd05ac4SMatthew Ahrens (longlong_t)bte.be_zb.zb_object, 2287fd05ac4SMatthew Ahrens (longlong_t)bte.be_zb.zb_level, 2297fd05ac4SMatthew Ahrens (longlong_t)bte.be_zb.zb_blkid); 23053089ab7Seschrock err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, 2318b36997aSMatthew Ahrens bte.be_birth_txg, &bte.be_zb, flags, 23253089ab7Seschrock bptree_visit_cb, &ba); 23353089ab7Seschrock if (free) { 2347fd05ac4SMatthew Ahrens /* 2357fd05ac4SMatthew Ahrens * The callback has freed the visited block pointers. 2367fd05ac4SMatthew Ahrens * Record our traversal progress on disk, either by 2377fd05ac4SMatthew Ahrens * updating this record's bookmark, or by logically 2387fd05ac4SMatthew Ahrens * removing this record by advancing bt_begin. 2397fd05ac4SMatthew Ahrens */ 2407fd05ac4SMatthew Ahrens if (err != 0) { 24153089ab7Seschrock /* save bookmark for future resume */ 24253089ab7Seschrock ASSERT3U(bte.be_zb.zb_objset, ==, 24353089ab7Seschrock ZB_DESTROYED_OBJSET); 244fb09f5aaSMadhav Suresh ASSERT0(bte.be_zb.zb_level); 24553089ab7Seschrock dmu_write(os, obj, i * sizeof (bte), 24653089ab7Seschrock sizeof (bte), &bte, tx); 2477fd05ac4SMatthew Ahrens if (err == EIO || err == ECKSUM || 2487fd05ac4SMatthew Ahrens err == ENXIO) { 2497fd05ac4SMatthew Ahrens /* 2507fd05ac4SMatthew Ahrens * Skip the rest of this tree and 2517fd05ac4SMatthew Ahrens * continue on to the next entry. 2527fd05ac4SMatthew Ahrens */ 2537fd05ac4SMatthew Ahrens err = 0; 2547fd05ac4SMatthew Ahrens ioerr = B_TRUE; 2557fd05ac4SMatthew Ahrens } else { 25653089ab7Seschrock break; 2578b36997aSMatthew Ahrens } 2587fd05ac4SMatthew Ahrens } else if (ioerr) { 2598b36997aSMatthew Ahrens /* 2607fd05ac4SMatthew Ahrens * This entry is finished, but there were 2617fd05ac4SMatthew Ahrens * i/o errors on previous entries, so we 2627fd05ac4SMatthew Ahrens * can't adjust bt_begin. Set this entry's 2637fd05ac4SMatthew Ahrens * be_birth_txg such that it will be 2647fd05ac4SMatthew Ahrens * treated as a no-op in future traversals. 2658b36997aSMatthew Ahrens */ 2667fd05ac4SMatthew Ahrens bte.be_birth_txg = UINT64_MAX; 2677fd05ac4SMatthew Ahrens dmu_write(os, obj, i * sizeof (bte), 2687fd05ac4SMatthew Ahrens sizeof (bte), &bte, tx); 2698b36997aSMatthew Ahrens } 2708b36997aSMatthew Ahrens 2717fd05ac4SMatthew Ahrens if (!ioerr) { 27253089ab7Seschrock ba.ba_phys->bt_begin++; 27353089ab7Seschrock (void) dmu_free_range(os, obj, 27453089ab7Seschrock i * sizeof (bte), sizeof (bte), tx); 27553089ab7Seschrock } 2767fd05ac4SMatthew Ahrens } else if (err != 0) { 2777fd05ac4SMatthew Ahrens break; 2787fd05ac4SMatthew Ahrens } 27953089ab7Seschrock } 28053089ab7Seschrock 2817fd05ac4SMatthew Ahrens ASSERT(!free || err != 0 || ioerr || 2827fd05ac4SMatthew Ahrens ba.ba_phys->bt_begin == ba.ba_phys->bt_end); 28353089ab7Seschrock 28453089ab7Seschrock /* if all blocks are free there should be no used space */ 28553089ab7Seschrock if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { 2867fd05ac4SMatthew Ahrens if (zfs_free_leak_on_eio) { 2877fd05ac4SMatthew Ahrens ba.ba_phys->bt_bytes = 0; 2887fd05ac4SMatthew Ahrens ba.ba_phys->bt_comp = 0; 2897fd05ac4SMatthew Ahrens ba.ba_phys->bt_uncomp = 0; 2907fd05ac4SMatthew Ahrens } 2917fd05ac4SMatthew Ahrens 292fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_bytes); 293fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_comp); 294fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_uncomp); 29553089ab7Seschrock } 29653089ab7Seschrock 29753089ab7Seschrock dmu_buf_rele(db, FTAG); 29853089ab7Seschrock 29953089ab7Seschrock return (err); 30053089ab7Seschrock } 301