/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>

/*
 * Tunables: zfs_no_write_throttle disables the write throttle entirely;
 * a nonzero zfs_write_limit_override replaces the per-pool dp_write_limit.
 */
int zfs_no_write_throttle = 0;
uint64_t zfs_write_limit_override = 0;

/*
 * Look up the MOS_DIR_NAME child of the root dsl_dir and open it.
 */
static int
dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
}

/*
 * Allocate the in-core dsl_pool_t and initialize its locks, per-txg
 * state, and dirty/synced dataset and dir lists.
 */
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);
	extern uint64_t zfs_write_limit_min;

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));
	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);

	return (dp);
}

/*
 * Open an existing pool: open the meta-objset (MOS), find the root
 * dataset object in the pool directory, and open the root and MOS dirs.
 */
int
dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	objset_impl_t *osi;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
	if (err)
		goto out;
	dp->dp_meta_objset = &osi->os;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
	if (err)
		goto out;

out:
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

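/*
 * Release everything acquired by dsl_pool_open()/dsl_pool_open_impl()
 * and free the dsl_pool_t.
 */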
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our reference from dsl_pool_open() */
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(NULL, dp->dp_meta_objset->os);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	list_destroy(&dp->dp_synced_datasets);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	kmem_free(dp, sizeof (dsl_pool_t));
}

dsl_pool_t *
dsl_pool_create(spa_t *spa, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);

	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));

	dmu_tx_commit(tx);

	return (dp);
}

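/*
 * Sync all dirty datasets, sync tasks, and dsl_dirs for this txg,
 * then push out the MOS if it was dirtied and record its new rootbp
 * as the pool's root block pointer.
 */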
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_impl_t *mosi = dp->dp_meta_objset->os;
	int err;

	tx = dmu_tx_create_assigned(dp, txg);

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		if (!list_link_active(&ds->ds_synced_link))
			list_insert_tail(&dp->dp_synced_datasets, ds);
		else
			dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	err = zio_wait(zio);
	ASSERT(err == 0);

	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
		dsl_sync_task_group_sync(dstg, tx);
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);

	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mosi, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	dmu_tx_commit(tx);
}

void
dsl_pool_zil_clean(dsl_pool_t *dp)
{
	dsl_dataset_t *ds;

	while (ds = list_head(&dp->dp_synced_datasets)) {
		list_remove(&dp->dp_synced_datasets, ds);
		ASSERT(ds->ds_user_ptr != NULL);
		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
		dmu_buf_rele(ds->ds_dbuf, ds);
	}
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_get_dsl(dp->dp_spa) == NULL);
}

uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		dp->dp_tempreserved[tx->tx_txg & TXG_MASK] += space;
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit)
			return (ERESTART);
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}

/*
 * Release a temporary reservation taken by dsl_pool_tempreserve_space().
 */
void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}

/*
 * Under memory pressure, lower dp_write_limit toward one quarter of the
 * space currently queued or reserved, but never below zfs_write_limit_min.
 */
void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	extern uint64_t zfs_write_limit_min;
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}

/*
 * Record that 'space' bytes are expected to be written in this txg,
 * for write-throttle accounting.
 */
void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}