1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dbuf.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
34 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
35 #include <sys/dsl_pool.h>
36 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
37 #include <sys/spa.h>
38 #include <sys/sa.h>
39 #include <sys/sa_impl.h>
40 #include <sys/zfs_context.h>
41 #include <sys/varargs.h>
42
43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44 uint64_t arg1, uint64_t arg2);
45
46
47 dmu_tx_t *
48 dmu_tx_create_dd(dsl_dir_t *dd)
49 {
50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51 tx->tx_dir = dd;
52 if (dd != NULL)
53 tx->tx_pool = dd->dd_pool;
54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55 offsetof(dmu_tx_hold_t, txh_node));
56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57 offsetof(dmu_tx_callback_t, dcb_node));
58 tx->tx_start = gethrtime();
59 #ifdef ZFS_DEBUG
60 refcount_create(&tx->tx_space_written);
61 refcount_create(&tx->tx_space_freed);
62 #endif
63 return (tx);
64 }
65
66 dmu_tx_t *
67 dmu_tx_create(objset_t *os)
68 {
69 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
70 tx->tx_objset = os;
71 tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
72 return (tx);
73 }
74
75 dmu_tx_t *
76 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
77 {
78 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
79
80 ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
81 tx->tx_pool = dp;
82 tx->tx_txg = txg;
83 tx->tx_anyobj = TRUE;
84
85 return (tx);
86 }
87
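/*
 * dmu_tx_is_syncing() and dmu_tx_private_ok() both report tx_anyobj, which
 * is set only by dmu_tx_create_assigned(), i.e. for transactions created in
 * syncing context that may manipulate any object without per-object holds.
 */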
88 int
89 dmu_tx_is_syncing(dmu_tx_t *tx)
90 {
91 return (tx->tx_anyobj);
92 }
93
94 int
95 dmu_tx_private_ok(dmu_tx_t *tx)
96 {
97 return (tx->tx_anyobj);
98 }
99
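/*
 * Create a hold record for (os, object) on this tx.  Unless the object is
 * DMU_NEW_OBJECT, take a dnode hold, and if the tx is already assigned to a
 * txg, take a dn_tx_holds reference as well.  On error, no hold is created
 * and tx_err is set so that dmu_tx_assign() will fail later.
 */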
100 static dmu_tx_hold_t *
101 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
102 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
103 {
104 dmu_tx_hold_t *txh;
105 dnode_t *dn = NULL;
106 int err;
107
108 if (object != DMU_NEW_OBJECT) {
109 err = dnode_hold(os, object, tx, &dn);
110 if (err) {
111 tx->tx_err = err;
112 return (NULL);
113 }
114
115 if (err == 0 && tx->tx_txg != 0) {
116 mutex_enter(&dn->dn_mtx);
117 /*
118 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
119 * problem, but there's no way for it to happen (for
120 * now, at least).
121 */
122 ASSERT(dn->dn_assigned_txg == 0);
123 dn->dn_assigned_txg = tx->tx_txg;
124 (void) refcount_add(&dn->dn_tx_holds, tx);
125 mutex_exit(&dn->dn_mtx);
126 }
127 }
128
129 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
130 txh->txh_tx = tx;
131 txh->txh_dnode = dn;
132 #ifdef ZFS_DEBUG
133 txh->txh_type = type;
134 txh->txh_arg1 = arg1;
135 txh->txh_arg2 = arg2;
136 #endif
137 list_insert_tail(&tx->tx_holds, txh);
138
139 return (txh);
140 }
141
142 void
143 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
144 {
145 /*
146 * If we're syncing, they can manipulate any object anyhow, and
147 * the hold on the dnode_t can cause problems.
148 */
149 if (!dmu_tx_is_syncing(tx)) {
150 (void) dmu_tx_hold_object_impl(tx, os,
151 object, THT_NEWOBJECT, 0, 0);
152 }
153 }
154
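/*
 * Read the indicated block (with DB_RF_CANFAIL) so that callers can detect
 * i/o errors at hold time, before the tx has been assigned to a txg.
 */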
155 static int
156 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
157 {
158 int err;
159 dmu_buf_impl_t *db;
160
161 rw_enter(&dn->dn_struct_rwlock, RW_READER);
162 db = dbuf_hold_level(dn, level, blkid, FTAG);
163 rw_exit(&dn->dn_struct_rwlock);
164 if (db == NULL)
165 return (SET_ERROR(EIO));
166 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
167 dbuf_rele(db, FTAG);
168 return (err);
169 }
170
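/*
 * Account for dirtying one block and its not-yet-counted indirect ancestors.
 * history[] remembers the last blkid charged at each level so shared
 * indirects are counted only once; blocks whose existing copies are freeable
 * are charged to tooverwrite, others to towrite, and any existing bp adds to
 * tounref.
 */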
171 static void
172 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
173 int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
174 {
175 objset_t *os = dn->dn_objset;
176 dsl_dataset_t *ds = os->os_dsl_dataset;
177 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
178 dmu_buf_impl_t *parent = NULL;
179 blkptr_t *bp = NULL;
180 uint64_t space;
181
182 if (level >= dn->dn_nlevels || history[level] == blkid)
183 return;
184
185 history[level] = blkid;
186
187 space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
188
189 if (db == NULL || db == dn->dn_dbuf) {
190 ASSERT(level != 0);
191 db = NULL;
192 } else {
193 ASSERT(DB_DNODE(db) == dn);
194 ASSERT(db->db_level == level);
195 ASSERT(db->db.db_size == space);
196 ASSERT(db->db_blkid == blkid);
197 bp = db->db_blkptr;
198 parent = db->db_parent;
199 }
200
201 freeable = (bp && (freeable ||
202 dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
203
204 if (freeable)
205 txh->txh_space_tooverwrite += space;
206 else
207 txh->txh_space_towrite += space;
208 if (bp)
209 txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
210
211 dmu_tx_count_twig(txh, dn, parent, level + 1,
212 blkid >> epbs, freeable, history);
213 }
214
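/*
 * Estimate the worst-case space consumed by writing [off, off+len): the data
 * blocks, the indirect blocks above them, and (via dmu_tx_count_twig()) the
 * overwrite/unref charges for existing, freeable blocks.
 */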
215 /* ARGSUSED */
216 static void
217 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
218 {
219 dnode_t *dn = txh->txh_dnode;
220 uint64_t start, end, i;
221 int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
222 int err = 0;
223
224 if (len == 0)
225 return;
226
227 min_bs = SPA_MINBLOCKSHIFT;
228 max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
229 min_ibs = DN_MIN_INDBLKSHIFT;
230 max_ibs = DN_MAX_INDBLKSHIFT;
231
232 if (dn) {
233 uint64_t history[DN_MAX_LEVELS];
234 int nlvls = dn->dn_nlevels;
235 int delta;
236
237 /*
238 * For i/o error checking, read the first and last level-0
239 * blocks (if they are not aligned), and all the level-1 blocks.
240 */
241 if (dn->dn_maxblkid == 0) {
242 delta = dn->dn_datablksz;
243 start = (off < dn->dn_datablksz) ? 0 : 1;
244 end = (off+len <= dn->dn_datablksz) ? 0 : 1;
245 if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
246 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
247 if (err)
248 goto out;
249 delta -= off;
250 }
251 } else {
252 zio_t *zio = zio_root(dn->dn_objset->os_spa,
253 NULL, NULL, ZIO_FLAG_CANFAIL);
254
255 /* first level-0 block */
256 start = off >> dn->dn_datablkshift;
257 if (P2PHASE(off, dn->dn_datablksz) ||
258 len < dn->dn_datablksz) {
259 err = dmu_tx_check_ioerr(zio, dn, 0, start);
260 if (err)
261 goto out;
262 }
263
264 /* last level-0 block */
265 end = (off+len-1) >> dn->dn_datablkshift;
266 if (end != start && end <= dn->dn_maxblkid &&
267 P2PHASE(off+len, dn->dn_datablksz)) {
268 err = dmu_tx_check_ioerr(zio, dn, 0, end);
269 if (err)
270 goto out;
271 }
272
273 /* level-1 blocks */
274 if (nlvls > 1) {
275 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
276 for (i = (start>>shft)+1; i < end>>shft; i++) {
277 err = dmu_tx_check_ioerr(zio, dn, 1, i);
278 if (err)
279 goto out;
280 }
281 }
282
283 err = zio_wait(zio);
284 if (err)
285 goto out;
286 delta = P2NPHASE(off, dn->dn_datablksz);
287 }
288
289 min_ibs = max_ibs = dn->dn_indblkshift;
290 if (dn->dn_maxblkid > 0) {
291 /*
292 * The blocksize can't change,
293 * so we can make a more precise estimate.
294 */
295 ASSERT(dn->dn_datablkshift != 0);
296 min_bs = max_bs = dn->dn_datablkshift;
297 } else {
298 /*
299 * The blocksize can increase up to the recordsize,
300 * or if it is already more than the recordsize,
301 * up to the next power of 2.
302 */
303 min_bs = highbit64(dn->dn_datablksz - 1);
304 max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
305 }
306
307 /*
308 * If this write is not off the end of the file
309 * we need to account for overwrites/unref.
310 */
311 if (start <= dn->dn_maxblkid) {
312 for (int l = 0; l < DN_MAX_LEVELS; l++)
313 history[l] = -1ULL;
314 }
315 while (start <= dn->dn_maxblkid) {
316 dmu_buf_impl_t *db;
317
318 rw_enter(&dn->dn_struct_rwlock, RW_READER);
319 err = dbuf_hold_impl(dn, 0, start,
320 FALSE, FALSE, FTAG, &db);
321 rw_exit(&dn->dn_struct_rwlock);
322
323 if (err) {
324 txh->txh_tx->tx_err = err;
325 return;
326 }
327
328 dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
329 history);
330 dbuf_rele(db, FTAG);
331 if (++start > end) {
332 /*
333 * Account for new indirects appearing
334 * before this IO gets assigned into a txg.
335 */
336 bits = 64 - min_bs;
337 epbs = min_ibs - SPA_BLKPTRSHIFT;
338 for (bits -= epbs * (nlvls - 1);
339 bits >= 0; bits -= epbs)
340 txh->txh_fudge += 1ULL << max_ibs;
341 goto out;
342 }
343 off += delta;
344 if (len >= delta)
345 len -= delta;
346 delta = dn->dn_datablksz;
347 }
348 }
349
350 /*
351 * 'end' is the last thing we will access, not one past.
352 * This way we won't overflow when accessing the last byte.
353 */
354 start = P2ALIGN(off, 1ULL << max_bs);
355 end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
356 txh->txh_space_towrite += end - start + 1;
357
358 start >>= min_bs;
359 end >>= min_bs;
360
361 epbs = min_ibs - SPA_BLKPTRSHIFT;
362
363 /*
364 * The object contains at most 2^(64 - min_bs) blocks,
365 * and each indirect level maps 2^epbs.
366 */
367 for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
368 start >>= epbs;
369 end >>= epbs;
370 ASSERT3U(end, >=, start);
371 txh->txh_space_towrite += (end - start + 1) << max_ibs;
372 if (start != 0) {
373 /*
374 * We also need a new blkid=0 indirect block
375 * to reference any existing file data.
376 */
377 txh->txh_space_towrite += 1ULL << max_ibs;
378 }
379 }
380
381 out:
382 if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
383 2 * DMU_MAX_ACCESS)
384 err = SET_ERROR(EFBIG);
385
386 if (err)
387 txh->txh_tx->tx_err = err;
388 }
389
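/*
 * Account for dirtying the dnode itself: one meta-dnode data block plus an
 * indirect block at each meta-dnode level, charged to tooverwrite when the
 * dnode's existing block is freeable and to towrite otherwise.
 */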
390 static void
391 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
392 {
393 dnode_t *dn = txh->txh_dnode;
394 dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
395 uint64_t space = mdn->dn_datablksz +
396 ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
397
398 if (dn && dn->dn_dbuf->db_blkptr &&
399 dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
400 dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
401 txh->txh_space_tooverwrite += space;
402 txh->txh_space_tounref += space;
403 } else {
404 txh->txh_space_towrite += space;
405 if (dn && dn->dn_dbuf->db_blkptr)
406 txh->txh_space_tounref += space;
407 }
408 }
409
410 void
411 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
412 {
413 dmu_tx_hold_t *txh;
414
415 ASSERT(tx->tx_txg == 0);
416 ASSERT(len < DMU_MAX_ACCESS);
417 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
418
419 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
420 object, THT_WRITE, off, len);
421 if (txh == NULL)
422 return;
423
424 dmu_tx_count_write(txh, off, len);
425 dmu_tx_count_dnode(txh);
426 }
427
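/*
 * Estimate the space freed (and unreferenced) by freeing [off, off+len) of
 * this object, along with the memory needed to hold the level-1 and higher
 * indirect blocks that must be dirtied to do so.
 */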
428 static void
429 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
430 {
431 uint64_t blkid, nblks, lastblk;
432 uint64_t space = 0, unref = 0, skipped = 0;
433 dnode_t *dn = txh->txh_dnode;
434 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
435 spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
436 int epbs;
437 uint64_t l0span = 0, nl1blks = 0;
438
439 if (dn->dn_nlevels == 0)
440 return;
441
442 /*
443 * The struct_rwlock protects us against dn_nlevels
444 * changing, in case (against all odds) we manage to dirty &
445 * sync out the changes after we check for being dirty.
446 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
447 */
448 rw_enter(&dn->dn_struct_rwlock, RW_READER);
449 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
450 if (dn->dn_maxblkid == 0) {
451 if (off == 0 && len >= dn->dn_datablksz) {
452 blkid = 0;
453 nblks = 1;
454 } else {
455 rw_exit(&dn->dn_struct_rwlock);
456 return;
457 }
458 } else {
459 blkid = off >> dn->dn_datablkshift;
460 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
461
462 if (blkid > dn->dn_maxblkid) {
463 rw_exit(&dn->dn_struct_rwlock);
464 return;
465 }
466 if (blkid + nblks > dn->dn_maxblkid)
467 nblks = dn->dn_maxblkid - blkid + 1;
468
469 }
470 l0span = nblks; /* save for later use to calc level > 1 overhead */
471 if (dn->dn_nlevels == 1) {
472 int i;
473 for (i = 0; i < nblks; i++) {
474 blkptr_t *bp = dn->dn_phys->dn_blkptr;
475 ASSERT3U(blkid + i, <, dn->dn_nblkptr);
476 bp += blkid + i;
477 if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
478 dprintf_bp(bp, "can free old%s", "");
479 space += bp_get_dsize(spa, bp);
480 }
481 unref += BP_GET_ASIZE(bp);
482 }
483 nl1blks = 1;
484 nblks = 0;
485 }
486
487 lastblk = blkid + nblks - 1;
488 while (nblks) {
489 dmu_buf_impl_t *dbuf;
490 uint64_t ibyte, new_blkid;
491 int epb = 1 << epbs;
492 int err, i, blkoff, tochk;
493 blkptr_t *bp;
494
495 ibyte = blkid << dn->dn_datablkshift;
496 err = dnode_next_offset(dn,
497 DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
498 new_blkid = ibyte >> dn->dn_datablkshift;
499 if (err == ESRCH) {
500 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
501 break;
502 }
503 if (err) {
504 txh->txh_tx->tx_err = err;
505 break;
506 }
507 if (new_blkid > lastblk) {
508 skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
509 break;
510 }
511
512 if (new_blkid > blkid) {
513 ASSERT((new_blkid >> epbs) > (blkid >> epbs));
514 skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
515 nblks -= new_blkid - blkid;
516 blkid = new_blkid;
517 }
518 blkoff = P2PHASE(blkid, epb);
519 tochk = MIN(epb - blkoff, nblks);
520
521 err = dbuf_hold_impl(dn, 1, blkid >> epbs,
522 FALSE, FALSE, FTAG, &dbuf);
523 if (err) {
524 txh->txh_tx->tx_err = err;
525 break;
526 }
527
528 txh->txh_memory_tohold += dbuf->db.db_size;
529
530 /*
531 * We don't check memory_tohold against DMU_MAX_ACCESS because
532 * memory_tohold is an over-estimation (especially the >L1
533 * indirect blocks), so it could fail. Callers should have
534 * already verified that they will not be holding too much
535 * memory.
536 */
537
538 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
539 if (err != 0) {
540 txh->txh_tx->tx_err = err;
541 dbuf_rele(dbuf, FTAG);
542 break;
543 }
544
545 bp = dbuf->db.db_data;
546 bp += blkoff;
547
548 for (i = 0; i < tochk; i++) {
549 if (dsl_dataset_block_freeable(ds, &bp[i],
550 bp[i].blk_birth)) {
551 dprintf_bp(&bp[i], "can free old%s", "");
552 space += bp_get_dsize(spa, &bp[i]);
553 }
554 unref += BP_GET_ASIZE(&bp[i]);
555 }
556 dbuf_rele(dbuf, FTAG);
557
558 ++nl1blks;
559 blkid += tochk;
560 nblks -= tochk;
561 }
562 rw_exit(&dn->dn_struct_rwlock);
563
564 /*
565 * Add in memory requirements of higher-level indirects.
566 * This assumes a worst-possible scenario for dn_nlevels and a
567 * worst-possible distribution of l1-blocks over the region to free.
568 */
569 {
570 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
571 int level = 2;
572 /*
573 * Here we don't use DN_MAX_LEVELS, but calculate it with the
574 * given datablkshift and indblkshift. This makes the
575 * difference between 19 and 8 on large files.
576 */
577 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
578 (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
579
580 while (level++ < maxlevel) {
581 txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
582 << dn->dn_indblkshift;
583 blkcnt = 1 + (blkcnt >> epbs);
584 }
585 }
586
587 /* account for new level 1 indirect blocks that might show up */
588 if (skipped > 0) {
589 txh->txh_fudge += skipped << dn->dn_indblkshift;
590 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
591 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
592 }
593 txh->txh_space_tofree += space;
594 txh->txh_space_tounref += unref;
595 }
596
597 /*
598 * This function marks the transaction as being a "net free". The end
599 * result is that refquotas will be disabled for this transaction, and
600 * this transaction will be able to use half of the pool space overhead
601 * (see dsl_pool_adjustedsize()). Therefore this function should only
602 * be called for transactions that we expect will not cause a net increase
603 * in the amount of space used (but it's OK if that is occasionally not true).
604 */
605 void
606 dmu_tx_mark_netfree(dmu_tx_t *tx)
607 {
608 dmu_tx_hold_t *txh;
609
610 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
611 DMU_NEW_OBJECT, THT_FREE, 0, 0);
612
613 /*
614 * Pretend that this operation will free 1GB of space. This
615 * should be large enough to cancel out the largest write.
616 * We don't want to use something like UINT64_MAX, because that would
617 * cause overflows when doing math with these values (e.g. in
618 * dmu_tx_try_assign()).
619 */
620 txh->txh_space_tofree = txh->txh_space_tounref = 1024 * 1024 * 1024;
621 }
622
623 void
624 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
625 {
626 dmu_tx_hold_t *txh;
627 dnode_t *dn;
628 int err;
629 zio_t *zio;
630
631 ASSERT(tx->tx_txg == 0);
632
633 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
634 object, THT_FREE, off, len);
635 if (txh == NULL)
636 return;
637 dn = txh->txh_dnode;
638 dmu_tx_count_dnode(txh);
639
640 if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
641 return;
642 if (len == DMU_OBJECT_END)
643 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
644
645 /*
646 * For i/o error checking, we read the first and last level-0
647 * blocks if they are not aligned, and all the level-1 blocks.
648 *
649 * Note: dbuf_free_range() assumes that we have not instantiated
650 * any level-0 dbufs that will be completely freed. Therefore we must
651 * exercise care to not read or count the first and last blocks
652 * if they are blocksize-aligned.
653 */
654 if (dn->dn_datablkshift == 0) {
655 if (off != 0 || len < dn->dn_datablksz)
656 dmu_tx_count_write(txh, 0, dn->dn_datablksz);
657 } else {
658 /* first block will be modified if it is not aligned */
659 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
660 dmu_tx_count_write(txh, off, 1);
661 /* last block will be modified if it is not aligned */
662 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
663 dmu_tx_count_write(txh, off+len, 1);
664 }
665
666 /*
667 * Check level-1 blocks.
668 */
669 if (dn->dn_nlevels > 1) {
670 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
671 SPA_BLKPTRSHIFT;
672 uint64_t start = off >> shift;
673 uint64_t end = (off + len) >> shift;
674
675 ASSERT(dn->dn_indblkshift != 0);
676
677 /*
678 * dnode_reallocate() can result in an object with indirect
679 * blocks having an odd data block size. In this case,
680 * just check the single block.
681 */
682 if (dn->dn_datablkshift == 0)
683 start = end = 0;
684
685 zio = zio_root(tx->tx_pool->dp_spa,
686 NULL, NULL, ZIO_FLAG_CANFAIL);
687 for (uint64_t i = start; i <= end; i++) {
688 uint64_t ibyte = i << shift;
689 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
690 i = ibyte >> shift;
691 if (err == ESRCH || i > end)
692 break;
693 if (err) {
694 tx->tx_err = err;
695 return;
696 }
697
698 err = dmu_tx_check_ioerr(zio, dn, 1, i);
699 if (err) {
700 tx->tx_err = err;
701 return;
702 }
703 }
704 err = zio_wait(zio);
705 if (err) {
706 tx->tx_err = err;
707 return;
708 }
709 }
710
711 dmu_tx_count_free(txh, off, len);
712 }
713
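/*
 * Hold for adding (add != 0) or removing the named entry in a ZAP object.
 * "name" may be NULL when the specific entry is not known in advance.
 */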
714 void
715 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
716 {
717 dmu_tx_hold_t *txh;
718 dnode_t *dn;
719 dsl_dataset_phys_t *ds_phys;
720 uint64_t nblocks;
721 int epbs, err;
722
723 ASSERT(tx->tx_txg == 0);
724
725 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
726 object, THT_ZAP, add, (uintptr_t)name);
727 if (txh == NULL)
728 return;
729 dn = txh->txh_dnode;
730
731 dmu_tx_count_dnode(txh);
732
733 if (dn == NULL) {
734 /*
735 * We will be able to fit a new object's entries into one leaf
736 * block. So there will be at most 2 blocks total,
737 * including the header block.
738 */
739 dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
740 return;
741 }
742
743 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
744
745 if (dn->dn_maxblkid == 0 && !add) {
746 blkptr_t *bp;
747
748 /*
749 * If there is only one block (i.e. this is a micro-zap)
750 * and we are not adding anything, the accounting is simple.
751 */
752 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
753 if (err) {
754 tx->tx_err = err;
755 return;
756 }
757
758 /*
759 * Use max block size here, since we don't know how much
760 * the size will change between now and the dbuf dirty call.
761 */
762 bp = &dn->dn_phys->dn_blkptr[0];
763 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
764 bp, bp->blk_birth))
765 txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
766 else
767 txh->txh_space_towrite += MZAP_MAX_BLKSZ;
768 if (!BP_IS_HOLE(bp))
769 txh->txh_space_tounref += MZAP_MAX_BLKSZ;
770 return;
771 }
772
773 if (dn->dn_maxblkid > 0 && name) {
774 /*
775 * access the name in this fat-zap so that we'll check
776 * for i/o errors to the leaf blocks, etc.
777 */
778 err = zap_lookup(dn->dn_objset, dn->dn_object, name,
779 8, 0, NULL);
780 if (err == EIO) {
781 tx->tx_err = err;
782 return;
783 }
784 }
785
786 err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
787 &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
788
789 /*
790 * If the modified blocks are scattered to the four winds,
791 * we'll have to modify an indirect twig for each.
792 */
793 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
794 ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
795 for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
796 if (ds_phys->ds_prev_snap_obj)
797 txh->txh_space_towrite += 3 << dn->dn_indblkshift;
798 else
799 txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
800 }
801
802 void
803 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
804 {
805 dmu_tx_hold_t *txh;
806
807 ASSERT(tx->tx_txg == 0);
808
809 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
810 object, THT_BONUS, 0, 0);
811 if (txh)
812 dmu_tx_count_dnode(txh);
813 }
814
815 void
816 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
817 {
818 dmu_tx_hold_t *txh;
819 ASSERT(tx->tx_txg == 0);
820
821 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
822 DMU_NEW_OBJECT, THT_SPACE, space, 0);
823
824 txh->txh_space_towrite += space;
825 }
826
827 int
828 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
829 {
830 dmu_tx_hold_t *txh;
831 int holds = 0;
832
833 /*
834 * By asserting that the tx is assigned, we're counting the
835 * number of dn_tx_holds, which is the same as the number of
836 * dn_holds. Otherwise, we'd be counting dn_holds, but
837 * dn_tx_holds could be 0.
838 */
839 ASSERT(tx->tx_txg != 0);
840
841 /* if (tx->tx_anyobj == TRUE) */
842 /* return (0); */
843
844 for (txh = list_head(&tx->tx_holds); txh;
845 txh = list_next(&tx->tx_holds, txh)) {
846 if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
847 holds++;
848 }
849
850 return (holds);
851 }
852
853 #ifdef ZFS_DEBUG
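/*
 * Debug check: verify that the dbuf being dirtied is covered by one of this
 * tx's holds (matching both the object and the offset range); panic if not.
 */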
854 void
855 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
856 {
857 dmu_tx_hold_t *txh;
858 int match_object = FALSE, match_offset = FALSE;
859 dnode_t *dn;
860
861 DB_DNODE_ENTER(db);
862 dn = DB_DNODE(db);
863 ASSERT(tx->tx_txg != 0);
864 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
865 ASSERT3U(dn->dn_object, ==, db->db.db_object);
866
867 if (tx->tx_anyobj) {
868 DB_DNODE_EXIT(db);
869 return;
870 }
871
872 /* XXX No checking on the meta dnode for now */
873 if (db->db.db_object == DMU_META_DNODE_OBJECT) {
874 DB_DNODE_EXIT(db);
875 return;
876 }
877
878 for (txh = list_head(&tx->tx_holds); txh;
879 txh = list_next(&tx->tx_holds, txh)) {
880 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
881 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
882 match_object = TRUE;
883 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
884 int datablkshift = dn->dn_datablkshift ?
885 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
886 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
887 int shift = datablkshift + epbs * db->db_level;
888 uint64_t beginblk = shift >= 64 ? 0 :
889 (txh->txh_arg1 >> shift);
890 uint64_t endblk = shift >= 64 ? 0 :
891 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
892 uint64_t blkid = db->db_blkid;
893
894 /* XXX txh_arg2 better not be zero... */
895
896 dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
897 txh->txh_type, beginblk, endblk);
898
899 switch (txh->txh_type) {
900 case THT_WRITE:
901 if (blkid >= beginblk && blkid <= endblk)
902 match_offset = TRUE;
903 /*
904 * We will let this hold work for the bonus
905 * or spill buffer so that we don't need to
906 * hold it when creating a new object.
907 */
908 if (blkid == DMU_BONUS_BLKID ||
909 blkid == DMU_SPILL_BLKID)
910 match_offset = TRUE;
911 /*
912 * They might have to increase nlevels,
913 * thus dirtying the new TLIBs. Or they
914 * might have to change the block size,
915 * thus dirtying the new lvl=0 blk=0.
916 */
917 if (blkid == 0)
918 match_offset = TRUE;
919 break;
920 case THT_FREE:
921 /*
922 * We will dirty all the level 1 blocks in
923 * the free range and perhaps the first and
924 * last level 0 block.
925 */
926 if (blkid >= beginblk && (blkid <= endblk ||
927 txh->txh_arg2 == DMU_OBJECT_END))
928 match_offset = TRUE;
929 break;
930 case THT_SPILL:
931 if (blkid == DMU_SPILL_BLKID)
932 match_offset = TRUE;
933 break;
934 case THT_BONUS:
935 if (blkid == DMU_BONUS_BLKID)
936 match_offset = TRUE;
937 break;
938 case THT_ZAP:
939 match_offset = TRUE;
940 break;
941 case THT_NEWOBJECT:
942 match_object = TRUE;
943 break;
944 default:
945 ASSERT(!"bad txh_type");
946 }
947 }
948 if (match_object && match_offset) {
949 DB_DNODE_EXIT(db);
950 return;
951 }
952 }
953 DB_DNODE_EXIT(db);
954 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
955 (u_longlong_t)db->db.db_object, db->db_level,
956 (u_longlong_t)db->db_blkid);
957 }
958 #endif
959
960 /*
961 * If we can't do 10 iops, something is wrong. Let us go ahead
962 * and hit zfs_dirty_data_max.
963 */
964 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
965 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
966
967 /*
968 * We delay transactions when we've determined that the backend storage
969 * isn't able to accommodate the rate of incoming writes.
970 *
971 * If there is already a transaction waiting, we delay relative to when
972 * that transaction finishes waiting. This way the calculated min_time
973 * is independent of the number of threads concurrently executing
974 * transactions.
975 *
976 * If we are the only waiter, wait relative to when the transaction
977 * started, rather than the current time. This credits the transaction for
978 * "time already served", e.g. reading indirect blocks.
979 *
980 * The minimum time for a transaction to take is calculated as:
981 * min_time = scale * (dirty - min) / (max - dirty)
982 * min_time is then capped at zfs_delay_max_ns.
983 *
984 * The delay has two degrees of freedom that can be adjusted via tunables.
985 * The percentage of dirty data at which we start to delay is defined by
986 * zfs_delay_min_dirty_percent. This should typically be at or above
987 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
988 * delay after writing at full speed has failed to keep up with the incoming
989 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
990 * speaking, this variable determines the amount of delay at the midpoint of
991 * the curve.
992 *
993 * delay
994 * 10ms +-------------------------------------------------------------*+
995 * | *|
996 * 9ms + *+
997 * | *|
998 * 8ms + *+
999 * | * |
1000 * 7ms + * +
1001 * | * |
1002 * 6ms + * +
1003 * | * |
1004 * 5ms + * +
1005 * | * |
1006 * 4ms + * +
1007 * | * |
1008 * 3ms + * +
1009 * | * |
1010 * 2ms + (midpoint) * +
1011 * | | ** |
1012 * 1ms + v *** +
1013 * | zfs_delay_scale ----------> ******** |
1014 * 0 +-------------------------------------*********----------------+
1015 * 0% <- zfs_dirty_data_max -> 100%
1016 *
1017 * Note that since the delay is added to the outstanding time remaining on the
1018 * most recent transaction, the delay is effectively the inverse of IOPS.
1019 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
1020 * was chosen such that small changes in the amount of accumulated dirty data
1021 * in the first 3/4 of the curve yield relatively small differences in the
1022 * amount of delay.
1023 *
1024 * The effects can be easier to understand when the amount of delay is
1025 * represented on a log scale:
1026 *
1027 * delay
1028 * 100ms +-------------------------------------------------------------++
1029 * + +
1030 * | |
1031 * + *+
1032 * 10ms + *+
1033 * + ** +
1034 * | (midpoint) ** |
1035 * + | ** +
1036 * 1ms + v **** +
1037 * + zfs_delay_scale ----------> ***** +
1038 * | **** |
1039 * + **** +
1040 * 100us + ** +
1041 * + * +
1042 * | * |
1043 * + * +
1044 * 10us + * +
1045 * + +
1046 * | |
1047 * + +
1048 * +--------------------------------------------------------------+
1049 * 0% <- zfs_dirty_data_max -> 100%
1050 *
1051 * Note here that only as the amount of dirty data approaches its limit does
1052 * the delay start to increase rapidly. The goal of a properly tuned system
1053 * should be to keep the amount of dirty data out of that range by first
1054 * ensuring that the appropriate limits are set for the I/O scheduler to reach
1055 * optimal throughput on the backend storage, and then by changing the value
1056 * of zfs_delay_scale to increase the steepness of the curve.
1057 */
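/*
 * Worked example of min_time above (illustrative numbers only): with
 * zfs_dirty_data_max = 4GB and zfs_delay_min_dirty_percent = 60,
 * delay_min_bytes = 2.4GB.  At dirty = 3.2GB, the midpoint of the curve,
 * min_tx_time = zfs_delay_scale * (3.2G - 2.4G) / (4G - 3.2G)
 * = zfs_delay_scale, i.e. the 500us midpoint delay mentioned above.
 */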
1058 static void
1059 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1060 {
1061 dsl_pool_t *dp = tx->tx_pool;
1062 uint64_t delay_min_bytes =
1063 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1064 hrtime_t wakeup, min_tx_time, now;
1065
1066 if (dirty <= delay_min_bytes)
1067 return;
1068
1069 /*
1070 * The caller has already waited until we are under the max.
1071 * We make them pass us the amount of dirty data so we don't
1072 * have to handle the case of it being >= the max, which could
1073 * cause a divide-by-zero if it's == the max.
1074 */
1075 ASSERT3U(dirty, <, zfs_dirty_data_max);
1076
1077 now = gethrtime();
1078 min_tx_time = zfs_delay_scale *
1079 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
1080 if (now > tx->tx_start + min_tx_time)
1081 return;
1082
1083 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1084
1085 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1086 uint64_t, min_tx_time);
1087
1088 mutex_enter(&dp->dp_lock);
1089 wakeup = MAX(tx->tx_start + min_tx_time,
1090 dp->dp_last_wakeup + min_tx_time);
1091 dp->dp_last_wakeup = wakeup;
1092 mutex_exit(&dp->dp_lock);
1093
1094 #ifdef _KERNEL
1095 mutex_enter(&curthread->t_delay_lock);
1096 while (cv_timedwait_hires(&curthread->t_delay_cv,
1097 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
1098 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
1099 continue;
1100 mutex_exit(&curthread->t_delay_lock);
1101 #else
1102 hrtime_t delta = wakeup - gethrtime();
1103 struct timespec ts;
1104 ts.tv_sec = delta / NANOSEC;
1105 ts.tv_nsec = delta % NANOSEC;
1106 (void) nanosleep(&ts, NULL);
1107 #endif
1108 }
1109
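/*
 * Try to assign this tx to the currently open txg: take dn_tx_holds on all
 * held dnodes, total up the per-hold space estimates, and temp-reserve the
 * worst-case allocation in the dsl_dir.  Returns ERESTART when the caller
 * must back off (suspended pool, dirty-data delay, or a dnode still
 * assigned to the previous txg).
 */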
1110 static int
1111 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1112 {
1113 dmu_tx_hold_t *txh;
1114 spa_t *spa = tx->tx_pool->dp_spa;
1115 uint64_t memory, asize, fsize, usize;
1116 uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1117
1118 ASSERT0(tx->tx_txg);
1119
1120 if (tx->tx_err)
1121 return (tx->tx_err);
1122
1123 if (spa_suspended(spa)) {
1124 /*
1125 * If the user has indicated a blocking failure mode
1126 * then return ERESTART which will block in dmu_tx_wait().
1127 * Otherwise, return EIO so that an error can get
1128 * propagated back to the VOP calls.
1129 *
1130 * Note that we always honor the txg_how flag regardless
1131 * of the failuremode setting.
1132 */
1133 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1134 txg_how != TXG_WAIT)
1135 return (SET_ERROR(EIO));
1136
1137 return (SET_ERROR(ERESTART));
1138 }
1139
1140 if (!tx->tx_waited &&
1141 dsl_pool_need_dirty_delay(tx->tx_pool)) {
1142 tx->tx_wait_dirty = B_TRUE;
1143 return (SET_ERROR(ERESTART));
1144 }
1145
1146 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1147 tx->tx_needassign_txh = NULL;
1148
1149 /*
1150 * NB: No error returns are allowed after txg_hold_open, but
1151 * before processing the dnode holds, due to the
1152 * dmu_tx_unassign() logic.
1153 */
1154
1155 towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1156 for (txh = list_head(&tx->tx_holds); txh;
1157 txh = list_next(&tx->tx_holds, txh)) {
1158 dnode_t *dn = txh->txh_dnode;
1159 if (dn != NULL) {
1160 mutex_enter(&dn->dn_mtx);
1161 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1162 mutex_exit(&dn->dn_mtx);
1163 tx->tx_needassign_txh = txh;
1164 return (SET_ERROR(ERESTART));
1165 }
1166 if (dn->dn_assigned_txg == 0)
1167 dn->dn_assigned_txg = tx->tx_txg;
1168 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1169 (void) refcount_add(&dn->dn_tx_holds, tx);
1170 mutex_exit(&dn->dn_mtx);
1171 }
1172 towrite += txh->txh_space_towrite;
1173 tofree += txh->txh_space_tofree;
1174 tooverwrite += txh->txh_space_tooverwrite;
1175 tounref += txh->txh_space_tounref;
1176 tohold += txh->txh_memory_tohold;
1177 fudge += txh->txh_fudge;
1178 }
1179
1180 /*
1181 * If a snapshot has been taken since we made our estimates,
1182 * assume that we won't be able to free or overwrite anything.
1183 */
1184 if (tx->tx_objset &&
1185 dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1186 tx->tx_lastsnap_txg) {
1187 towrite += tooverwrite;
1188 tooverwrite = tofree = 0;
1189 }
1190
1191 /* needed allocation: worst-case estimate of write space */
1192 asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1193 /* freed space estimate: worst-case overwrite + free estimate */
1194 fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1195 /* convert unrefd space to worst-case estimate */
1196 usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1197 /* calculate memory footprint estimate */
1198 memory = towrite + tooverwrite + tohold;
1199
1200 #ifdef ZFS_DEBUG
1201 /*
1202 * Add in 'tohold' to account for our dirty holds on this memory
1203 * XXX - the "fudge" factor is to account for skipped blocks that
1204 * we missed because dnode_next_offset() misses in-core-only blocks.
1205 */
1206 tx->tx_space_towrite = asize +
1207 spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1208 tx->tx_space_tofree = tofree;
1209 tx->tx_space_tooverwrite = tooverwrite;
1210 tx->tx_space_tounref = tounref;
1211 #endif
1212
1213 if (tx->tx_dir && asize != 0) {
1214 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1215 asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1216 if (err)
1217 return (err);
1218 }
1219
1220 return (0);
1221 }
1222
1223 static void
1224 dmu_tx_unassign(dmu_tx_t *tx)
1225 {
1226 dmu_tx_hold_t *txh;
1227
1228 if (tx->tx_txg == 0)
1229 return;
1230
1231 txg_rele_to_quiesce(&tx->tx_txgh);
1232
1233 /*
1234 * Walk the transaction's hold list, removing the hold on the
1235 * associated dnode, and notifying waiters if the refcount drops to 0.
1236 */
1237 for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1238 txh = list_next(&tx->tx_holds, txh)) {
1239 dnode_t *dn = txh->txh_dnode;
1240
1241 if (dn == NULL)
1242 continue;
1243 mutex_enter(&dn->dn_mtx);
1244 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1245
1246 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1247 dn->dn_assigned_txg = 0;
1248 cv_broadcast(&dn->dn_notxholds);
1249 }
1250 mutex_exit(&dn->dn_mtx);
1251 }
1252
1253 txg_rele_to_sync(&tx->tx_txgh);
1254
1255 tx->tx_lasttried_txg = tx->tx_txg;
1256 tx->tx_txg = 0;
1257 }
1258
1259 /*
1260 * Assign tx to a transaction group. txg_how can be one of:
1261 *
1262 * (1) TXG_WAIT. If the current open txg is full, waits until there's
1263 * a new one. This should be used when you're not holding locks.
1264 * It will only fail if we're truly out of space (or over quota).
1265 *
1266 * (2) TXG_NOWAIT. If we can't assign into the current open txg without
1267 * blocking, returns immediately with ERESTART. This should be used
1268 * whenever you're holding locks. On an ERESTART error, the caller
1269 * should drop locks, do a dmu_tx_wait(tx), and try again.
1270 *
1271 * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1272 * has already been called on behalf of this operation (though
1273 * most likely on a different tx).
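 *
 * Illustrative caller pattern (a sketch; "os", the specific hold call and
 * the "top:" retry label are placeholders, not code from this file):
 *
 *	top:
 *		tx = dmu_tx_create(os);
 *		dmu_tx_hold_write(tx, object, off, len);
 *		err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *		if (err != 0) {
 *			if (err == ERESTART) {
 *				waited = B_TRUE;
 *				dmu_tx_wait(tx);
 *				dmu_tx_abort(tx);
 *				goto top;
 *			}
 *			dmu_tx_abort(tx);
 *			return (err);
 *		}
 *		... modify data under tx ...
 *		dmu_tx_commit(tx);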
1274 */
1275 int
1276 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1277 {
1278 int err;
1279
1280 ASSERT(tx->tx_txg == 0);
1281 ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1282 txg_how == TXG_WAITED);
1283 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1284
1285 /* If we might wait, we must not hold the config lock. */
1286 ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1287
1288 if (txg_how == TXG_WAITED)
1289 tx->tx_waited = B_TRUE;
1290
1291 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1292 dmu_tx_unassign(tx);
1293
1294 if (err != ERESTART || txg_how != TXG_WAIT)
1295 return (err);
1296
1297 dmu_tx_wait(tx);
1298 }
1299
1300 txg_rele_to_quiesce(&tx->tx_txgh);
1301
1302 return (0);
1303 }
1304
1305 void
1306 dmu_tx_wait(dmu_tx_t *tx)
1307 {
1308 spa_t *spa = tx->tx_pool->dp_spa;
1309 dsl_pool_t *dp = tx->tx_pool;
1310
1311 ASSERT(tx->tx_txg == 0);
1312 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1313
1314 if (tx->tx_wait_dirty) {
1315 /*
1316 * dmu_tx_try_assign() has determined that we need to wait
1317 * because we've consumed much or all of the dirty buffer
1318 * space.
1319 */
1320 mutex_enter(&dp->dp_lock);
1321 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1322 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1323 uint64_t dirty = dp->dp_dirty_total;
1324 mutex_exit(&dp->dp_lock);
1325
1326 dmu_tx_delay(tx, dirty);
1327
1328 tx->tx_wait_dirty = B_FALSE;
1329
1330 /*
1331 * Note: setting tx_waited only has effect if the caller
1332 * used TXG_WAIT. Otherwise they are going to destroy
1333 * this tx and try again. The common case, zfs_write(),
1334 * uses TXG_WAIT.
1335 */
1336 tx->tx_waited = B_TRUE;
1337 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1338 /*
1339 * If the pool is suspended we need to wait until it
1340 * is resumed. Note that it's possible that the pool
1341 * has become active after this thread has tried to
1342 * obtain a tx. If that's the case then tx_lasttried_txg
1343 * would not have been set.
1344 */
1345 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1346 } else if (tx->tx_needassign_txh) {
1347 /*
1348 * A dnode is assigned to the quiescing txg. Wait for its
1349 * transaction to complete.
1350 */
1351 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1352
1353 mutex_enter(&dn->dn_mtx);
1354 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1355 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1356 mutex_exit(&dn->dn_mtx);
1357 tx->tx_needassign_txh = NULL;
1358 } else {
1359 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1360 }
1361 }
1362
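/*
 * Debug-only accounting hook: record space the caller is about to consume
 * (positive delta) or free (negative delta) against this tx's estimates.
 */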
1363 void
1364 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1365 {
1366 #ifdef ZFS_DEBUG
1367 if (tx->tx_dir == NULL || delta == 0)
1368 return;
1369
1370 if (delta > 0) {
1371 ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1372 tx->tx_space_towrite);
1373 (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1374 } else {
1375 (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1376 }
1377 #endif
1378 }
1379
1380 void
1381 dmu_tx_commit(dmu_tx_t *tx)
1382 {
1383 dmu_tx_hold_t *txh;
1384
1385 ASSERT(tx->tx_txg != 0);
1386
1387 /*
1388 * Go through the transaction's hold list and remove holds on
1389 * associated dnodes, notifying waiters if no holds remain.
1390 */
1391 while (txh = list_head(&tx->tx_holds)) {
1392 dnode_t *dn = txh->txh_dnode;
1393
1394 list_remove(&tx->tx_holds, txh);
1395 kmem_free(txh, sizeof (dmu_tx_hold_t));
1396 if (dn == NULL)
1397 continue;
1398 mutex_enter(&dn->dn_mtx);
1399 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1400
1401 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1402 dn->dn_assigned_txg = 0;
1403 cv_broadcast(&dn->dn_notxholds);
1404 }
1405 mutex_exit(&dn->dn_mtx);
1406 dnode_rele(dn, tx);
1407 }
1408
1409 if (tx->tx_tempreserve_cookie)
1410 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1411
1412 if (!list_is_empty(&tx->tx_callbacks))
1413 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1414
1415 if (tx->tx_anyobj == FALSE)
1416 txg_rele_to_sync(&tx->tx_txgh);
1417
1418 list_destroy(&tx->tx_callbacks);
1419 list_destroy(&tx->tx_holds);
1420 #ifdef ZFS_DEBUG
1421 dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1422 tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1423 tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1424 refcount_destroy_many(&tx->tx_space_written,
1425 refcount_count(&tx->tx_space_written));
1426 refcount_destroy_many(&tx->tx_space_freed,
1427 refcount_count(&tx->tx_space_freed));
1428 #endif
1429 kmem_free(tx, sizeof (dmu_tx_t));
1430 }
1431
1432 void
1433 dmu_tx_abort(dmu_tx_t *tx)
1434 {
1435 dmu_tx_hold_t *txh;
1436
1437 ASSERT(tx->tx_txg == 0);
1438
1439 while (txh = list_head(&tx->tx_holds)) {
1440 dnode_t *dn = txh->txh_dnode;
1441
1442 list_remove(&tx->tx_holds, txh);
1443 kmem_free(txh, sizeof (dmu_tx_hold_t));
1444 if (dn != NULL)
1445 dnode_rele(dn, tx);
1446 }
1447
1448 /*
1449 * Call any registered callbacks with an error code.
1450 */
1451 if (!list_is_empty(&tx->tx_callbacks))
1452 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1453
1454 list_destroy(&tx->tx_callbacks);
1455 list_destroy(&tx->tx_holds);
1456 #ifdef ZFS_DEBUG
1457 refcount_destroy_many(&tx->tx_space_written,
1458 refcount_count(&tx->tx_space_written));
1459 refcount_destroy_many(&tx->tx_space_freed,
1460 refcount_count(&tx->tx_space_freed));
1461 #endif
1462 kmem_free(tx, sizeof (dmu_tx_t));
1463 }
1464
1465 uint64_t
1466 dmu_tx_get_txg(dmu_tx_t *tx)
1467 {
1468 ASSERT(tx->tx_txg != 0);
1469 return (tx->tx_txg);
1470 }
1471
1472 dsl_pool_t *
1473 dmu_tx_pool(dmu_tx_t *tx)
1474 {
1475 ASSERT(tx->tx_pool != NULL);
1476 return (tx->tx_pool);
1477 }
1478
1479
1480 void
1481 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1482 {
1483 dmu_tx_callback_t *dcb;
1484
1485 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1486
1487 dcb->dcb_func = func;
1488 dcb->dcb_data = data;
1489
1490 list_insert_tail(&tx->tx_callbacks, dcb);
1491 }
1492
1493 /*
1494 * Call all the commit callbacks on a list, with a given error code.
1495 */
1496 void
1497 dmu_tx_do_callbacks(list_t *cb_list, int error)
1498 {
1499 dmu_tx_callback_t *dcb;
1500
1501 while (dcb = list_head(cb_list)) {
1502 list_remove(cb_list, dcb);
1503 dcb->dcb_func(dcb->dcb_data, error);
1504 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1505 }
1506 }
1507
1508 /*
1509 * Interface to hold a bunch of attributes;
1510 * used when creating new files.
1511 * attrsize is the total size of all attributes
1512 * to be added during object creation.
1513 *
1514 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1515 */
1516
1517 /*
1518 * Hold the necessary attribute name for attribute registration.
1519 * It should be a very rare case where this is needed. If it does
1520 * happen it would only happen on the first write to the file system.
1521 */
1522 static void
1523 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1524 {
1525 int i;
1526
1527 if (!sa->sa_need_attr_registration)
1528 return;
1529
1530 for (i = 0; i != sa->sa_num_attrs; i++) {
1531 if (!sa->sa_attr_table[i].sa_registered) {
1532 if (sa->sa_reg_attr_obj)
1533 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1534 B_TRUE, sa->sa_attr_table[i].sa_name);
1535 else
1536 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1537 B_TRUE, sa->sa_attr_table[i].sa_name);
1538 }
1539 }
1540 }
1541
1542
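/*
 * Hold for creating or modifying this object's spill block.  Charge a full
 * SPA_OLD_MAXBLOCKSIZE block to tooverwrite when an existing, freeable spill
 * block is present, and to towrite otherwise.
 */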
1543 void
1544 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1545 {
1546 dnode_t *dn;
1547 dmu_tx_hold_t *txh;
1548
1549 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1550 THT_SPILL, 0, 0);
1551
1552 dn = txh->txh_dnode;
1553
1554 if (dn == NULL)
1555 return;
1556
1557 /* If blkptr doesn't exist then add space to towrite */
1558 if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1559 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1560 } else {
1561 blkptr_t *bp;
1562
1563 bp = &dn->dn_phys->dn_spill;
1564 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1565 bp, bp->blk_birth))
1566 txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
1567 else
1568 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1569 if (!BP_IS_HOLE(bp))
1570 txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
1571 }
1572 }
1573
1574 void
1575 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1576 {
1577 sa_os_t *sa = tx->tx_objset->os_sa;
1578
1579 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1580
1581 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1582 return;
1583
1584 if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1585 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1586 else {
1587 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1588 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1589 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1590 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1591 }
1592
1593 dmu_tx_sa_registration_hold(sa, tx);
1594
1595 if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1596 return;
1597
1598 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1599 THT_SPILL, 0, 0);
1600 }
1601
1602 /*
1603 * Hold SA attribute
1604 *
1605 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1606 *
1607 * variable_size is the total size of all variable sized attributes
1608 * passed to this function. It is not the total size of all
1609 * variable size attributes that *may* exist on this object.
1610 */
1611 void
1612 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1613 {
1614 uint64_t object;
1615 sa_os_t *sa = tx->tx_objset->os_sa;
1616
1617 ASSERT(hdl != NULL);
1618
1619 object = sa_handle_object(hdl);
1620
1621 dmu_tx_hold_bonus(tx, object);
1622
1623 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1624 return;
1625
1626 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1627 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1628 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1629 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1630 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1631 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1632 }
1633
1634 dmu_tx_sa_registration_hold(sa, tx);
1635
1636 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1637 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1638
1639 if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1640 ASSERT(tx->tx_txg == 0);
1641 dmu_tx_hold_spill(tx, object);
1642 } else {
1643 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1644 dnode_t *dn;
1645
1646 DB_DNODE_ENTER(db);
1647 dn = DB_DNODE(db);
1648 if (dn->dn_have_spill) {
1649 ASSERT(tx->tx_txg == 0);
1650 dmu_tx_hold_spill(tx, object);
1651 }
1652 DB_DNODE_EXIT(db);
1653 }
1654 }
1655