xref: /freebsd/sys/contrib/openzfs/module/zfs/dmu_object.c (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
24  * Copyright 2014 HybridCluster. All rights reserved.
25  */
26 
27 #include <sys/dbuf.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dmu_objset.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dnode.h>
33 #include <sys/zap.h>
34 #include <sys/zfeature.h>
35 #include <sys/dsl_dataset.h>
36 
37 /*
38  * Each of the concurrent object allocators will grab
39  * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
40  * grab 128 slots, which is 4 blocks worth.  This was experimentally
41  * determined to be the lowest value that eliminates the measurable effect
42  * of lock contention from this code path.
43  */
44 int dmu_object_alloc_chunk_shift = 7;
45 
46 static uint64_t
47 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
48     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
49     int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
50 {
51 	uint64_t object;
52 	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
53 	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
54 	dnode_t *dn = NULL;
55 	int dn_slots = dnodesize >> DNODE_SHIFT;
56 	boolean_t restarted = B_FALSE;
57 	uint64_t *cpuobj = NULL;
58 	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
59 	int error;
60 
61 	kpreempt_disable();
62 	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
63 	    os->os_obj_next_percpu_len];
64 	kpreempt_enable();
65 
66 	if (dn_slots == 0) {
67 		dn_slots = DNODE_MIN_SLOTS;
68 	} else {
69 		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
70 		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
71 	}
72 
73 	/*
74 	 * The "chunk" of dnodes that is assigned to a CPU-specific
75 	 * allocator needs to be at least one block's worth, to avoid
76 	 * lock contention on the dbuf.  It can be at most one L1 block's
77 	 * worth, so that the "rescan after polishing off a L1's worth"
78 	 * logic below will be sure to kick in.
79 	 */
80 	if (dnodes_per_chunk < DNODES_PER_BLOCK)
81 		dnodes_per_chunk = DNODES_PER_BLOCK;
82 	if (dnodes_per_chunk > L1_dnode_count)
83 		dnodes_per_chunk = L1_dnode_count;
84 
85 	/*
86 	 * The caller requested the dnode be returned as a performance
87 	 * optimization in order to avoid releasing the hold only to
88 	 * immediately reacquire it.  Since they caller is responsible
89 	 * for releasing the hold they must provide the tag.
90 	 */
91 	if (allocated_dnode != NULL) {
92 		ASSERT3P(tag, !=, NULL);
93 	} else {
94 		ASSERT3P(tag, ==, NULL);
95 		tag = FTAG;
96 	}
97 
98 	object = *cpuobj;
99 	for (;;) {
100 		/*
101 		 * If we finished a chunk of dnodes, get a new one from
102 		 * the global allocator.
103 		 */
104 		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
105 		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
106 		    dn_slots)) {
107 			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
108 			mutex_enter(&os->os_obj_lock);
109 			ASSERT0(P2PHASE(os->os_obj_next_chunk,
110 			    dnodes_per_chunk));
111 			object = os->os_obj_next_chunk;
112 
113 			/*
114 			 * Each time we polish off a L1 bp worth of dnodes
115 			 * (2^12 objects), move to another L1 bp that's
116 			 * still reasonably sparse (at most 1/4 full). Look
117 			 * from the beginning at most once per txg. If we
118 			 * still can't allocate from that L1 block, search
119 			 * for an empty L0 block, which will quickly skip
120 			 * to the end of the metadnode if no nearby L0
121 			 * blocks are empty. This fallback avoids a
122 			 * pathology where full dnode blocks containing
123 			 * large dnodes appear sparse because they have a
124 			 * low blk_fill, leading to many failed allocation
125 			 * attempts. In the long term a better mechanism to
126 			 * search for sparse metadnode regions, such as
127 			 * spacemaps, could be implemented.
128 			 *
129 			 * os_scan_dnodes is set during txg sync if enough
130 			 * objects have been freed since the previous
131 			 * rescan to justify backfilling again.
132 			 *
133 			 * Note that dmu_traverse depends on the behavior
134 			 * that we use multiple blocks of the dnode object
135 			 * before going back to reuse objects.  Any change
136 			 * to this algorithm should preserve that property
137 			 * or find another solution to the issues described
138 			 * in traverse_visitbp.
139 			 */
140 			if (P2PHASE(object, L1_dnode_count) == 0) {
141 				uint64_t offset;
142 				uint64_t blkfill;
143 				int minlvl;
144 				if (os->os_rescan_dnodes) {
145 					offset = 0;
146 					os->os_rescan_dnodes = B_FALSE;
147 				} else {
148 					offset = object << DNODE_SHIFT;
149 				}
150 				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
151 				minlvl = restarted ? 1 : 2;
152 				restarted = B_TRUE;
153 				error = dnode_next_offset(DMU_META_DNODE(os),
154 				    DNODE_FIND_HOLE, &offset, minlvl,
155 				    blkfill, 0);
156 				if (error == 0) {
157 					object = offset >> DNODE_SHIFT;
158 				}
159 			}
160 			/*
161 			 * Note: if "restarted", we may find a L0 that
162 			 * is not suitably aligned.
163 			 */
164 			os->os_obj_next_chunk =
165 			    P2ALIGN(object, dnodes_per_chunk) +
166 			    dnodes_per_chunk;
167 			(void) atomic_swap_64(cpuobj, object);
168 			mutex_exit(&os->os_obj_lock);
169 		}
170 
171 		/*
172 		 * The value of (*cpuobj) before adding dn_slots is the object
173 		 * ID assigned to us.  The value afterwards is the object ID
174 		 * assigned to whoever wants to do an allocation next.
175 		 */
176 		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
177 
178 		/*
179 		 * XXX We should check for an i/o error here and return
180 		 * up to our caller.  Actually we should pre-read it in
181 		 * dmu_tx_assign(), but there is currently no mechanism
182 		 * to do so.
183 		 */
184 		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
185 		    dn_slots, tag, &dn);
186 		if (error == 0) {
187 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
188 			/*
189 			 * Another thread could have allocated it; check
190 			 * again now that we have the struct lock.
191 			 */
192 			if (dn->dn_type == DMU_OT_NONE) {
193 				dnode_allocate(dn, ot, blocksize,
194 				    indirect_blockshift, bonustype,
195 				    bonuslen, dn_slots, tx);
196 				rw_exit(&dn->dn_struct_rwlock);
197 				dmu_tx_add_new_object(tx, dn);
198 
199 				/*
200 				 * Caller requested the allocated dnode be
201 				 * returned and is responsible for the hold.
202 				 */
203 				if (allocated_dnode != NULL)
204 					*allocated_dnode = dn;
205 				else
206 					dnode_rele(dn, tag);
207 
208 				return (object);
209 			}
210 			rw_exit(&dn->dn_struct_rwlock);
211 			dnode_rele(dn, tag);
212 			DNODE_STAT_BUMP(dnode_alloc_race);
213 		}
214 
215 		/*
216 		 * Skip to next known valid starting point on error.  This
217 		 * is the start of the next block of dnodes.
218 		 */
219 		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
220 			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
221 			DNODE_STAT_BUMP(dnode_alloc_next_block);
222 		}
223 		(void) atomic_swap_64(cpuobj, object);
224 	}
225 }
226 
227 uint64_t
228 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
229     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
230 {
231 	return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
232 	    bonuslen, 0, NULL, NULL, tx);
233 }
234 
235 uint64_t
236 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
237     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
238     dmu_tx_t *tx)
239 {
240 	return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
241 	    bonustype, bonuslen, 0, NULL, NULL, tx);
242 }
243 
244 uint64_t
245 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
246     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
247 {
248 	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
249 	    bonuslen, dnodesize, NULL, NULL, tx));
250 }
251 
252 /*
253  * Allocate a new object and return a pointer to the newly allocated dnode
254  * via the allocated_dnode argument.  The returned dnode will be held and
255  * the caller is responsible for releasing the hold by calling dnode_rele().
256  */
257 uint64_t
258 dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
259     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
260     int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
261 {
262 	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
263 	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
264 }
265 
266 int
267 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
268     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
269 {
270 	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
271 	    bonuslen, 0, tx));
272 }
273 
274 int
275 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
276     int blocksize, dmu_object_type_t bonustype, int bonuslen,
277     int dnodesize, dmu_tx_t *tx)
278 {
279 	dnode_t *dn;
280 	int dn_slots = dnodesize >> DNODE_SHIFT;
281 	int err;
282 
283 	if (dn_slots == 0)
284 		dn_slots = DNODE_MIN_SLOTS;
285 	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
286 	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
287 
288 	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
289 		return (SET_ERROR(EBADF));
290 
291 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
292 	    FTAG, &dn);
293 	if (err)
294 		return (err);
295 
296 	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
297 	dmu_tx_add_new_object(tx, dn);
298 
299 	dnode_rele(dn, FTAG);
300 
301 	return (0);
302 }
303 
304 int
305 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
306     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
307 {
308 	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
309 	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
310 }
311 
312 int
313 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
314     int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
315     boolean_t keep_spill, dmu_tx_t *tx)
316 {
317 	dnode_t *dn;
318 	int dn_slots = dnodesize >> DNODE_SHIFT;
319 	int err;
320 
321 	if (dn_slots == 0)
322 		dn_slots = DNODE_MIN_SLOTS;
323 
324 	if (object == DMU_META_DNODE_OBJECT)
325 		return (SET_ERROR(EBADF));
326 
327 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
328 	    FTAG, &dn);
329 	if (err)
330 		return (err);
331 
332 	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
333 	    keep_spill, tx);
334 
335 	dnode_rele(dn, FTAG);
336 	return (err);
337 }
338 
339 int
340 dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
341 {
342 	dnode_t *dn;
343 	int err;
344 
345 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
346 	    FTAG, &dn);
347 	if (err)
348 		return (err);
349 
350 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
351 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
352 		dbuf_rm_spill(dn, tx);
353 		dnode_rm_spill(dn, tx);
354 	}
355 	rw_exit(&dn->dn_struct_rwlock);
356 
357 	dnode_rele(dn, FTAG);
358 	return (err);
359 }
360 
361 int
362 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
363 {
364 	dnode_t *dn;
365 	int err;
366 
367 	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
368 
369 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
370 	    FTAG, &dn);
371 	if (err)
372 		return (err);
373 
374 	ASSERT(dn->dn_type != DMU_OT_NONE);
375 	/*
376 	 * If we don't create this free range, we'll leak indirect blocks when
377 	 * we get to freeing the dnode in syncing context.
378 	 */
379 	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
380 	dnode_free(dn, tx);
381 	dnode_rele(dn, FTAG);
382 
383 	return (0);
384 }
385 
386 /*
387  * Return (in *objectp) the next object which is allocated (or a hole)
388  * after *object, taking into account only objects that may have been modified
389  * after the specified txg.
390  */
391 int
392 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
393 {
394 	uint64_t offset;
395 	uint64_t start_obj;
396 	struct dsl_dataset *ds = os->os_dsl_dataset;
397 	int error;
398 
399 	if (*objectp == 0) {
400 		start_obj = 1;
401 	} else if (ds && dsl_dataset_feature_is_active(ds,
402 	    SPA_FEATURE_LARGE_DNODE)) {
403 		uint64_t i = *objectp + 1;
404 		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
405 		dmu_object_info_t doi;
406 
407 		/*
408 		 * Scan through the remaining meta dnode block.  The contents
409 		 * of each slot in the block are known so it can be quickly
410 		 * checked.  If the block is exhausted without a match then
411 		 * hand off to dnode_next_offset() for further scanning.
412 		 */
413 		while (i <= last_obj) {
414 			error = dmu_object_info(os, i, &doi);
415 			if (error == ENOENT) {
416 				if (hole) {
417 					*objectp = i;
418 					return (0);
419 				} else {
420 					i++;
421 				}
422 			} else if (error == EEXIST) {
423 				i++;
424 			} else if (error == 0) {
425 				if (hole) {
426 					i += doi.doi_dnodesize >> DNODE_SHIFT;
427 				} else {
428 					*objectp = i;
429 					return (0);
430 				}
431 			} else {
432 				return (error);
433 			}
434 		}
435 
436 		start_obj = i;
437 	} else {
438 		start_obj = *objectp + 1;
439 	}
440 
441 	offset = start_obj << DNODE_SHIFT;
442 
443 	error = dnode_next_offset(DMU_META_DNODE(os),
444 	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
445 
446 	*objectp = offset >> DNODE_SHIFT;
447 
448 	return (error);
449 }
450 
451 /*
452  * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
453  * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
454  *
455  * Only for use from syncing context, on MOS objects.
456  */
457 void
458 dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
459     dmu_tx_t *tx)
460 {
461 	dnode_t *dn;
462 
463 	ASSERT(dmu_tx_is_syncing(tx));
464 
465 	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
466 	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
467 		dnode_rele(dn, FTAG);
468 		return;
469 	}
470 	ASSERT3U(dn->dn_type, ==, old_type);
471 	ASSERT0(dn->dn_maxblkid);
472 
473 	/*
474 	 * We must initialize the ZAP data before changing the type,
475 	 * so that concurrent calls to *_is_zapified() can determine if
476 	 * the object has been completely zapified by checking the type.
477 	 */
478 	mzap_create_impl(dn, 0, 0, tx);
479 
480 	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
481 	    DMU_OTN_ZAP_METADATA;
482 	dnode_setdirty(dn, tx);
483 	dnode_rele(dn, FTAG);
484 
485 	spa_feature_incr(dmu_objset_spa(mos),
486 	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
487 }
488 
489 void
490 dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
491 {
492 	dnode_t *dn;
493 	dmu_object_type_t t;
494 
495 	ASSERT(dmu_tx_is_syncing(tx));
496 
497 	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
498 	t = dn->dn_type;
499 	dnode_rele(dn, FTAG);
500 
501 	if (t == DMU_OTN_ZAP_METADATA) {
502 		spa_feature_decr(dmu_objset_spa(mos),
503 		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
504 	}
505 	VERIFY0(dmu_object_free(mos, object, tx));
506 }
507 
508 EXPORT_SYMBOL(dmu_object_alloc);
509 EXPORT_SYMBOL(dmu_object_alloc_ibs);
510 EXPORT_SYMBOL(dmu_object_alloc_dnsize);
511 EXPORT_SYMBOL(dmu_object_alloc_hold);
512 EXPORT_SYMBOL(dmu_object_claim);
513 EXPORT_SYMBOL(dmu_object_claim_dnsize);
514 EXPORT_SYMBOL(dmu_object_reclaim);
515 EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
516 EXPORT_SYMBOL(dmu_object_rm_spill);
517 EXPORT_SYMBOL(dmu_object_free);
518 EXPORT_SYMBOL(dmu_object_next);
519 EXPORT_SYMBOL(dmu_object_zapify);
520 EXPORT_SYMBOL(dmu_object_free_zapified);
521 
522 /* BEGIN CSTYLED */
523 ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
524 	"CPU-specific allocator grabs 2^N objects at once");
525 /* END CSTYLED */
526