/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators grabs
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks' worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
uint_t dmu_object_alloc_chunk_shift = 7;
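
/*
 * Worked example (editorial note, derived from the constants above): with
 * the default shift of 7, each per-CPU allocator claims 1 << 7 = 128 dnode
 * slots per chunk.  A 16K metadnode block holds DNODES_PER_BLOCK = 32
 * legacy 512-byte slots, so 128 slots span 128 / 32 = 4 dnode blocks.
 */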

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
	    os->os_obj_next_percpu_len];

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}
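
/*
 * Illustrative sketch (editorial addition, not part of this file's code):
 * the usual calling pattern for dmu_object_alloc().  A caller first creates
 * and assigns a transaction holding a bonus buffer for DMU_NEW_OBJECT, then
 * allocates inside it.  Error handling is abbreviated; compiled out.
 */
#if 0
static int
example_alloc_object(objset_t *os, uint64_t *objp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* Let the DMU pick the block size (0) and use no bonus buffer. */
	*objp = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
	    DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}
#endif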

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
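
/*
 * Illustrative sketch (editorial addition): dmu_object_alloc_hold() hands
 * back the held dnode so the caller can use it immediately instead of
 * re-holding by object number.  The caller must dnode_rele() with the same
 * tag.  Compiled out; error handling abbreviated.
 */
#if 0
static uint64_t
example_alloc_and_use(objset_t *os, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t object;

	object = dmu_object_alloc_hold(os, DMU_OT_PLAIN_FILE_CONTENTS, 0, 0,
	    DMU_OT_NONE, 0, 0, &dn, FTAG, tx);

	/* ... operate on dn directly, e.g. via dmu_buf_hold_by_dnode() ... */

	dnode_rele(dn, FTAG);	/* release the hold taken on our behalf */
	return (object);
}
#endif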

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}
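
/*
 * Illustrative sketch (editorial addition): unlike dmu_object_alloc(),
 * dmu_object_claim() creates an object at a caller-chosen number, e.g. as
 * a stream receiver does when recreating a sender's objects.  The object
 * number here is hypothetical.  Compiled out.
 */
#if 0
static int
example_claim_fixed_object(objset_t *os, dmu_tx_t *tx)
{
	uint64_t object = 42;	/* hypothetical object number to recreate */

	/* Returns an error if the requested dnode slots are not free. */
	return (dmu_object_claim(os, object, DMU_OT_PLAIN_FILE_CONTENTS,
	    0, DMU_OT_NONE, 0, tx));
}
#endif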

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}
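
/*
 * Illustrative sketch (editorial addition): dmu_object_reclaim()
 * re-purposes an existing allocated object in place, changing its type
 * and bonus layout without freeing and reallocating its object number.
 * Compiled out.
 */
#if 0
static int
example_reclaim_object(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	/* Reset the object to a plain-file layout with no bonus buffer. */
	return (dmu_object_reclaim(os, object, DMU_OT_PLAIN_FILE_CONTENTS,
	    0, DMU_OT_NONE, 0, tx));
}
#endif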

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}
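
/*
 * Illustrative sketch (editorial addition): freeing an object.  The
 * transaction should hold the free of the object's entire range before
 * being assigned.  Compiled out; error handling abbreviated.
 */
#if 0
static int
example_free_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	err = dmu_object_free(os, object, tx);
	dmu_tx_commit(tx);
	return (err);
}
#endif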

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *objectp, taking into account only objects that may have been
 * modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			if (i == 0)
				return (SET_ERROR(ESRCH));
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
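
/*
 * Illustrative sketch (editorial addition): enumerating every allocated
 * object in an objset with dmu_object_next().  Starting from *objectp == 0
 * begins the scan at object 1; a nonzero return (ESRCH) marks the end of
 * the metadnode.  Compiled out.
 */
#if 0
static void
example_visit_all_objects(objset_t *os)
{
	uint64_t object = 0;

	while (dmu_object_next(os, &object, B_FALSE, 0) == 0) {
		/* ... visit allocated object `object' ... */
	}
}
#endif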

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
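
/*
 * Illustrative sketch (editorial addition): after zapification a MOS
 * object can carry ZAP entries alongside its original payload.  The
 * old_type, key name, and value here are all hypothetical.  Syncing
 * context only; compiled out.
 */
#if 0
static void
example_zapify_and_tag(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	uint64_t value = 1;	/* hypothetical per-object metadata */

	dmu_object_zapify(mos, object, DMU_OT_BPOBJ, tx);
	VERIFY0(zap_add(mos, object, "example_key", sizeof (uint64_t), 1,
	    &value, tx));
}
#endif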

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */