// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
 * grab 128 slots, which is 4 blocks worth. This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
uint_t dmu_object_alloc_chunk_shift = 7;
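
/*
 * Illustrative arithmetic (a sketch, assuming the usual 512-byte dnodes
 * packed into 16 KiB metadnode blocks, i.e. DNODES_PER_BLOCK == 32):
 *
 *	1 << dmu_object_alloc_chunk_shift == 1 << 7 == 128 slots
 *	128 slots / 32 slots per block    == 4 metadnode blocks per chunk
 */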

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
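	/*
	 * Worked example (illustrative only): with 16 KiB indirect
	 * blocks (dn_indblkshift == 14) and 128-byte block pointers
	 * (SPA_BLKPTRSHIFT == 7), this is 32 << (14 - 7) == 4096
	 * dnodes, the "2^12 objects" figure cited below.
	 */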
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
	    os->os_obj_next_percpu_len];

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf. It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it. Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us. The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * it up to our caller. Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}
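
/*
 * Illustrative (hypothetical) caller, not part of this file: allocate a
 * plain file object with a 4 KiB block size and no bonus buffer, inside
 * an already-assigned transaction:
 *
 *	uint64_t obj = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS,
 *	    4096, DMU_OT_NONE, 0, tx);
 */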

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument. The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
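
/*
 * Sketch of the hold contract (hypothetical caller, for illustration):
 * the tag passed here must be the same one later handed to dnode_rele().
 *
 *	dnode_t *dn;
 *	uint64_t obj = dmu_object_alloc_hold(os, DMU_OT_PLAIN_FILE_CONTENTS,
 *	    0, 0, DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
 *	... operate on dn without re-holding the object ...
 *	dnode_rele(dn, FTAG);
 */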

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}
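
/*
 * Illustrative use (hypothetical): unlike dmu_object_alloc(), the caller
 * chooses the object number, e.g. when recreating objects with known IDs
 * from a send stream:
 *
 *	err = dmu_object_claim(os, object, DMU_OT_PLAIN_FILE_CONTENTS,
 *	    0, DMU_OT_NONE, 0, tx);
 */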

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *objectp, taking into account only objects that may have been
 * modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			/*
			 * If i has wrapped around to zero, the object
			 * number space is exhausted; there is no next
			 * object.
			 */
			if (i == 0)
				return (SET_ERROR(ESRCH));
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
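
/*
 * A typical (illustrative, hypothetical) iteration over every allocated
 * object in an objset using the interface above; visit() stands in for
 * the caller's per-object work:
 *
 *	uint64_t obj;
 *	int err;
 *
 *	for (obj = 0; (err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0; )
 *		visit(os, obj);
 *
 * err is ESRCH once no further allocated objects exist.
 */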

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
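
/*
 * Sketch (hypothetical, from syncing context): after zapification the
 * object can be used with the ordinary ZAP interfaces, e.g.:
 *
 *	dmu_object_zapify(mos, obj, DMU_OT_OBJECT_ARRAY, tx);
 *	VERIFY0(zap_add(mos, obj, "key", sizeof (uint64_t), 1, &val, tx));
 */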

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");