1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Portions Copyright 2011 iXsystems, Inc
25 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright 2019 Joyent, Inc.
29 * Copyright 2023 RackTop Systems, Inc.
30 */
31
32 #include <sys/zfs_context.h>
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/sysmacros.h>
37 #include <sys/dmu.h>
38 #include <sys/dmu_impl.h>
39 #include <sys/dmu_objset.h>
40 #include <sys/dmu_tx.h>
41 #include <sys/dbuf.h>
42 #include <sys/dnode.h>
43 #include <sys/zap.h>
44 #include <sys/sa.h>
45 #include <sys/sunddi.h>
46 #include <sys/sa_impl.h>
47 #include <sys/dnode.h>
48 #include <sys/errno.h>
49 #include <sys/zfs_context.h>
50
51 #ifdef _KERNEL
52 #include <sys/zfs_znode.h>
53 #endif
54
55 /*
56 * ZFS System attributes:
57 *
58 * A generic mechanism to allow for arbitrary attributes
59 * to be stored in a dnode. The data will be stored in the bonus buffer of
60 * the dnode and if necessary a special "spill" block will be used to handle
61 * overflow situations. The spill block will be sized to fit the data
62 * from 512 bytes to 128K. When a spill block is used the BP (blkptr_t) for the
63 * spill block is stored at the end of the current bonus buffer. Any
64 * attributes that would be in the way of the blkptr_t will be relocated
65 * into the spill block.
66 *
67 * Attribute registration:
68 *
69 * A mapping between attribute "string" names and their actual attribute
70 * numeric values, lengths, and byteswap functions is stored persistently
71 * on a per-dataset basis. The names are only used
72 * during registration. All attributes are known by their unique attribute
73 * id value. If an attribute can have a variable size then the value
74 * 0 will be used to indicate this.
75 *
76 * Attribute Layout:
77 *
78 * Attribute layouts are a way to compactly store multiple attributes, but
79 * without taking the overhead associated with managing each attribute
80 * individually. Since you will typically have the same set of attributes
81 * stored in the same order a single table will be used to represent that
82 * layout. The ZPL for example will usually have only about 10 different
83 * layouts (regular files, device files, symlinks,
84 * regular files + scanstamp, files/dirs with extended attributes, and
85 * the possibility of any of those minus the ACL, because it may have
86 * been kicked out into the spill block).
87 *
88 * Layouts are simply an array of the attributes and their
89 * ordering i.e. [0, 1, 4, 5, 2]
90 *
91 * Each distinct layout is given a unique layout number and that is what is
92 * stored in the header at the beginning of the SA data buffer.
93 *
94 * A layout only covers a single dbuf (bonus or spill). If a set of
95 * attributes is split up between the bonus buffer and a spill buffer then
96 * two different layouts will be used. This allows us to byteswap the
97 * spill without looking at the bonus buffer and keeps the on disk format of
98 * the bonus and spill buffer the same.
99 *
100 * Adding a single attribute will cause the entire set of attributes to
101 * be rewritten and could result in a new layout number being constructed
102 * as part of the rewrite if no such layout exists for the new set of
103 * attributes. The new attribute will be appended to the end of the already
104 * existing attributes.
105 *
106 * Both the attribute registration and attribute layout information are
107 * stored in normal ZAP attributes. There should be a small number of
108 * known layouts and the set of attributes is assumed to typically be quite
109 * small.
110 *
111 * The registered attributes and layout "table" information is maintained
112 * in core and a special "sa_os_t" is attached to the objset_t.
113 *
114 * A special interface is provided to allow for quickly applying
115 * a large set of attributes at once. sa_replace_all_by_template() is
116 * used to set an array of attributes. This is used by the ZPL when
117 * creating a brand new file. The template that is passed into the function
118 * specifies the attribute, size for variable length attributes, location of
119 * data and special "data locator" function if the data isn't in a contiguous
120 * location.
121 *
122 * Byteswap implications:
123 *
124 * Since the SA attributes are not entirely self describing we can't do
125 * the normal byteswap processing. The special ZAP layout attribute and
126 * attribute registration attributes define the byteswap function and the
127 * size of each attribute, unless it is variable sized.
128 * The normal ZFS byteswapping infrastructure assumes you don't need
129 * to read any objects in order to do the necessary byteswapping, whereas
130 * SA attributes can only be properly byteswapped if the dataset is opened
131 * and the layout/attribute ZAP attributes are available. Because of this
132 * the SA attributes will be byteswapped when they are first accessed by
133 * the SA code that will read the SA data.
134 */
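/*
 * Illustrative sketch (not part of this file's code) of how a consumer
 * might drive the interfaces below: register an attribute table with
 * sa_setup(), get a handle on an object, read one attribute, and drop the
 * handle.  The attribute names, the "sa_obj"/"object" variables, and the
 * use of VERIFY instead of real error handling are assumptions made only
 * for illustration.
 *
 *	static sa_attr_reg_t my_attrs[] = {
 *		{ "MY_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 0 },
 *		{ "MY_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 0 },
 *	};
 *	sa_attr_type_t *my_table;
 *	sa_handle_t *hdl;
 *	uint64_t size;
 *
 *	VERIFY(0 == sa_setup(os, sa_obj, my_attrs, 2, &my_table));
 *	VERIFY(0 == sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl));
 *	VERIFY(0 == sa_lookup(hdl, my_table[1], &size, sizeof (size)));
 *	sa_handle_destroy(hdl);
 */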
135
136 typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
137 uint16_t length, int length_idx, boolean_t, void *userp);
138
139 static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
140 static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
141 static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
142 sa_hdr_phys_t *hdr);
143 static void sa_idx_tab_rele(objset_t *os, void *arg);
144 static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
145 int buflen);
146 static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
147 sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
148 uint16_t buflen, dmu_tx_t *tx);
149
150 arc_byteswap_func_t *sa_bswap_table[] = {
151 byteswap_uint64_array,
152 byteswap_uint32_array,
153 byteswap_uint16_array,
154 byteswap_uint8_array,
155 zfs_acl_byteswap,
156 };
157
158 #define SA_COPY_DATA(f, s, t, l) \
159 { \
160 if (f == NULL) { \
161 if (l == 8) { \
162 *(uint64_t *)t = *(uint64_t *)s; \
163 } else if (l == 16) { \
164 *(uint64_t *)t = *(uint64_t *)s; \
165 *(uint64_t *)((uintptr_t)t + 8) = \
166 *(uint64_t *)((uintptr_t)s + 8); \
167 } else { \
168 bcopy(s, t, l); \
169 } \
170 } else \
171 sa_copy_data(f, s, t, l); \
172 }
173
174 /*
175 * This table is fixed and cannot be changed. Its purpose is to
176 * allow the SA code to work with both old/new ZPL file systems.
177 * It contains the list of legacy attributes. These attributes aren't
178 * stored in the "attribute" registry zap objects, since older ZPL file systems
179 * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
180 * use this static table.
181 */
182 sa_attr_reg_t sa_legacy_attrs[] = {
183 {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
184 {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
185 {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
186 {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
187 {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
188 {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
189 {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
190 {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
191 {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
192 {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
193 {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
194 {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
195 {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
196 {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
197 {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
198 {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
199 };
200
201 /*
202 * This is only used for objects of type DMU_OT_ZNODE
203 */
204 sa_attr_type_t sa_legacy_zpl_layout[] = {
205 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
206 };
207
208 /*
209 * Special dummy layout used for buffers with no attributes.
210 */
211 sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
212
213 static int sa_legacy_attr_count = 16;
214 static kmem_cache_t *sa_cache = NULL;
215
216 /*ARGSUSED*/
217 static int
218 sa_cache_constructor(void *buf, void *unused, int kmflag)
219 {
220 sa_handle_t *hdl = buf;
221
222 mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
223 return (0);
224 }
225
226 /*ARGSUSED*/
227 static void
228 sa_cache_destructor(void *buf, void *unused)
229 {
230 sa_handle_t *hdl = buf;
231 mutex_destroy(&hdl->sa_lock);
232 }
233
234 void
235 sa_cache_init(void)
236 {
237 sa_cache = kmem_cache_create("sa_cache",
238 sizeof (sa_handle_t), 0, sa_cache_constructor,
239 sa_cache_destructor, NULL, NULL, NULL, 0);
240 }
241
242 void
243 sa_cache_fini(void)
244 {
245 if (sa_cache)
246 kmem_cache_destroy(sa_cache);
247 }
248
249 static int
250 layout_num_compare(const void *arg1, const void *arg2)
251 {
252 const sa_lot_t *node1 = (const sa_lot_t *)arg1;
253 const sa_lot_t *node2 = (const sa_lot_t *)arg2;
254
255 return (TREE_CMP(node1->lot_num, node2->lot_num));
256 }
257
258 static int
259 layout_hash_compare(const void *arg1, const void *arg2)
260 {
261 const sa_lot_t *node1 = (const sa_lot_t *)arg1;
262 const sa_lot_t *node2 = (const sa_lot_t *)arg2;
263
264 int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
265 if (likely(cmp))
266 return (cmp);
267
268 return (TREE_CMP(node1->lot_instance, node2->lot_instance));
269 }
270
271 boolean_t
272 sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
273 {
274 int i;
275
276 if (count != tbf->lot_attr_count)
277 return (1);
278
279 for (i = 0; i != count; i++) {
280 if (attrs[i] != tbf->lot_attrs[i])
281 return (1);
282 }
283 return (0);
284 }
285
286 #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
287
288 static uint64_t
289 sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
290 {
291 int i;
292 uint64_t crc = -1ULL;
293
294 for (i = 0; i != attr_count; i++)
295 crc ^= SA_ATTR_HASH(attrs[i]);
296
297 return (crc);
298 }
299
300 static int
301 sa_get_spill(sa_handle_t *hdl)
302 {
303 int rc;
304 if (hdl->sa_spill == NULL) {
305 if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
306 &hdl->sa_spill)) == 0)
307 VERIFY(0 == sa_build_index(hdl, SA_SPILL));
308 } else {
309 rc = 0;
310 }
311
312 return (rc);
313 }
314
315 /*
316 * Main attribute lookup/update function
317 * returns 0 for success or non-zero for failure
318 *
319 * Operates on a bulk array; the first failure aborts further processing
320 */
321 int
322 sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
323 sa_data_op_t data_op, dmu_tx_t *tx)
324 {
325 sa_os_t *sa = hdl->sa_os->os_sa;
326 int i;
327 int error = 0;
328 sa_buf_type_t buftypes;
329
330 buftypes = 0;
331
332 ASSERT(count > 0);
333 for (i = 0; i != count; i++) {
334 ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
335
336 bulk[i].sa_addr = NULL;
337 /* First check the bonus buffer */
338
339 if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
340 hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
341 SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
342 SA_GET_HDR(hdl, SA_BONUS),
343 bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
344 if (tx && !(buftypes & SA_BONUS)) {
345 dmu_buf_will_dirty(hdl->sa_bonus, tx);
346 buftypes |= SA_BONUS;
347 }
348 }
349 if (bulk[i].sa_addr == NULL &&
350 ((error = sa_get_spill(hdl)) == 0)) {
351 if (TOC_ATTR_PRESENT(
352 hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
353 SA_ATTR_INFO(sa, hdl->sa_spill_tab,
354 SA_GET_HDR(hdl, SA_SPILL),
355 bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
356 if (tx && !(buftypes & SA_SPILL) &&
357 bulk[i].sa_size == bulk[i].sa_length) {
358 dmu_buf_will_dirty(hdl->sa_spill, tx);
359 buftypes |= SA_SPILL;
360 }
361 }
362 }
363 if (error && error != ENOENT) {
364 return ((error == ECKSUM) ? EIO : error);
365 }
366
367 switch (data_op) {
368 case SA_LOOKUP:
369 if (bulk[i].sa_addr == NULL)
370 return (SET_ERROR(ENOENT));
371 if (bulk[i].sa_data) {
372 SA_COPY_DATA(bulk[i].sa_data_func,
373 bulk[i].sa_addr, bulk[i].sa_data,
374 MIN(bulk[i].sa_size, bulk[i].sa_length));
375 }
376 continue;
377
378 case SA_UPDATE:
379 /* existing rewrite of attr */
380 if (bulk[i].sa_addr &&
381 bulk[i].sa_size == bulk[i].sa_length) {
382 SA_COPY_DATA(bulk[i].sa_data_func,
383 bulk[i].sa_data, bulk[i].sa_addr,
384 bulk[i].sa_length);
385 continue;
386 } else if (bulk[i].sa_addr) { /* attr size change */
387 error = sa_modify_attrs(hdl, bulk[i].sa_attr,
388 SA_REPLACE, bulk[i].sa_data_func,
389 bulk[i].sa_data, bulk[i].sa_length, tx);
390 } else { /* adding new attribute */
391 error = sa_modify_attrs(hdl, bulk[i].sa_attr,
392 SA_ADD, bulk[i].sa_data_func,
393 bulk[i].sa_data, bulk[i].sa_length, tx);
394 }
395 if (error)
396 return (error);
397 break;
398 }
399 }
400 return (error);
401 }
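/*
 * Illustrative sketch of driving sa_attr_op() through its bulk interface:
 * several attributes are described in one sa_bulk_attr_t array and fetched
 * in a single pass while the handle lock is held (as sa_lookup_impl()
 * requires).  The "my_table" indices, "hdl", and the destination variables
 * are hypothetical.
 *
 *	sa_bulk_attr_t bulk[2];
 *	uint64_t mode, size;
 *	int count = 0, error;
 *
 *	SA_ADD_BULK_ATTR(bulk, count, my_table[0], NULL, &mode, 8);
 *	SA_ADD_BULK_ATTR(bulk, count, my_table[1], NULL, &size, 8);
 *	mutex_enter(&hdl->sa_lock);
 *	error = sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL);
 *	mutex_exit(&hdl->sa_lock);
 */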
402
403 static sa_lot_t *
404 sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
405 uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
406 {
407 sa_os_t *sa = os->os_sa;
408 sa_lot_t *tb, *findtb;
409 int i, size;
410 avl_index_t loc;
411
412 ASSERT(MUTEX_HELD(&sa->sa_lock));
413 tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
414 tb->lot_attr_count = attr_count;
415
416 if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) {
417 tb->lot_attrs = kmem_alloc(size, KM_SLEEP);
418 bcopy(attrs, tb->lot_attrs, size);
419 }
420
421 tb->lot_num = lot_num;
422 tb->lot_hash = hash;
423 tb->lot_instance = 0;
424
425 if (zapadd) {
426 char attr_name[8];
427
428 if (sa->sa_layout_attr_obj == 0) {
429 sa->sa_layout_attr_obj = zap_create_link(os,
430 DMU_OT_SA_ATTR_LAYOUTS,
431 sa->sa_master_obj, SA_LAYOUTS, tx);
432 }
433
434 (void) snprintf(attr_name, sizeof (attr_name),
435 "%d", (int)lot_num);
436 VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
437 attr_name, 2, attr_count, attrs, tx));
438 }
439
440 list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
441 offsetof(sa_idx_tab_t, sa_next));
442
443 for (i = 0; i != attr_count; i++) {
444 if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
445 tb->lot_var_sizes++;
446 }
447
448 avl_add(&sa->sa_layout_num_tree, tb);
449
450 /* verify we don't have a hash collision */
451 if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
452 for (; findtb && findtb->lot_hash == hash;
453 findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
454 if (findtb->lot_instance != tb->lot_instance)
455 break;
456 tb->lot_instance++;
457 }
458 }
459 avl_add(&sa->sa_layout_hash_tree, tb);
460 return (tb);
461 }
462
463 static void
464 sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
465 int count, dmu_tx_t *tx, sa_lot_t **lot)
466 {
467 sa_lot_t *tb, tbsearch;
468 avl_index_t loc;
469 sa_os_t *sa = os->os_sa;
470 boolean_t found = B_FALSE;
471
472 mutex_enter(&sa->sa_lock);
473 tbsearch.lot_hash = hash;
474 tbsearch.lot_instance = 0;
475 tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
476 if (tb) {
477 for (; tb && tb->lot_hash == hash;
478 tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
479 if (sa_layout_equal(tb, attrs, count) == 0) {
480 found = B_TRUE;
481 break;
482 }
483 }
484 }
485 if (!found) {
486 tb = sa_add_layout_entry(os, attrs, count,
487 avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
488 }
489 mutex_exit(&sa->sa_lock);
490 *lot = tb;
491 }
492
493 static int
494 sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
495 {
496 int error;
497 uint32_t blocksize;
498
499 if (size == 0) {
500 blocksize = SPA_MINBLOCKSIZE;
501 } else if (size > SPA_OLD_MAXBLOCKSIZE) {
502 ASSERT(0);
503 return (SET_ERROR(EFBIG));
504 } else {
505 blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
506 }
507
508 error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
509 ASSERT(error == 0);
510 return (error);
511 }
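/*
 * For example (illustration only): a request to hold 3000 bytes of spill
 * data is rounded up to P2ROUNDUP(3000, SPA_MINBLOCKSIZE) == 3072 bytes,
 * while a request for 0 bytes keeps the spill block at SPA_MINBLOCKSIZE.
 */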
512
513 static void
514 sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
515 {
516 if (func == NULL) {
517 bcopy(datastart, target, buflen);
518 } else {
519 boolean_t start;
520 int bytes;
521 void *dataptr;
522 void *saptr = target;
523 uint32_t length;
524
525 start = B_TRUE;
526 bytes = 0;
527 while (bytes < buflen) {
528 func(&dataptr, &length, buflen, start, datastart);
529 bcopy(dataptr, saptr, length);
530 saptr = (void *)((caddr_t)saptr + length);
531 bytes += length;
532 start = B_FALSE;
533 }
534 }
535 }
536
537 /*
538 * Determine several different sizes:
539 * first, the SA header size;
540 * second, the total number of bytes to be stored;
541 * and, if a spill would occur, the index in the attribute array where it starts is returned.
542 *
543 * The boolean will_spill will be set when spilling is necessary. It
544 * is only set when the buftype is SA_BONUS.
545 */
546 static int
547 sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
548 dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
549 int *total, boolean_t *will_spill)
550 {
551 int var_size = 0;
552 int i;
553 int hdrsize;
554 int extra_hdrsize;
555
556 if (buftype == SA_BONUS && sa->sa_force_spill) {
557 *total = 0;
558 *index = 0;
559 *will_spill = B_TRUE;
560 return (0);
561 }
562
563 *index = -1;
564 *total = 0;
565 *will_spill = B_FALSE;
566
567 extra_hdrsize = 0;
568 hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
569 sizeof (sa_hdr_phys_t);
570
571 ASSERT(IS_P2ALIGNED(full_space, 8));
572
573 for (i = 0; i != attr_count; i++) {
574 boolean_t is_var_sz;
575
576 *total = P2ROUNDUP(*total, 8);
577 *total += attr_desc[i].sa_length;
578 if (*will_spill)
579 continue;
580
581 is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
582 if (is_var_sz) {
583 var_size++;
584 }
585
586 if (is_var_sz && var_size > 1) {
587 /*
588 * Don't worry that the spill block might overflow.
589 * It will be resized if needed in sa_build_layouts().
590 */
591 if (buftype == SA_SPILL ||
592 P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
593 *total < full_space) {
594 /*
595 * Account for header space used by array of
596 * optional sizes of variable-length attributes.
597 * Record the extra header size in case this
598 * increase needs to be reversed due to
599 * spill-over.
600 */
601 hdrsize += sizeof (uint16_t);
602 if (*index != -1)
603 extra_hdrsize += sizeof (uint16_t);
604 } else {
605 ASSERT(buftype == SA_BONUS);
606 if (*index == -1)
607 *index = i;
608 *will_spill = B_TRUE;
609 continue;
610 }
611 }
612
613 /*
614 * Find the index where a spill *could* occur.
615 * Then continue to count the remaining attribute
616 * space. The sum is used later for sizing the bonus
617 * and spill buffers.
618 */
619 if (buftype == SA_BONUS && *index == -1 &&
620 *total + P2ROUNDUP(hdrsize, 8) >
621 (full_space - sizeof (blkptr_t))) {
622 *index = i;
623 }
624
625 if (*total + P2ROUNDUP(hdrsize, 8) > full_space &&
626 buftype == SA_BONUS)
627 *will_spill = B_TRUE;
628 }
629
630 if (*will_spill)
631 hdrsize -= extra_hdrsize;
632
633 hdrsize = P2ROUNDUP(hdrsize, 8);
634 return (hdrsize);
635 }
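/*
 * Worked example of the header sizing above (illustration only, for a
 * DMU_OT_SA bonus buffer): the base sa_hdr_phys_t is 8 bytes and already
 * contains one sa_lengths[] slot, so the first variable-length attribute
 * costs no extra header space.  Variable-length attributes two through
 * five each add a uint16_t length slot, and the trailing
 * P2ROUNDUP(hdrsize, 8) yields a 16 byte header; a sixth variable-length
 * attribute grows the header to 24 bytes.
 */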
636
637 #define BUF_SPACE_NEEDED(total, header) (total + header)
638
639 /*
640 * Find the layout that corresponds to the ordering of attributes.
641 * If not found, a new layout number is created and added to the
642 * persistent layout tables.
643 */
644 static int
645 sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
646 dmu_tx_t *tx)
647 {
648 sa_os_t *sa = hdl->sa_os->os_sa;
649 uint64_t hash;
650 sa_buf_type_t buftype;
651 sa_hdr_phys_t *sahdr;
652 void *data_start;
653 int buf_space;
654 sa_attr_type_t *attrs, *attrs_start;
655 int i, lot_count;
656 int dnodesize;
657 int hdrsize;
658 int spillhdrsize = 0;
659 int used;
660 dmu_object_type_t bonustype;
661 sa_lot_t *lot;
662 int len_idx;
663 int spill_used;
664 int bonuslen;
665 boolean_t spilling;
666
667 dmu_buf_will_dirty(hdl->sa_bonus, tx);
668 bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
669
670 dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
671 bonuslen = DN_BONUS_SIZE(dnodesize);
672
673 /* first determine bonus header size and sum of all attributes */
674 hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
675 SA_BONUS, bonuslen, &i, &used, &spilling);
676
677 if (used > SPA_OLD_MAXBLOCKSIZE)
678 return (SET_ERROR(EFBIG));
679
680 VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
681 MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
682 used + hdrsize, tx));
683
684 ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
685 bonustype == DMU_OT_SA);
686
687 /* setup and size spill buffer when needed */
688 if (spilling) {
689 boolean_t dummy;
690
691 if (hdl->sa_spill == NULL) {
692 VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
693 &hdl->sa_spill) == 0);
694 }
695 dmu_buf_will_dirty(hdl->sa_spill, tx);
696
697 spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
698 attr_count - i, hdl->sa_spill, SA_SPILL,
699 hdl->sa_spill->db_size, &i, &spill_used, &dummy);
700
701 if (spill_used > SPA_OLD_MAXBLOCKSIZE)
702 return (SET_ERROR(EFBIG));
703
704 buf_space = hdl->sa_spill->db_size - spillhdrsize;
705 if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
706 hdl->sa_spill->db_size)
707 VERIFY(0 == sa_resize_spill(hdl,
708 BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
709 }
710
711 /* setup starting pointers to lay down data */
712 data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
713 sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
714 buftype = SA_BONUS;
715
716 if (spilling)
717 buf_space = (sa->sa_force_spill) ?
718 0 : SA_BLKPTR_SPACE - hdrsize;
719 else
720 buf_space = hdl->sa_bonus->db_size - hdrsize;
721
722 attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
723 KM_SLEEP);
724 lot_count = 0;
725
726 for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
727 uint16_t length;
728
729 ASSERT(IS_P2ALIGNED(data_start, 8));
730 ASSERT(IS_P2ALIGNED(buf_space, 8));
731 attrs[i] = attr_desc[i].sa_attr;
732 length = SA_REGISTERED_LEN(sa, attrs[i]);
733 if (length == 0)
734 length = attr_desc[i].sa_length;
735
736 if (buf_space < length) { /* switch to spill buffer */
737 VERIFY(spilling);
738 VERIFY(bonustype == DMU_OT_SA);
739 if (buftype == SA_BONUS && !sa->sa_force_spill) {
740 sa_find_layout(hdl->sa_os, hash, attrs_start,
741 lot_count, tx, &lot);
742 SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
743 }
744
745 buftype = SA_SPILL;
746 hash = -1ULL;
747 len_idx = 0;
748
749 sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
750 sahdr->sa_magic = SA_MAGIC;
751 data_start = (void *)((uintptr_t)sahdr +
752 spillhdrsize);
753 attrs_start = &attrs[i];
754 buf_space = hdl->sa_spill->db_size - spillhdrsize;
755 lot_count = 0;
756 }
757 hash ^= SA_ATTR_HASH(attrs[i]);
758 attr_desc[i].sa_addr = data_start;
759 attr_desc[i].sa_size = length;
760 SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
761 data_start, length);
762 if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
763 sahdr->sa_lengths[len_idx++] = length;
764 }
765 data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
766 length), 8);
767 buf_space -= P2ROUNDUP(length, 8);
768 lot_count++;
769 }
770
771 sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
772
773 /*
774 * Verify that old znodes always have layout number 0.
775 * Must be DMU_OT_SA for arbitrary layouts
776 */
777 VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
778 (bonustype == DMU_OT_SA && lot->lot_num > 1));
779
780 if (bonustype == DMU_OT_SA) {
781 SA_SET_HDR(sahdr, lot->lot_num,
782 buftype == SA_BONUS ? hdrsize : spillhdrsize);
783 }
784
785 kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
786 if (hdl->sa_bonus_tab) {
787 sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
788 hdl->sa_bonus_tab = NULL;
789 }
790 if (!sa->sa_force_spill)
791 VERIFY(0 == sa_build_index(hdl, SA_BONUS));
792 if (hdl->sa_spill) {
793 sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
794 if (!spilling) {
795 /*
796 * remove spill block that is no longer needed.
797 */
798 dmu_buf_rele(hdl->sa_spill, NULL);
799 hdl->sa_spill = NULL;
800 hdl->sa_spill_tab = NULL;
801 VERIFY(0 == dmu_rm_spill(hdl->sa_os,
802 sa_handle_object(hdl), tx));
803 } else {
804 VERIFY(0 == sa_build_index(hdl, SA_SPILL));
805 }
806 }
807
808 return (0);
809 }
810
811 static void
812 sa_free_attr_table(sa_os_t *sa)
813 {
814 int i;
815
816 if (sa->sa_attr_table == NULL)
817 return;
818
819 for (i = 0; i != sa->sa_num_attrs; i++) {
820 if (sa->sa_attr_table[i].sa_name)
821 kmem_free(sa->sa_attr_table[i].sa_name,
822 strlen(sa->sa_attr_table[i].sa_name) + 1);
823 }
824
825 kmem_free(sa->sa_attr_table,
826 sizeof (sa_attr_table_t) * sa->sa_num_attrs);
827
828 sa->sa_attr_table = NULL;
829 }
830
831 static int
832 sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
833 {
834 sa_os_t *sa = os->os_sa;
835 uint64_t sa_attr_count = 0;
836 uint64_t sa_reg_count = 0;
837 int error = 0;
838 uint64_t attr_value;
839 sa_attr_table_t *tb;
840 zap_cursor_t zc;
841 zap_attribute_t za;
842 int registered_count = 0;
843 int i;
844 dmu_objset_type_t ostype = dmu_objset_type(os);
845
846 sa->sa_user_table =
847 kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
848 sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
849
850 if (sa->sa_reg_attr_obj != 0) {
851 error = zap_count(os, sa->sa_reg_attr_obj,
852 &sa_attr_count);
853
854 /*
855 * Make sure we retrieved a count and that it isn't zero
856 */
857 if (error || (error == 0 && sa_attr_count == 0)) {
858 if (error == 0)
859 error = SET_ERROR(EINVAL);
860 goto bail;
861 }
862 sa_reg_count = sa_attr_count;
863 }
864
865 if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
866 sa_attr_count += sa_legacy_attr_count;
867
868 /* Allocate attribute numbers for attributes that aren't registered */
869 for (i = 0; i != count; i++) {
870 boolean_t found = B_FALSE;
871 int j;
872
873 if (ostype == DMU_OST_ZFS) {
874 for (j = 0; j != sa_legacy_attr_count; j++) {
875 if (strcmp(reg_attrs[i].sa_name,
876 sa_legacy_attrs[j].sa_name) == 0) {
877 sa->sa_user_table[i] =
878 sa_legacy_attrs[j].sa_attr;
879 found = B_TRUE;
880 }
881 }
882 }
883 if (found)
884 continue;
885
886 if (sa->sa_reg_attr_obj)
887 error = zap_lookup(os, sa->sa_reg_attr_obj,
888 reg_attrs[i].sa_name, 8, 1, &attr_value);
889 else
890 error = SET_ERROR(ENOENT);
891 switch (error) {
892 case ENOENT:
893 sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
894 sa_attr_count++;
895 break;
896 case 0:
897 sa->sa_user_table[i] = ATTR_NUM(attr_value);
898 break;
899 default:
900 goto bail;
901 }
902 }
903
904 sa->sa_num_attrs = sa_attr_count;
905 tb = sa->sa_attr_table =
906 kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
907
908 /*
909 * Attribute table is constructed from requested attribute list,
910 * previously foreign registered attributes, and also the legacy
911 * ZPL set of attributes.
912 */
913
914 if (sa->sa_reg_attr_obj) {
915 for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
916 (error = zap_cursor_retrieve(&zc, &za)) == 0;
917 zap_cursor_advance(&zc)) {
918 uint64_t value;
919 value = za.za_first_integer;
920
921 registered_count++;
922 tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
923 tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
924 tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
925 tb[ATTR_NUM(value)].sa_registered = B_TRUE;
926
927 if (tb[ATTR_NUM(value)].sa_name) {
928 continue;
929 }
930 tb[ATTR_NUM(value)].sa_name =
931 kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
932 (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
933 strlen(za.za_name) +1);
934 }
935 zap_cursor_fini(&zc);
936 /*
937 * Make sure we processed the correct number of registered
938 * attributes
939 */
940 if (registered_count != sa_reg_count) {
941 ASSERT(error != 0);
942 goto bail;
943 }
944
945 }
946
947 if (ostype == DMU_OST_ZFS) {
948 for (i = 0; i != sa_legacy_attr_count; i++) {
949 if (tb[i].sa_name)
950 continue;
951 tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
952 tb[i].sa_length = sa_legacy_attrs[i].sa_length;
953 tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
954 tb[i].sa_registered = B_FALSE;
955 tb[i].sa_name =
956 kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
957 KM_SLEEP);
958 (void) strlcpy(tb[i].sa_name,
959 sa_legacy_attrs[i].sa_name,
960 strlen(sa_legacy_attrs[i].sa_name) + 1);
961 }
962 }
963
964 for (i = 0; i != count; i++) {
965 sa_attr_type_t attr_id;
966
967 attr_id = sa->sa_user_table[i];
968 if (tb[attr_id].sa_name)
969 continue;
970
971 tb[attr_id].sa_length = reg_attrs[i].sa_length;
972 tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
973 tb[attr_id].sa_attr = attr_id;
974 tb[attr_id].sa_name =
975 kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
976 (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
977 strlen(reg_attrs[i].sa_name) + 1);
978 }
979
980 sa->sa_need_attr_registration =
981 (sa_attr_count != registered_count);
982
983 return (0);
984 bail:
985 kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
986 sa->sa_user_table = NULL;
987 sa_free_attr_table(sa);
988 return ((error != 0) ? error : EINVAL);
989 }
990
991 int
992 sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
993 sa_attr_type_t **user_table)
994 {
995 zap_cursor_t zc;
996 zap_attribute_t za;
997 sa_os_t *sa;
998 dmu_objset_type_t ostype = dmu_objset_type(os);
999 sa_attr_type_t *tb;
1000 int error;
1001
1002 mutex_enter(&os->os_user_ptr_lock);
1003 if (os->os_sa) {
1004 mutex_enter(&os->os_sa->sa_lock);
1005 mutex_exit(&os->os_user_ptr_lock);
1006 tb = os->os_sa->sa_user_table;
1007 mutex_exit(&os->os_sa->sa_lock);
1008 *user_table = tb;
1009 return (0);
1010 }
1011
1012 sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
1013 mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
1014 sa->sa_master_obj = sa_obj;
1015
1016 os->os_sa = sa;
1017 mutex_enter(&sa->sa_lock);
1018 mutex_exit(&os->os_user_ptr_lock);
1019 avl_create(&sa->sa_layout_num_tree, layout_num_compare,
1020 sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
1021 avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
1022 sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
1023
1024 if (sa_obj) {
1025 error = zap_lookup(os, sa_obj, SA_LAYOUTS,
1026 8, 1, &sa->sa_layout_attr_obj);
1027 if (error != 0 && error != ENOENT)
1028 goto fail;
1029 error = zap_lookup(os, sa_obj, SA_REGISTRY,
1030 8, 1, &sa->sa_reg_attr_obj);
1031 if (error != 0 && error != ENOENT)
1032 goto fail;
1033 }
1034
1035 if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
1036 goto fail;
1037
1038 if (sa->sa_layout_attr_obj != 0) {
1039 uint64_t layout_count;
1040
1041 error = zap_count(os, sa->sa_layout_attr_obj,
1042 &layout_count);
1043
1044 /*
1045 * Layout number count should be > 0
1046 */
1047 if (error || (error == 0 && layout_count == 0)) {
1048 if (error == 0)
1049 error = SET_ERROR(EINVAL);
1050 goto fail;
1051 }
1052
1053 for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
1054 (error = zap_cursor_retrieve(&zc, &za)) == 0;
1055 zap_cursor_advance(&zc)) {
1056 sa_attr_type_t *lot_attrs;
1057 uint64_t lot_num;
1058
1059 lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
1060 za.za_num_integers, KM_SLEEP);
1061
1062 if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
1063 za.za_name, 2, za.za_num_integers,
1064 lot_attrs))) != 0) {
1065 kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1066 za.za_num_integers);
1067 break;
1068 }
1069 VERIFY(ddi_strtoull(za.za_name, NULL, 10,
1070 (unsigned long long *)&lot_num) == 0);
1071
1072 (void) sa_add_layout_entry(os, lot_attrs,
1073 za.za_num_integers, lot_num,
1074 sa_layout_info_hash(lot_attrs,
1075 za.za_num_integers), B_FALSE, NULL);
1076 kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1077 za.za_num_integers);
1078 }
1079 zap_cursor_fini(&zc);
1080
1081 /*
1082 * Make sure layout count matches number of entries added
1083 * to AVL tree
1084 */
1085 if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
1086 ASSERT(error != 0);
1087 goto fail;
1088 }
1089 }
1090
1091 /* Add special layout number for old ZNODES */
1092 if (ostype == DMU_OST_ZFS) {
1093 (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
1094 sa_legacy_attr_count, 0,
1095 sa_layout_info_hash(sa_legacy_zpl_layout,
1096 sa_legacy_attr_count), B_FALSE, NULL);
1097
1098 (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
1099 0, B_FALSE, NULL);
1100 }
1101 *user_table = os->os_sa->sa_user_table;
1102 mutex_exit(&sa->sa_lock);
1103 return (0);
1104 fail:
1105 os->os_sa = NULL;
1106 sa_free_attr_table(sa);
1107 if (sa->sa_user_table)
1108 kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1109 mutex_exit(&sa->sa_lock);
1110 avl_destroy(&sa->sa_layout_hash_tree);
1111 avl_destroy(&sa->sa_layout_num_tree);
1112 mutex_destroy(&sa->sa_lock);
1113 kmem_free(sa, sizeof (sa_os_t));
1114 return ((error == ECKSUM) ? EIO : error);
1115 }
1116
1117 void
1118 sa_tear_down(objset_t *os)
1119 {
1120 sa_os_t *sa = os->os_sa;
1121 sa_lot_t *layout;
1122 void *cookie;
1123
1124 kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1125
1126 /* Free up attr table */
1127
1128 sa_free_attr_table(sa);
1129
1130 cookie = NULL;
1131 while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
1132 sa_idx_tab_t *tab;
1133 while (tab = list_head(&layout->lot_idx_tab)) {
1134 ASSERT(zfs_refcount_count(&tab->sa_refcount));
1135 sa_idx_tab_rele(os, tab);
1136 }
1137 }
1138
1139 cookie = NULL;
1140 while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
1141 kmem_free(layout->lot_attrs,
1142 sizeof (sa_attr_type_t) * layout->lot_attr_count);
1143 kmem_free(layout, sizeof (sa_lot_t));
1144 }
1145
1146 avl_destroy(&sa->sa_layout_hash_tree);
1147 avl_destroy(&sa->sa_layout_num_tree);
1148 mutex_destroy(&sa->sa_lock);
1149
1150 kmem_free(sa, sizeof (sa_os_t));
1151 os->os_sa = NULL;
1152 }
1153
1154 void
1155 sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
1156 uint16_t length, int length_idx, boolean_t var_length, void *userp)
1157 {
1158 sa_idx_tab_t *idx_tab = userp;
1159
1160 if (var_length) {
1161 ASSERT(idx_tab->sa_variable_lengths);
1162 idx_tab->sa_variable_lengths[length_idx] = length;
1163 }
1164 TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
1165 (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
1166 }
1167
1168 static void
1169 sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
1170 sa_iterfunc_t func, sa_lot_t *tab, void *userp)
1171 {
1172 void *data_start;
1173 sa_lot_t *tb = tab;
1174 sa_lot_t search;
1175 avl_index_t loc;
1176 sa_os_t *sa = os->os_sa;
1177 int i;
1178 uint16_t *length_start = NULL;
1179 uint8_t length_idx = 0;
1180
1181 if (tab == NULL) {
1182 search.lot_num = SA_LAYOUT_NUM(hdr, type);
1183 tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1184 ASSERT(tb);
1185 }
1186
1187 if (IS_SA_BONUSTYPE(type)) {
1188 data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
1189 offsetof(sa_hdr_phys_t, sa_lengths) +
1190 (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
1191 length_start = hdr->sa_lengths;
1192 } else {
1193 data_start = hdr;
1194 }
1195
1196 for (i = 0; i != tb->lot_attr_count; i++) {
1197 int attr_length, reg_length;
1198 uint8_t idx_len;
1199
1200 reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
1201 if (reg_length) {
1202 attr_length = reg_length;
1203 idx_len = 0;
1204 } else {
1205 attr_length = length_start[length_idx];
1206 idx_len = length_idx++;
1207 }
1208
1209 func(hdr, data_start, tb->lot_attrs[i], attr_length,
1210 idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
1211
1212 data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
1213 attr_length), 8);
1214 }
1215 }
1216
1217 /*ARGSUSED*/
1218 void
1219 sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
1220 uint16_t length, int length_idx, boolean_t variable_length, void *userp)
1221 {
1222 sa_handle_t *hdl = userp;
1223 sa_os_t *sa = hdl->sa_os->os_sa;
1224
1225 sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
1226 }
1227
1228 void
1229 sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
1230 {
1231 sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1232 dmu_buf_impl_t *db;
1233 sa_os_t *sa = hdl->sa_os->os_sa;
1234 int num_lengths = 1;
1235 int i;
1236
1237 ASSERT(MUTEX_HELD(&sa->sa_lock));
1238 if (sa_hdr_phys->sa_magic == SA_MAGIC)
1239 return;
1240
1241 db = SA_GET_DB(hdl, buftype);
1242
1243 if (buftype == SA_SPILL) {
1244 arc_release(db->db_buf, NULL);
1245 arc_buf_thaw(db->db_buf);
1246 }
1247
1248 sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
1249 sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
1250
1251 /*
1252 * Determine the number of variable lengths in the header.
1253 * The standard 8 byte header has one for free and a
1254 * 16 byte header would have 4 + 1;
1255 */
1256 if (SA_HDR_SIZE(sa_hdr_phys) > 8)
1257 num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
1258 for (i = 0; i != num_lengths; i++)
1259 sa_hdr_phys->sa_lengths[i] =
1260 BSWAP_16(sa_hdr_phys->sa_lengths[i]);
1261
1262 sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
1263 sa_byteswap_cb, NULL, hdl);
1264
1265 if (buftype == SA_SPILL)
1266 arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
1267 }
1268
1269 static int
1270 sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
1271 {
1272 sa_hdr_phys_t *sa_hdr_phys;
1273 dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
1274 dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
1275 sa_os_t *sa = hdl->sa_os->os_sa;
1276 sa_idx_tab_t *idx_tab;
1277
1278 sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1279
1280 mutex_enter(&sa->sa_lock);
1281
1282 /* Do we need to byteswap? */
1283
1284 /* only check if not old znode */
1285 if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
1286 sa_hdr_phys->sa_magic != 0) {
1287 VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
1288 sa_byteswap(hdl, buftype);
1289 }
1290
1291 idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
1292
1293 if (buftype == SA_BONUS)
1294 hdl->sa_bonus_tab = idx_tab;
1295 else
1296 hdl->sa_spill_tab = idx_tab;
1297
1298 mutex_exit(&sa->sa_lock);
1299 return (0);
1300 }
1301
1302 /*ARGSUSED*/
1303 static void
1304 sa_evict_sync(void *dbu)
1305 {
1306 panic("evicting sa dbuf\n");
1307 }
1308
1309 static void
1310 sa_idx_tab_rele(objset_t *os, void *arg)
1311 {
1312 sa_os_t *sa = os->os_sa;
1313 sa_idx_tab_t *idx_tab = arg;
1314
1315 if (idx_tab == NULL)
1316 return;
1317
1318 mutex_enter(&sa->sa_lock);
1319 if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
1320 list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
1321 if (idx_tab->sa_variable_lengths)
1322 kmem_free(idx_tab->sa_variable_lengths,
1323 sizeof (uint16_t) *
1324 idx_tab->sa_layout->lot_var_sizes);
1325 zfs_refcount_destroy(&idx_tab->sa_refcount);
1326 kmem_free(idx_tab->sa_idx_tab,
1327 sizeof (uint32_t) * sa->sa_num_attrs);
1328 kmem_free(idx_tab, sizeof (sa_idx_tab_t));
1329 }
1330 mutex_exit(&sa->sa_lock);
1331 }
1332
1333 static void
1334 sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
1335 {
1336 sa_os_t *sa = os->os_sa;
1337
1338 ASSERT(MUTEX_HELD(&sa->sa_lock));
1339 (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
1340 }
1341
1342 void
1343 sa_handle_destroy(sa_handle_t *hdl)
1344 {
1345 dmu_buf_t *db = hdl->sa_bonus;
1346
1347 mutex_enter(&hdl->sa_lock);
1348 (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
1349
1350 if (hdl->sa_bonus_tab)
1351 sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
1352
1353 if (hdl->sa_spill_tab)
1354 sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
1355
1356 dmu_buf_rele(hdl->sa_bonus, NULL);
1357
1358 if (hdl->sa_spill)
1359 dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
1360 mutex_exit(&hdl->sa_lock);
1361
1362 kmem_cache_free(sa_cache, hdl);
1363 }
1364
1365 int
1366 sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
1367 sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1368 {
1369 int error = 0;
1370 dmu_object_info_t doi;
1371 sa_handle_t *handle = NULL;
1372
1373 #ifdef ZFS_DEBUG
1374 dmu_object_info_from_db(db, &doi);
1375 ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
1376 doi.doi_bonus_type == DMU_OT_ZNODE);
1377 #endif
1378 /* find handle, if it exists */
1379 /* if one doesn't exist then create a new one, and initialize it */
1380
1381 if (hdl_type == SA_HDL_SHARED)
1382 handle = dmu_buf_get_user(db);
1383
1384 if (handle == NULL) {
1385 sa_handle_t *winner = NULL;
1386
1387 handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
1388 handle->sa_dbu.dbu_evict_func_sync = NULL;
1389 handle->sa_dbu.dbu_evict_func_async = NULL;
1390 handle->sa_userp = userp;
1391 handle->sa_bonus = db;
1392 handle->sa_os = os;
1393 handle->sa_spill = NULL;
1394 handle->sa_bonus_tab = NULL;
1395 handle->sa_spill_tab = NULL;
1396
1397 error = sa_build_index(handle, SA_BONUS);
1398
1399 if (hdl_type == SA_HDL_SHARED) {
1400 dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
1401 NULL);
1402 winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
1403 }
1404
1405 if (winner != NULL) {
1406 kmem_cache_free(sa_cache, handle);
1407 handle = winner;
1408 }
1409 }
1410 *handlepp = handle;
1411
1412 return (error);
1413 }
1414
1415 int
1416 sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
1417 sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1418 {
1419 dmu_buf_t *db;
1420 int error;
1421
1422 if (error = dmu_bonus_hold(objset, objid, NULL, &db))
1423 return (error);
1424
1425 return (sa_handle_get_from_db(objset, db, userp, hdl_type,
1426 handlepp));
1427 }
1428
1429 int
1430 sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
1431 {
1432 return (dmu_bonus_hold(objset, obj_num, tag, db));
1433 }
1434
1435 void
1436 sa_buf_rele(dmu_buf_t *db, void *tag)
1437 {
1438 dmu_buf_rele(db, tag);
1439 }
1440
1441 int
1442 sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
1443 {
1444 ASSERT(hdl);
1445 ASSERT(MUTEX_HELD(&hdl->sa_lock));
1446 return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
1447 }
1448
1449 static int
1450 sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf,
1451 uint32_t buflen)
1452 {
1453 int error;
1454 sa_bulk_attr_t bulk;
1455
1456 bulk.sa_attr = attr;
1457 bulk.sa_data = buf;
1458 bulk.sa_length = buflen;
1459 bulk.sa_data_func = NULL;
1460
1461 ASSERT(hdl);
1462 error = sa_lookup_impl(hdl, &bulk, 1);
1463 return (error);
1464 }
1465
1466 int
1467 sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
1468 {
1469 int error;
1470
1471 mutex_enter(&hdl->sa_lock);
1472 error = sa_lookup_locked(hdl, attr, buf, buflen);
1473 mutex_exit(&hdl->sa_lock);
1474
1475 return (error);
1476 }
1477
1478 #ifdef _KERNEL
1479 int
1480 sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
1481 {
1482 int error;
1483 sa_bulk_attr_t bulk;
1484
1485 bulk.sa_data = NULL;
1486 bulk.sa_attr = attr;
1487 bulk.sa_data_func = NULL;
1488
1489 ASSERT(hdl);
1490
1491 mutex_enter(&hdl->sa_lock);
1492 if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
1493 error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
1494 uio->uio_resid), UIO_READ, uio);
1495 }
1496 mutex_exit(&hdl->sa_lock);
1497 return (error);
1498
1499 }
1500
1501 /*
1502 * For an existing object that was upgraded from an old system, its on-disk layout
1503 * has no slot for the project ID attribute. But the quota accounting logic needs
1504 * to access related slots by offset directly. So we need to adjust these old
1505 * objects' layouts to place the project ID at a unified and fixed offset.
1506 */
1507 int
1508 sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
1509 {
1510 znode_t *zp = sa_get_userdata(hdl);
1511 dmu_buf_t *db = sa_get_db(hdl);
1512 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1513 int count = 0, err = 0;
1514 sa_bulk_attr_t *bulk, *attrs;
1515 zfs_acl_locator_cb_t locate = { 0 };
1516 uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links;
1517 uint64_t crtime[2], mtime[2], ctime[2], atime[2];
1518 zfs_acl_phys_t znode_acl = { 0 };
1519 char scanstamp[AV_SCANSTAMP_SZ];
1520
1521 if (zp->z_acl_cached == NULL) {
1522 zfs_acl_t *aclp;
1523
1524 rw_enter(&zp->z_acl_lock, RW_WRITER);
1525 err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
1526 rw_exit(&zp->z_acl_lock);
1527 if (err != 0 && err != ENOENT)
1528 return (err);
1529 }
1530
1531 bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
1532 attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
1533 mutex_enter(&hdl->sa_lock);
1534 mutex_enter(&zp->z_lock);
1535
1536 err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
1537 sizeof (uint64_t));
1538 if (unlikely(err == 0))
1539 /* Someone has added project ID attr by race. */
1540 err = EEXIST;
1541 if (err != ENOENT)
1542 goto out;
1543
1544 /* First do a bulk query of the attributes that aren't cached */
1545 if (zp->z_is_sa) {
1546 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1547 &mode, 8);
1548 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1549 &gen, 8);
1550 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1551 &uid, 8);
1552 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1553 &gid, 8);
1554 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
1555 &parent, 8);
1556 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1557 &atime, 16);
1558 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1559 &mtime, 16);
1560 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1561 &ctime, 16);
1562 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1563 &crtime, 16);
1564 if (S_ISBLK(zp->z_mode) || S_ISCHR(zp->z_mode))
1565 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1566 &rdev, 8);
1567 } else {
1568 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1569 &atime, 16);
1570 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1571 &mtime, 16);
1572 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1573 &ctime, 16);
1574 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1575 &crtime, 16);
1576 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1577 &gen, 8);
1578 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1579 &mode, 8);
1580 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
1581 &parent, 8);
1582 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL,
1583 &xattr, 8);
1584 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1585 &rdev, 8);
1586 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1587 &uid, 8);
1588 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1589 &gid, 8);
1590 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
1591 &znode_acl, 88);
1592 }
1593 err = sa_bulk_lookup_locked(hdl, bulk, count);
1594 if (err != 0)
1595 goto out;
1596
1597 err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8);
1598 if (err != 0 && err != ENOENT)
1599 goto out;
1600
1601 zp->z_projid = projid;
1602 zp->z_pflags |= ZFS_PROJID;
1603 links = zp->z_links;
1604 count = 0;
1605 err = 0;
1606
1607 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
1608 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
1609 &zp->z_size, 8);
1610 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8);
1611 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
1612 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
1613 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
1614 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1615 &zp->z_pflags, 8);
1616 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
1617 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1618 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1619 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1620 &crtime, 16);
1621 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
1622 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8);
1623
1624 if (S_ISBLK(zp->z_mode) || S_ISCHR(zp->z_mode))
1625 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
1626 &rdev, 8);
1627
1628 if (zp->z_acl_cached != NULL) {
1629 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
1630 &zp->z_acl_cached->z_acl_count, 8);
1631 if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
1632 zfs_acl_xform(zp, zp->z_acl_cached, CRED());
1633 locate.cb_aclp = zp->z_acl_cached;
1634 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
1635 zfs_acl_data_locator, &locate,
1636 zp->z_acl_cached->z_acl_bytes);
1637 }
1638
1639 if (xattr)
1640 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL,
1641 &xattr, 8);
1642
1643 if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
1644 bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
1645 scanstamp, AV_SCANSTAMP_SZ);
1646 SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL,
1647 scanstamp, AV_SCANSTAMP_SZ);
1648 zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
1649 }
1650
1651 VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
1652 VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0);
1653 if (znode_acl.z_acl_extern_obj) {
1654 VERIFY(0 == dmu_object_free(zfsvfs->z_os,
1655 znode_acl.z_acl_extern_obj, tx));
1656 }
1657
1658 zp->z_is_sa = B_TRUE;
1659
1660 out:
1661 mutex_exit(&zp->z_lock);
1662 mutex_exit(&hdl->sa_lock);
1663 kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
1664 kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
1665 return (err);
1666 }
1667 #endif
1668
1669 static sa_idx_tab_t *
1670 sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
1671 {
1672 sa_idx_tab_t *idx_tab;
1673 sa_os_t *sa = os->os_sa;
1674 sa_lot_t *tb, search;
1675 avl_index_t loc;
1676
1677 /*
1678 * Determine layout number. If SA node and header == 0 then
1679 * force the index table to the dummy "1" empty layout.
1680 *
1681 * The layout number would only be zero for a newly created file
1682 * that has not added any attributes yet, or with crypto enabled which
1683 * doesn't write any attributes to the bonus buffer.
1684 */
1685
1686 search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
1687
1688 tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1689
1690 /* Verify header size is consistent with layout information */
1691 ASSERT(tb);
1692 ASSERT(IS_SA_BONUSTYPE(bonustype) &&
1693 SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
1694 (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
1695
1696 /*
1697 * See if any of the already existing TOC entries can be reused.
1698 */
1699
1700 for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
1701 idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
1702 boolean_t valid_idx = B_TRUE;
1703 int i;
1704
1705 if (tb->lot_var_sizes != 0 &&
1706 idx_tab->sa_variable_lengths != NULL) {
1707 for (i = 0; i != tb->lot_var_sizes; i++) {
1708 if (hdr->sa_lengths[i] !=
1709 idx_tab->sa_variable_lengths[i]) {
1710 valid_idx = B_FALSE;
1711 break;
1712 }
1713 }
1714 }
1715 if (valid_idx) {
1716 sa_idx_tab_hold(os, idx_tab);
1717 return (idx_tab);
1718 }
1719 }
1720
1721 /* No such luck, create a new entry */
1722 idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
1723 idx_tab->sa_idx_tab =
1724 kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
1725 idx_tab->sa_layout = tb;
1726 zfs_refcount_create(&idx_tab->sa_refcount);
1727 if (tb->lot_var_sizes)
1728 idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
1729 tb->lot_var_sizes, KM_SLEEP);
1730
1731 sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
1732 tb, idx_tab);
1733 sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
1734 sa_idx_tab_hold(os, idx_tab); /* one for layout */
1735 list_insert_tail(&tb->lot_idx_tab, idx_tab);
1736 return (idx_tab);
1737 }
1738
1739 void
1740 sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
1741 boolean_t start, void *userdata)
1742 {
1743 ASSERT(start);
1744
1745 *dataptr = userdata;
1746 *len = total_len;
1747 }
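/*
 * A custom locator lets the copy loop in sa_copy_data() consume an
 * attribute value that is not contiguous in memory.  Illustrative sketch
 * (the split_buf_t structure and its fields are hypothetical): the first
 * call (start == B_TRUE) returns the first chunk, the next call returns
 * the remainder, until total_len bytes have been produced.
 *
 *	typedef struct split_buf {
 *		void *sb_first;
 *		uint32_t sb_first_len;
 *		void *sb_second;
 *	} split_buf_t;
 *
 *	static void
 *	split_buf_locator(void **dataptr, uint32_t *len, uint32_t total_len,
 *	    boolean_t start, void *userdata)
 *	{
 *		split_buf_t *sb = userdata;
 *
 *		if (start) {
 *			*dataptr = sb->sb_first;
 *			*len = sb->sb_first_len;
 *		} else {
 *			*dataptr = sb->sb_second;
 *			*len = total_len - sb->sb_first_len;
 *		}
 *	}
 */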
1748
1749 static void
1750 sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
1751 {
1752 uint64_t attr_value = 0;
1753 sa_os_t *sa = hdl->sa_os->os_sa;
1754 sa_attr_table_t *tb = sa->sa_attr_table;
1755 int i;
1756
1757 mutex_enter(&sa->sa_lock);
1758
1759 if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
1760 mutex_exit(&sa->sa_lock);
1761 return;
1762 }
1763
1764 if (sa->sa_reg_attr_obj == 0) {
1765 sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
1766 DMU_OT_SA_ATTR_REGISTRATION,
1767 sa->sa_master_obj, SA_REGISTRY, tx);
1768 }
1769 for (i = 0; i != sa->sa_num_attrs; i++) {
1770 if (sa->sa_attr_table[i].sa_registered)
1771 continue;
1772 ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
1773 tb[i].sa_byteswap);
1774 VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
1775 tb[i].sa_name, 8, 1, &attr_value, tx));
1776 tb[i].sa_registered = B_TRUE;
1777 }
1778 sa->sa_need_attr_registration = B_FALSE;
1779 mutex_exit(&sa->sa_lock);
1780 }
1781
1782 /*
1783 * Replace all attributes with attributes specified in template.
1784 * If the dnode had a spill buffer then those attributes will
1785 * also be replaced, possibly with just an empty spill block.
1786 *
1787 * This interface is intended to only be used for bulk adding of
1788 * attributes for a new file. It will also be used by the ZPL
1789 	 * when converting an old-format znode to native SA support.
1790 */
1791 int
1792 sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1793 int attr_count, dmu_tx_t *tx)
1794 {
1795 sa_os_t *sa = hdl->sa_os->os_sa;
1796
1797 if (sa->sa_need_attr_registration)
1798 sa_attr_register_sync(hdl, tx);
1799 return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
1800 }
1801
1802 int
1803 sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1804 int attr_count, dmu_tx_t *tx)
1805 {
1806 int error;
1807
1808 mutex_enter(&hdl->sa_lock);
1809 error = sa_replace_all_by_template_locked(hdl, attr_desc,
1810 attr_count, tx);
1811 mutex_exit(&hdl->sa_lock);
1812 return (error);
1813 }
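
/*
 * A minimal sketch of the template interface, as it might be used when
 * populating a brand-new object (the attribute ids and the "vals" fields
 * below are hypothetical, for illustration only):
 *
 *	sa_bulk_attr_t attrs[3];
 *	int count = 0, error;
 *
 *	SA_ADD_BULK_ATTR(attrs, count, SA_HYPOTHETICAL_MODE, NULL,
 *	    &vals.mode, sizeof (uint64_t));
 *	SA_ADD_BULK_ATTR(attrs, count, SA_HYPOTHETICAL_SIZE, NULL,
 *	    &vals.size, sizeof (uint64_t));
 *	SA_ADD_BULK_ATTR(attrs, count, SA_HYPOTHETICAL_SYMLINK, NULL,
 *	    vals.target, strlen(vals.target));
 *
 *	error = sa_replace_all_by_template(hdl, attrs, count, tx);
 *
 * All of the attributes are laid out in a single pass; any previous
 * contents of the bonus buffer (and spill block, if one exists) are
 * replaced.
 */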
1814
1815 /*
1816 * Add/remove a single attribute or replace a variable-sized attribute value
1817 * with a value of a different size, and then rewrite the entire set
1818 * of attributes.
1819 * Same-length attribute value replacement (including fixed-length attributes)
1820 * is handled more efficiently by the upper layers.
1821 */
1822 static int
1823 sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
1824 sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
1825 uint16_t buflen, dmu_tx_t *tx)
1826 {
1827 sa_os_t *sa = hdl->sa_os->os_sa;
1828 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1829 dnode_t *dn;
1830 sa_bulk_attr_t *attr_desc;
1831 void *old_data[2];
1832 int bonus_attr_count = 0;
1833 int bonus_data_size = 0;
1834 int spill_data_size = 0;
1835 int spill_attr_count = 0;
1836 int error;
1837 uint16_t length, reg_length;
1838 int i, j, k, length_idx;
1839 sa_hdr_phys_t *hdr;
1840 sa_idx_tab_t *idx_tab;
1841 int attr_count;
1842 int count;
1843
1844 ASSERT(MUTEX_HELD(&hdl->sa_lock));
1845
1846 	/* First make a copy of the old data */
1847
1848 DB_DNODE_ENTER(db);
1849 dn = DB_DNODE(db);
1850 if (dn->dn_bonuslen != 0) {
1851 bonus_data_size = hdl->sa_bonus->db_size;
1852 old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
1853 bcopy(hdl->sa_bonus->db_data, old_data[0],
1854 hdl->sa_bonus->db_size);
1855 bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
1856 } else {
1857 old_data[0] = NULL;
1858 }
1859 DB_DNODE_EXIT(db);
1860
1861 /* Bring spill buffer online if it isn't currently */
1862
1863 if ((error = sa_get_spill(hdl)) == 0) {
1864 spill_data_size = hdl->sa_spill->db_size;
1865 old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
1866 bcopy(hdl->sa_spill->db_data, old_data[1],
1867 hdl->sa_spill->db_size);
1868 spill_attr_count =
1869 hdl->sa_spill_tab->sa_layout->lot_attr_count;
1870 } else if (error && error != ENOENT) {
1871 if (old_data[0])
1872 kmem_free(old_data[0], bonus_data_size);
1873 return (error);
1874 } else {
1875 old_data[1] = NULL;
1876 }
1877
1878 /* build descriptor of all attributes */
1879
1880 attr_count = bonus_attr_count + spill_attr_count;
1881 if (action == SA_ADD)
1882 attr_count++;
1883 else if (action == SA_REMOVE)
1884 attr_count--;
1885
1886 attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
1887
1888 /*
1889 	 * Loop through the bonus and spill buffers (if the latter exists)
1890 	 * and build up a new attribute descriptor to rewrite the attributes.
1891 */
1892 k = j = 0;
1893 count = bonus_attr_count;
1894 hdr = SA_GET_HDR(hdl, SA_BONUS);
1895 idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
1896 for (; k != 2; k++) {
1897 /*
1898 * Iterate over each attribute in layout. Fetch the
1899 * size of variable-length attributes needing rewrite
1900 * from sa_lengths[].
1901 */
1902 for (i = 0, length_idx = 0; i != count; i++) {
1903 sa_attr_type_t attr;
1904
1905 attr = idx_tab->sa_layout->lot_attrs[i];
1906 reg_length = SA_REGISTERED_LEN(sa, attr);
1907 if (reg_length == 0) {
1908 length = hdr->sa_lengths[length_idx];
1909 length_idx++;
1910 } else {
1911 length = reg_length;
1912 }
1913 if (attr == newattr) {
1914 /*
1915 * There is nothing to do for SA_REMOVE,
1916 * so it is just skipped.
1917 */
1918 if (action == SA_REMOVE)
1919 continue;
1920
1921 /*
1922 * Duplicate attributes are not allowed, so the
1923 * action can not be SA_ADD here.
1924 */
1925 ASSERT3S(action, ==, SA_REPLACE);
1926
1927 /*
1928 * Only a variable-sized attribute can be
1929 * replaced here, and its size must be changing.
1930 */
1931 ASSERT3U(reg_length, ==, 0);
1932 ASSERT3U(length, !=, buflen);
1933 SA_ADD_BULK_ATTR(attr_desc, j, attr,
1934 locator, datastart, buflen);
1935 } else {
1936 SA_ADD_BULK_ATTR(attr_desc, j, attr,
1937 NULL, (void *)
1938 (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
1939 (uintptr_t)old_data[k]), length);
1940 }
1941 }
1942 if (k == 0 && hdl->sa_spill) {
1943 hdr = SA_GET_HDR(hdl, SA_SPILL);
1944 idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
1945 count = spill_attr_count;
1946 } else {
1947 break;
1948 }
1949 }
1950 if (action == SA_ADD) {
1951 reg_length = SA_REGISTERED_LEN(sa, newattr);
1952 IMPLY(reg_length != 0, reg_length == buflen);
1953 SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
1954 datastart, buflen);
1955 }
1956 ASSERT3U(j, ==, attr_count);
1957
1958 error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
1959
1960 if (old_data[0])
1961 kmem_free(old_data[0], bonus_data_size);
1962 if (old_data[1])
1963 kmem_free(old_data[1], spill_data_size);
1964 kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
1965
1966 return (error);
1967 }
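
/*
 * Worked example (hypothetical attribute numbers): for a bonus layout of
 * [ MODE, SIZE, XATTR ], where only XATTR is variable-sized, a request to
 * remove SIZE causes the loop above to build a descriptor equivalent to:
 *
 *	SA_ADD_BULK_ATTR(attr_desc, j, MODE, NULL,
 *	    old_data[0] + TOC_OFF(idx_tab->sa_idx_tab[MODE]),
 *	    SA_REGISTERED_LEN(sa, MODE));
 *	SA_ADD_BULK_ATTR(attr_desc, j, XATTR, NULL,
 *	    old_data[0] + TOC_OFF(idx_tab->sa_idx_tab[XATTR]),
 *	    hdr->sa_lengths[0]);
 *
 * The surviving attributes simply point back into the saved copy of the
 * old buffers, and sa_build_layouts() then rewrites everything under a
 * (possibly different) layout.
 */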
1968
1969 static int
1970 sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
1971 dmu_tx_t *tx)
1972 {
1973 int error;
1974 sa_os_t *sa = hdl->sa_os->os_sa;
1975 dmu_object_type_t bonustype;
1976
1977 bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
1978
1979 ASSERT(hdl);
1980 ASSERT(MUTEX_HELD(&hdl->sa_lock));
1981
1982 /* sync out registration table if necessary */
1983 if (sa->sa_need_attr_registration)
1984 sa_attr_register_sync(hdl, tx);
1985
1986 error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
1987 if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
1988 sa->sa_update_cb(hdl, tx);
1989
1990 return (error);
1991 }
1992
1993 /*
1994  * Update or add a new attribute
1995 */
1996 int
1997 sa_update(sa_handle_t *hdl, sa_attr_type_t type,
1998 void *buf, uint32_t buflen, dmu_tx_t *tx)
1999 {
2000 int error;
2001 sa_bulk_attr_t bulk;
2002
2003 bulk.sa_attr = type;
2004 bulk.sa_data_func = NULL;
2005 bulk.sa_length = buflen;
2006 bulk.sa_data = buf;
2007
2008 mutex_enter(&hdl->sa_lock);
2009 error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
2010 mutex_exit(&hdl->sa_lock);
2011 return (error);
2012 }
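
/*
 * A minimal usage sketch, assuming the caller already holds an SA handle
 * and an assigned transaction (the attribute id below is hypothetical):
 *
 *	uint64_t newsize = 8192;
 *	int error;
 *
 *	error = sa_update(hdl, SA_HYPOTHETICAL_SIZE, &newsize,
 *	    sizeof (newsize), tx);
 *
 * Same-length value replacement is handled in place; changing the size of
 * a variable-length attribute is what sa_modify_attrs() above exists for.
 */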
2013
2014 int
2015 sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
2016 uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
2017 {
2018 int error;
2019 sa_bulk_attr_t bulk;
2020
2021 bulk.sa_attr = attr;
2022 bulk.sa_data = userdata;
2023 bulk.sa_data_func = locator;
2024 bulk.sa_length = buflen;
2025
2026 mutex_enter(&hdl->sa_lock);
2027 error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
2028 mutex_exit(&hdl->sa_lock);
2029 return (error);
2030 }
2031
2032 /*
2033 * Return size of an attribute
2034 */
2035
2036 int
2037 sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
2038 {
2039 sa_bulk_attr_t bulk;
2040 int error;
2041
2042 bulk.sa_data = NULL;
2043 bulk.sa_attr = attr;
2044 bulk.sa_data_func = NULL;
2045
2046 ASSERT(hdl);
2047 mutex_enter(&hdl->sa_lock);
2048 if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
2049 mutex_exit(&hdl->sa_lock);
2050 return (error);
2051 }
2052 *size = bulk.sa_size;
2053
2054 mutex_exit(&hdl->sa_lock);
2055 return (0);
2056 }
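
/*
 * A minimal sketch of the usual pattern for variable-length attributes,
 * with a hypothetical attribute id and error handling elided:
 *
 *	int size, error;
 *	void *buf;
 *
 *	error = sa_size(hdl, SA_HYPOTHETICAL_XATTR, &size);
 *	buf = kmem_alloc(size, KM_SLEEP);
 *	error = sa_lookup(hdl, SA_HYPOTHETICAL_XATTR, buf, size);
 *	...
 *	kmem_free(buf, size);
 */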
2057
2058 int
2059 sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
2060 {
2061 ASSERT(hdl);
2062 ASSERT(MUTEX_HELD(&hdl->sa_lock));
2063 return (sa_lookup_impl(hdl, attrs, count));
2064 }
2065
2066 int
2067 sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
2068 {
2069 int error;
2070
2071 ASSERT(hdl);
2072 mutex_enter(&hdl->sa_lock);
2073 error = sa_bulk_lookup_locked(hdl, attrs, count);
2074 mutex_exit(&hdl->sa_lock);
2075 return (error);
2076 }
2077
2078 int
2079 sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
2080 {
2081 int error;
2082
2083 ASSERT(hdl);
2084 mutex_enter(&hdl->sa_lock);
2085 error = sa_bulk_update_impl(hdl, attrs, count, tx);
2086 mutex_exit(&hdl->sa_lock);
2087 return (error);
2088 }
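
/*
 * The bulk interfaces amortize the handle lock and TOC walk across several
 * attributes. A minimal sketch of a bulk lookup, with hypothetical
 * attribute ids:
 *
 *	sa_bulk_attr_t attrs[2];
 *	int count = 0, error;
 *	uint64_t mode, size;
 *
 *	SA_ADD_BULK_ATTR(attrs, count, SA_HYPOTHETICAL_MODE, NULL,
 *	    &mode, sizeof (mode));
 *	SA_ADD_BULK_ATTR(attrs, count, SA_HYPOTHETICAL_SIZE, NULL,
 *	    &size, sizeof (size));
 *
 *	error = sa_bulk_lookup(hdl, attrs, count);
 *
 * sa_bulk_update() follows the same pattern but additionally takes the
 * dmu_tx_t for the dirtying transaction.
 */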
2089
2090 int
2091 sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
2092 {
2093 int error;
2094
2095 mutex_enter(&hdl->sa_lock);
2096 error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
2097 NULL, 0, tx);
2098 mutex_exit(&hdl->sa_lock);
2099 return (error);
2100 }
2101
2102 void
2103 sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
2104 {
2105 dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
2106 }
2107
2108 void
2109 sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
2110 {
2111 dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
2112 blksize, nblocks);
2113 }
2114
2115 void
2116 sa_set_userp(sa_handle_t *hdl, void *ptr)
2117 {
2118 hdl->sa_userp = ptr;
2119 }
2120
2121 dmu_buf_t *
2122 sa_get_db(sa_handle_t *hdl)
2123 {
2124 return ((dmu_buf_t *)hdl->sa_bonus);
2125 }
2126
2127 void *
2128 sa_get_userdata(sa_handle_t *hdl)
2129 {
2130 return (hdl->sa_userp);
2131 }
2132
2133 void
2134 sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
2135 {
2136 ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
2137 os->os_sa->sa_update_cb = func;
2138 }
2139
2140 void
2141 sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
2142 {
2143
2144 mutex_enter(&os->os_sa->sa_lock);
2145 sa_register_update_callback_locked(os, func);
2146 mutex_exit(&os->os_sa->sa_lock);
2147 }
2148
2149 uint64_t
2150 sa_handle_object(sa_handle_t *hdl)
2151 {
2152 return (hdl->sa_bonus->db_object);
2153 }
2154
2155 boolean_t
2156 sa_enabled(objset_t *os)
2157 {
2158 return (os->os_sa == NULL);
2159 }
2160
2161 int
2162 sa_set_sa_object(objset_t *os, uint64_t sa_object)
2163 {
2164 sa_os_t *sa = os->os_sa;
2165
2166 if (sa->sa_master_obj)
2167 return (1);
2168
2169 sa->sa_master_obj = sa_object;
2170
2171 return (0);
2172 }
2173
2174 int
2175 sa_hdrsize(void *arg)
2176 {
2177 sa_hdr_phys_t *hdr = arg;
2178
2179 return (SA_HDR_SIZE(hdr));
2180 }
2181
2182 void
2183 sa_handle_lock(sa_handle_t *hdl)
2184 {
2185 ASSERT(hdl);
2186 mutex_enter(&hdl->sa_lock);
2187 }
2188
2189 void
2190 sa_handle_unlock(sa_handle_t *hdl)
2191 {
2192 ASSERT(hdl);
2193 mutex_exit(&hdl->sa_lock);
2194 }
2195