// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
#include <linux/mm_compat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we need to grow the block size then lock the whole file range.
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}

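/*
 * kmem cache constructor/destructor for znodes.  The embedded inode and
 * the znode locks are initialized once per cache object, and the
 * destructor verifies each object has been returned to that pristine
 * state before it goes back to the cache.
 */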
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rangelock_fini(&zp->z_rangelock);

	ASSERT3P(zp->z_dirlocks, ==, NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);

	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	zh->zh_refcount = 0;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize the znode cache.  The KMC_SLAB hint is used so that
	 * the cache is backed by the Linux slab allocator, which ensures
	 * that any wait_on_bit() operations on the embedded inode operate
	 * properly.
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL,
	    KMC_SLAB | KMC_RECLAIMABLE);

	ASSERT(znode_hold_cache == NULL);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Clean up the znode caches.
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed.  This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks.  This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks mutex
 *    which just happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared.  This minimizes lock contention without creating a
 *    large number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */
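/*
 * Typical usage, as in zfs_zget() and zfs_znode_delete() below:
 *
 *	zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	... instantiate or tear down the znode and its SA handle ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 */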
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}

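/*
 * On Linux the 64-bit device number is already in its final form, so no
 * compression/expansion is required here.
 */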
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

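/*
 * Select the inode, file, and address space operations appropriate for
 * the file type encoded in the inode's mode.
 */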
static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
		ip->i_op = &zpl_dir_inode_operations;
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is stored in a SA only for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
		    sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
}

/*
 * Update the embedded inode given the znode.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode	*ip;
	uint32_t	blksize;
	u_longlong_t	i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}

/*
 * Construct a znode+inode and initialize.
 *
 * This does not call dmu_set_user(); that is up to the caller to do,
 * in case you don't want to return the znode.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed.  This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty.  Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

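/*
 * Zero-filled templates used below to initialize the corresponding
 * attributes of newly created DMU_OT_ZNODE (pre-SA) objects.
 */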
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is of O_TMPFILE
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	dmu_buf_t	*db;
	inode_timespec_t now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t	*zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.  There's currently no mechanism for
	 * pre-reading the blocks that will be needed to allocate a new
	 * object, so we accept the small chance that there will be an i/o
	 * error and we will fail one of the assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With ZFS_PROJID flag, we can easily know whether there is
		 * project ID stored on disk or not. See zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Set up the array of attributes to be replaced/set on the new file.
	 *
	 * The order for DMU_OT_ZNODE is critical since it needs to be
	 * constructed in the old znode_phys_t format.  Don't change this
	 * ordering.
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security().  Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Update in-core attributes.  It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

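/*
 * Look up a znode by object number.  If the znode is already in core a
 * new inode reference is taken; otherwise the znode/inode pair is
 * constructed from the on-disk object.
 */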
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	znode_hold_t	*zh;
	int err;
	sa_handle_t	*hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we should never find
		 * a sa handle that doesn't know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered. Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete.  The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback.  The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}

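/*
 * Reattach an existing in-core znode to its backing on-disk object,
 * e.g. after the objset has been rolled back or a receive completed.
 * All cached attributes are discarded and reloaded from disk.
 */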
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * This will cause funny behaviour for the mounted snapdirs.
	 * Especially for Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as long
	 * as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive().  The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

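/*
 * Free the on-disk object backing the znode, along with any external
 * ACL object, and detach the SA handle.
 */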
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

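/*
 * Called when the inode is being evicted; release the znode's hold on
 * the object and, if the file has been unlinked, remove it.
 */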
void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only.  That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed.  The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Determine whether the znode's atime must be updated.  The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now, tmp_atime, tmp_ts;

	gethrestime(&now);
	tmp_atime = zpl_inode_get_atime(ip);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is earlier than either the ctime or mtime or if at least a day
	 * has passed since the last update of atime.
	 */
	tmp_ts = zpl_inode_get_mtime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	tmp_ts = zpl_inode_get_ctime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now, tmp_ts;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&tmp_ts, mtime);
		zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&tmp_ts, ctime);
		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property.  Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t	off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zfs_locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries.  This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zn_has_cached_data(zp, off, off + len - 1)) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		zfs_rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);
	zfs_rangelock_exit(lr);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations.  For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}

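/*
 * Create the initial ZPL layout for a new objset: the master node, the
 * SA attribute registration (for ZPL version >= SA), the unlinked
 * (delete) queue, and the root directory znode.  Called with an open
 * transaction when the file system objset is created.
 */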
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfsvfs_t	*zfsvfs;
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		size;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.  In an empty objset, there
	 * are no blocks to read and thus there can be no i/o errors (which
	 * we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		const char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
	ASSERT(error == 0);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
1976