1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2025, Klara, Inc.
29 */
30
31 /* Portions Copyright 2007 Jeremy Teo */
32 /* Portions Copyright 2010 Robert Milkowski */
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/time.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/file.h>
40 #include <sys/stat.h>
41 #include <sys/kmem.h>
42 #include <sys/taskq.h>
43 #include <sys/uio.h>
44 #include <sys/vmsystm.h>
45 #include <sys/atomic.h>
46 #include <sys/pathname.h>
47 #include <sys/cmn_err.h>
48 #include <sys/errno.h>
49 #include <sys/zfs_dir.h>
50 #include <sys/zfs_acl.h>
51 #include <sys/zfs_ioctl.h>
52 #include <sys/fs/zfs.h>
53 #include <sys/dmu.h>
54 #include <sys/dmu_objset.h>
55 #include <sys/spa.h>
56 #include <sys/txg.h>
57 #include <sys/dbuf.h>
58 #include <sys/zap.h>
59 #include <sys/sa.h>
60 #include <sys/policy.h>
61 #include <sys/sunddi.h>
62 #include <sys/sid.h>
63 #include <sys/zfs_ctldir.h>
64 #include <sys/zfs_fuid.h>
65 #include <sys/zfs_quota.h>
66 #include <sys/zfs_sa.h>
67 #include <sys/zfs_vnops.h>
68 #include <sys/zfs_rlock.h>
69 #include <sys/cred.h>
70 #include <sys/zpl.h>
71 #include <sys/zil.h>
72 #include <sys/sa_impl.h>
73 #include <linux/mm_compat.h>
74
75 /*
76 * Programming rules.
77 *
78 * Each vnode op performs some logical unit of work. To do this, the ZPL must
79 * properly lock its in-core state, create a DMU transaction, do the work,
80 * record this work in the intent log (ZIL), commit the DMU transaction,
81 * and wait for the intent log to commit if it is a synchronous operation.
82 * Moreover, the vnode ops must work in both normal and log replay context.
83 * The ordering of events is important to avoid deadlocks and references
84 * to freed memory. The example below illustrates the following Big Rules:
85 *
86 * (1) A check must be made in each zfs thread for a mounted file system.
87 * This is done avoiding races using zfs_enter(zfsvfs).
88 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes
89 * must be checked with zfs_verify_zp(zp). Both of these macros
90 * can return EIO from the calling function.
91 *
92 * (2) zrele() should always be the last thing except for zil_commit() (if
93 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
94 * last reference, the vnode/znode can be freed, so the zp may point to
95 * freed memory. Second, the last reference will call zfs_zinactive(),
96 * which may induce a lot of work -- pushing cached pages (which acquires
97 * range locks) and syncing out cached atime changes. Third,
98 * zfs_zinactive() may require a new tx, which could deadlock the system
99 * if you were already holding one. This deadlock occurs because the tx
100 * currently being operated on prevents a txg from syncing, which
101 * prevents the new tx from progressing, resulting in a deadlock. If you
102 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
103 * is a synonym for zrele().
104 *
105 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
106 * as they can span dmu_tx_assign() calls.
107 *
108 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
109 * dmu_tx_assign(). This is critical because we don't want to block
110 * while holding locks.
111 *
112 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
113 * This reduces lock contention and CPU usage when we must wait (note
114 * that if throughput is constrained by the storage, nearly every
115 * transaction must wait).
116 *
117 * Note, in particular, that if a lock is sometimes acquired before
118 * the tx assigns, and sometimes after (e.g. z_lock), then failing
119 * to use a non-blocking assign can deadlock the system. The scenario:
120 *
121 * Thread A has grabbed a lock before calling dmu_tx_assign().
122 * Thread B is in an already-assigned tx, and blocks for this lock.
123 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
124 * txg_wait_open() forever, because the previous txg can't quiesce
125 * until B's tx commits.
126 *
127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
128 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
129 * again. On subsequent calls to dmu_tx_assign(), pass
130 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
131 * this operation has already called dmu_tx_wait(). This will ensure
132 * that we don't retry forever, waiting a short bit each time.
133 *
134 * (5) If the operation succeeded, generate the intent log entry for it
135 * before dropping locks. This ensures that the ordering of events
136 * in the intent log matches the order in which they actually occurred.
137 * During ZIL replay the zfs_log_* functions will update the sequence
138 * number to indicate the zil transaction has replayed.
139 *
140 * (6) At the end of each vnode op, the DMU tx must always commit,
141 * regardless of whether there were any errors.
142 *
143 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
144 * to ensure that synchronous semantics are provided when necessary.
145 *
146 * In general, this is how things should be ordered in each vnode op:
147 *
148 * zfs_enter(zfsvfs); // exit if unmounted
149 * top:
150 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
151 * rw_enter(...); // grab any other locks you need
152 * tx = dmu_tx_create(...); // get DMU tx
153 * dmu_tx_hold_*(); // hold each object you might modify
154 * error = dmu_tx_assign(tx,
155 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
156 * if (error) {
157 * rw_exit(...); // drop locks
158 * zfs_dirent_unlock(dl); // unlock directory entry
159 * zrele(...); // release held znodes
160 * if (error == ERESTART) {
161 * waited = B_TRUE;
162 * dmu_tx_wait(tx);
163 * dmu_tx_abort(tx);
164 * goto top;
165 * }
166 * dmu_tx_abort(tx); // abort DMU tx
167 * zfs_exit(zfsvfs); // finished in zfs
168 * return (error); // really out of space
169 * }
170 * error = do_real_work(); // do whatever this VOP does
171 * if (error == 0)
172 * zfs_log_*(...); // on success, make ZIL entry
173 * dmu_tx_commit(tx); // commit DMU tx -- error or not
174 * rw_exit(...); // drop locks
175 * zfs_dirent_unlock(dl); // unlock directory entry
176 * zrele(...); // release held znodes
177 * zil_commit(zilog, foid); // synchronous when necessary
178 * zfs_exit(zfsvfs); // finished in zfs
179 * return (error); // done, report error
180 */
181 int
zfs_open(struct inode * ip,int mode,int flag,cred_t * cr)182 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
183 {
184 (void) cr;
185 znode_t *zp = ITOZ(ip);
186 zfsvfs_t *zfsvfs = ITOZSB(ip);
187 int error;
188
189 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
190 return (error);
191
192 /* Honor ZFS_APPENDONLY file attribute */
193 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
194 ((flag & O_APPEND) == 0)) {
195 zfs_exit(zfsvfs, FTAG);
196 return (SET_ERROR(EPERM));
197 }
198
199 /*
200 * Keep a count of the synchronous opens in the znode. On first
201 * synchronous open we must convert all previous async transactions
202 * into sync to keep correct ordering.
203 * Skip it for snapshot, as it won't have any transactions.
204 */
205 if (!zfsvfs->z_issnap && (flag & O_SYNC)) {
206 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
207 zil_async_to_sync(zfsvfs->z_log, zp->z_id);
208 }
209
210 zfs_exit(zfsvfs, FTAG);
211 return (0);
212 }
213
214 int
zfs_close(struct inode * ip,int flag,cred_t * cr)215 zfs_close(struct inode *ip, int flag, cred_t *cr)
216 {
217 (void) cr;
218 znode_t *zp = ITOZ(ip);
219 zfsvfs_t *zfsvfs = ITOZSB(ip);
220 int error;
221
222 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
223 return (error);
224
225 /* Decrement the synchronous opens in the znode */
226 if (!zfsvfs->z_issnap && (flag & O_SYNC))
227 atomic_dec_32(&zp->z_sync_cnt);
228
229 zfs_exit(zfsvfs, FTAG);
230 return (0);
231 }
232
233 #if defined(_KERNEL)
234
/*
 * Forward declaration: mappedread() below calls zfs_fillpage() to populate a
 * cached page that it finds not up to date.
 */
static int zfs_fillpage(struct inode *ip, struct page *pp);
236
237 /*
238 * When a file is memory mapped, we must keep the IO data synchronized
239 * between the DMU cache and the memory mapped pages. Update all mapped
240 * pages with the contents of the coresponding dmu buffer.
241 */
242 void
update_pages(znode_t * zp,int64_t start,int len,objset_t * os)243 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
244 {
245 struct address_space *mp = ZTOI(zp)->i_mapping;
246 int64_t off = start & (PAGE_SIZE - 1);
247
248 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
249 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
250
251 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
252 if (pp) {
253 if (mapping_writably_mapped(mp))
254 flush_dcache_page(pp);
255
256 void *pb = kmap(pp);
257 int error = dmu_read(os, zp->z_id, start + off,
258 nbytes, pb + off, DMU_READ_PREFETCH);
259 kunmap(pp);
260
261 if (error) {
262 SetPageError(pp);
263 ClearPageUptodate(pp);
264 } else {
265 ClearPageError(pp);
266 SetPageUptodate(pp);
267
268 if (mapping_writably_mapped(mp))
269 flush_dcache_page(pp);
270
271 mark_page_accessed(pp);
272 }
273
274 unlock_page(pp);
275 put_page(pp);
276 }
277
278 len -= nbytes;
279 off = 0;
280 }
281 }
282
283 /*
284 * When a file is memory mapped, we must keep the I/O data synchronized
285 * between the DMU cache and the memory mapped pages. Preferentially read
286 * from memory mapped pages, otherwise fallback to reading through the dmu.
287 */
288 int
mappedread(znode_t * zp,int nbytes,zfs_uio_t * uio)289 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
290 {
291 struct inode *ip = ZTOI(zp);
292 struct address_space *mp = ip->i_mapping;
293 int64_t start = uio->uio_loffset;
294 int64_t off = start & (PAGE_SIZE - 1);
295 int len = nbytes;
296 int error = 0;
297
298 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
299 uint64_t bytes = MIN(PAGE_SIZE - off, len);
300
301 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
302 if (pp) {
303
304 /*
305 * If filemap_fault() retries there exists a window
306 * where the page will be unlocked and not up to date.
307 * In this case we must try and fill the page.
308 */
309 if (unlikely(!PageUptodate(pp))) {
310 error = zfs_fillpage(ip, pp);
311 if (error) {
312 unlock_page(pp);
313 put_page(pp);
314 return (error);
315 }
316 }
317
318 ASSERT(PageUptodate(pp) || PageDirty(pp));
319
320 unlock_page(pp);
321
322 void *pb = kmap(pp);
323 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
324 kunmap(pp);
325
326 if (mapping_writably_mapped(mp))
327 flush_dcache_page(pp);
328
329 mark_page_accessed(pp);
330 put_page(pp);
331 } else {
332 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
333 uio, bytes, DMU_READ_PREFETCH);
334 }
335
336 len -= bytes;
337 off = 0;
338
339 if (error)
340 break;
341 }
342
343 return (error);
344 }
345 #endif /* _KERNEL */
346
/*
 * Size threshold, in units of the file's block size, used by zfs_remove():
 * a file whose z_size exceeds z_blksz * zfs_delete_blocks is flagged
 * "toobig", which rules out the immediate (delete_now) removal path.
 */
static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
348
349 /*
350 * Write the bytes to a file.
351 *
352 * IN: zp - znode of file to be written to
353 * data - bytes to write
354 * len - number of bytes to write
355 * pos - offset to start writing at
356 *
357 * OUT: resid - remaining bytes to write
358 *
359 * RETURN: 0 if success
360 * positive error code if failure. EIO is returned
361 * for a short write when residp isn't provided.
362 *
363 * Timestamps:
364 * zp - ctime|mtime updated if byte count > 0
365 */
366 int
zfs_write_simple(znode_t * zp,const void * data,size_t len,loff_t pos,size_t * residp)367 zfs_write_simple(znode_t *zp, const void *data, size_t len,
368 loff_t pos, size_t *residp)
369 {
370 fstrans_cookie_t cookie;
371 int error;
372
373 struct iovec iov;
374 iov.iov_base = (void *)data;
375 iov.iov_len = len;
376
377 zfs_uio_t uio;
378 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
379
380 cookie = spl_fstrans_mark();
381 error = zfs_write(zp, &uio, 0, kcred);
382 spl_fstrans_unmark(cookie);
383
384 if (error == 0) {
385 if (residp != NULL)
386 *residp = zfs_uio_resid(&uio);
387 else if (zfs_uio_resid(&uio) != 0)
388 error = SET_ERROR(EIO);
389 }
390
391 return (error);
392 }
393
/*
 * Taskq callback dispatched by zfs_zrele_async(): releases the inode
 * reference passed in arg.
 */
static void
zfs_rele_async_task(void *arg)
{
	struct inode *ip = arg;

	iput(ip);
}
399
400 void
zfs_zrele_async(znode_t * zp)401 zfs_zrele_async(znode_t *zp)
402 {
403 struct inode *ip = ZTOI(zp);
404 objset_t *os = ITOZSB(ip)->z_os;
405
406 ASSERT(atomic_read(&ip->i_count) > 0);
407 ASSERT(os != NULL);
408
409 /*
410 * If decrementing the count would put us at 0, we can't do it inline
411 * here, because that would be synchronous. Instead, dispatch an iput
412 * to run later.
413 *
414 * For more information on the dangers of a synchronous iput, see the
415 * header comment of this file.
416 */
417 if (!atomic_add_unless(&ip->i_count, -1, 1)) {
418 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
419 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
420 }
421 }
422
423
424 /*
425 * Lookup an entry in a directory, or an extended attribute directory.
426 * If it exists, return a held inode reference for it.
427 *
428 * IN: zdp - znode of directory to search.
429 * nm - name of entry to lookup.
430 * flags - LOOKUP_XATTR set if looking for an attribute.
431 * cr - credentials of caller.
432 * direntflags - directory lookup flags
433 * realpnp - returned pathname.
434 *
435 * OUT: zpp - znode of located entry, NULL if not found.
436 *
437 * RETURN: 0 on success, error code on failure.
438 *
439 * Timestamps:
440 * NA
441 */
442 int
zfs_lookup(znode_t * zdp,char * nm,znode_t ** zpp,int flags,cred_t * cr,int * direntflags,pathname_t * realpnp)443 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
444 int *direntflags, pathname_t *realpnp)
445 {
446 zfsvfs_t *zfsvfs = ZTOZSB(zdp);
447 int error = 0;
448
449 /*
450 * Fast path lookup, however we must skip DNLC lookup
451 * for case folding or normalizing lookups because the
452 * DNLC code only stores the passed in name. This means
453 * creating 'a' and removing 'A' on a case insensitive
454 * file system would work, but DNLC still thinks 'a'
455 * exists and won't let you create it again on the next
456 * pass through fast path.
457 */
458 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
459
460 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
461 return (SET_ERROR(ENOTDIR));
462 } else if (zdp->z_sa_hdl == NULL) {
463 return (SET_ERROR(EIO));
464 }
465
466 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
467 error = zfs_fastaccesschk_execute(zdp, cr);
468 if (!error) {
469 *zpp = zdp;
470 zhold(*zpp);
471 return (0);
472 }
473 return (error);
474 }
475 }
476
477 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
478 return (error);
479
480 *zpp = NULL;
481
482 if (flags & LOOKUP_XATTR) {
483 /*
484 * We don't allow recursive attributes..
485 * Maybe someday we will.
486 */
487 if (zdp->z_pflags & ZFS_XATTR) {
488 zfs_exit(zfsvfs, FTAG);
489 return (SET_ERROR(EINVAL));
490 }
491
492 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
493 zfs_exit(zfsvfs, FTAG);
494 return (error);
495 }
496
497 /*
498 * Do we have permission to get into attribute directory?
499 */
500
501 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
502 B_TRUE, cr, zfs_init_idmap))) {
503 zrele(*zpp);
504 *zpp = NULL;
505 }
506
507 zfs_exit(zfsvfs, FTAG);
508 return (error);
509 }
510
511 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
512 zfs_exit(zfsvfs, FTAG);
513 return (SET_ERROR(ENOTDIR));
514 }
515
516 /*
517 * Check accessibility of directory.
518 */
519
520 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
521 zfs_init_idmap))) {
522 zfs_exit(zfsvfs, FTAG);
523 return (error);
524 }
525
526 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
527 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
528 zfs_exit(zfsvfs, FTAG);
529 return (SET_ERROR(EILSEQ));
530 }
531
532 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
533 if ((error == 0) && (*zpp))
534 zfs_znode_update_vfs(*zpp);
535
536 zfs_exit(zfsvfs, FTAG);
537 return (error);
538 }
539
540 /*
541 * Perform a linear search in directory for the name of specific inode.
542 * Note we don't pass in the buffer size of name because it's hardcoded to
543 * NAME_MAX+1(256) in Linux.
544 *
545 * IN: dzp - znode of directory to search.
546 * zp - znode of the target
547 *
548 * OUT: name - dentry name of the target
549 *
550 * RETURN: 0 on success, error code on failure.
551 */
552 int
zfs_get_name(znode_t * dzp,char * name,znode_t * zp)553 zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
554 {
555 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
556 int error = 0;
557
558 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
559 return (error);
560
561 if ((error = zfs_verify_zp(zp)) != 0) {
562 zfs_exit(zfsvfs, FTAG);
563 return (error);
564 }
565
566 /* ctldir should have got their name in zfs_vget */
567 if (dzp->z_is_ctldir || zp->z_is_ctldir) {
568 zfs_exit(zfsvfs, FTAG);
569 return (ENOENT);
570 }
571
572 /* buffer len is hardcoded to 256 in Linux kernel */
573 error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
574 ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);
575
576 zfs_exit(zfsvfs, FTAG);
577 return (error);
578 }
579
/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the znode of the created or trunc'd file.
 *
 *	IN:	dzp	- znode of directory to put new file entry in.
 *		name	- name of new file entry.  Must not be NULL; an
 *			  empty name refers to dzp itself.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- file flag (FIGNORECASE, O_APPEND and
 *			  ATTR_NOACLCHECK are examined).
 *		vsecp	- ACL to be set
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	zpp	- znode of created or trunc'd entry.  Reset to NULL
 *			  at the start of each attempt and set to the held
 *			  znode once the create/truncate step has succeeded.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	 zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	objset_t *os;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	/* B_TRUE while acl_ids holds state that must be freed before exit. */
	boolean_t have_acl = B_FALSE;
	/* B_TRUE once dmu_tx_wait() has been called for this operation. */
	boolean_t waited = B_FALSE;
	boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* On a utf8only file system the name must be valid UTF-8. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Retry point: control returns here after dmu_tx_assign() reports
	 * ERESTART.  acl_ids (if already built) is kept across retries.
	 */
top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			/* A failed lock on ".." is reported as EISDIR. */
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* Build the ACL ids only once; a retry reuses them. */
		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		/* Regular files and directories inherit dzp's project id. */
		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		/*
		 * The dirent lock is held, so the assign must not block
		 * (rule 4 in the header comment of this file).
		 */
		error = dmu_tx_assign(tx,
		    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				/* Wait with no locks held, then start over. */
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since, we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		/* Log the create in the ZIL before the tx is committed. */
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		/* The entry exists, so ACL ids from an earlier try are moot. */
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		/* Give back the hold on the looked-up or new znode. */
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	/*
	 * With sync=always, commit the ZIL before returning; a commit
	 * failure becomes the return value.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
850
/*
 * Create a new file object that has no directory entry.  The new znode is
 * created with the IS_TMPFILE flag, marked unlinked and added to the
 * unlinked set in the same transaction.
 *
 *	IN:	dip	- inode of the directory the file is created under;
 *			  used for the access check, ACL ids and project id.
 *		vap	- attributes of new file.
 *		excl	- unused.
 *		mode	- unused.
 *		cr	- credentials of caller.
 *		flag	- unused.
 *		vsecp	- ACL to be set
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	ipp	- inode of the created file; NULL on failure.
 *
 *	RETURN:	0 on success, error code on failure.
 */
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	boolean_t fuid_dirtied;
	/* B_TRUE while acl_ids holds state that must be freed before exit. */
	boolean_t have_acl = B_FALSE;
	/* B_TRUE once dmu_tx_wait() has been called for this operation. */
	boolean_t waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Retry point: control returns here after dmu_tx_assign() reports
	 * ERESTART.  acl_ids (if already built) is kept across retries.
	 */
top:
	*ipp = NULL;

	/*
	 * The caller needs permission to add a file to the directory, even
	 * though no directory entry is created here.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	/* Build the ACL ids only once; a retry reuses them. */
	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	/* Regular files and directories inherit dzp's project id. */
	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	/* The new object is going to be added to the unlinked set. */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			/* Wait for the tx to become assignable and retry. */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	if (error) {
		/* zp is still NULL on every error path that reaches here. */
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
972
973 /*
974 * Remove an entry from a directory.
975 *
976 * IN: dzp - znode of directory to remove entry from.
977 * name - name of entry to remove.
978 * cr - credentials of caller.
979 * flags - case flags.
980 *
981 * RETURN: 0 if success
982 * error code if failure
983 *
984 * Timestamps:
985 * dzp - ctime|mtime
986 * ip - ctime (if nlink > 0)
987 */
988
/*
 * Zero value that zfs_remove() writes into the SA_ZPL_XATTR attribute of a
 * non-SA znode to clear its reference to the xattr directory.
 */
static uint64_t null_xattr = 0;
990
991 int
zfs_remove(znode_t * dzp,char * name,cred_t * cr,int flags)992 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
993 {
994 znode_t *zp;
995 znode_t *xzp;
996 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
997 zilog_t *zilog;
998 uint64_t acl_obj, xattr_obj;
999 uint64_t xattr_obj_unlinked = 0;
1000 uint64_t obj = 0;
1001 uint64_t links;
1002 zfs_dirlock_t *dl;
1003 dmu_tx_t *tx;
1004 boolean_t may_delete_now, delete_now = FALSE;
1005 boolean_t unlinked, toobig = FALSE;
1006 uint64_t txtype;
1007 pathname_t *realnmp = NULL;
1008 pathname_t realnm;
1009 int error;
1010 int zflg = ZEXISTS;
1011 boolean_t waited = B_FALSE;
1012
1013 if (name == NULL)
1014 return (SET_ERROR(EINVAL));
1015
1016 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1017 return (error);
1018 zilog = zfsvfs->z_log;
1019
1020 if (flags & FIGNORECASE) {
1021 zflg |= ZCILOOK;
1022 pn_alloc(&realnm);
1023 realnmp = &realnm;
1024 }
1025
1026 top:
1027 xattr_obj = 0;
1028 xzp = NULL;
1029 /*
1030 * Attempt to lock directory; fail if entry doesn't exist.
1031 */
1032 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1033 NULL, realnmp))) {
1034 if (realnmp)
1035 pn_free(realnmp);
1036 zfs_exit(zfsvfs, FTAG);
1037 return (error);
1038 }
1039
1040 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1041 goto out;
1042 }
1043
1044 /*
1045 * Need to use rmdir for removing directories.
1046 */
1047 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1048 error = SET_ERROR(EPERM);
1049 goto out;
1050 }
1051
1052 mutex_enter(&zp->z_lock);
1053 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1054 !zn_has_cached_data(zp, 0, LLONG_MAX);
1055 mutex_exit(&zp->z_lock);
1056
1057 /*
1058 * We may delete the znode now, or we may put it in the unlinked set;
1059 * it depends on whether we're the last link, and on whether there are
1060 * other holds on the inode. So we dmu_tx_hold() the right things to
1061 * allow for either case.
1062 */
1063 obj = zp->z_id;
1064 tx = dmu_tx_create(zfsvfs->z_os);
1065 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1066 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1067 zfs_sa_upgrade_txholds(tx, zp);
1068 zfs_sa_upgrade_txholds(tx, dzp);
1069 if (may_delete_now) {
1070 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1071 /* if the file is too big, only hold_free a token amount */
1072 dmu_tx_hold_free(tx, zp->z_id, 0,
1073 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1074 }
1075
1076 /* are there any extended attributes? */
1077 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1078 &xattr_obj, sizeof (xattr_obj));
1079 if (error == 0 && xattr_obj) {
1080 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1081 ASSERT0(error);
1082 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1083 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1084 }
1085
1086 mutex_enter(&zp->z_lock);
1087 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1088 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1089 mutex_exit(&zp->z_lock);
1090
1091 /* charge as an update -- would be nice not to charge at all */
1092 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1093
1094 /*
1095 * Mark this transaction as typically resulting in a net free of space
1096 */
1097 dmu_tx_mark_netfree(tx);
1098
1099 error = dmu_tx_assign(tx,
1100 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1101 if (error) {
1102 zfs_dirent_unlock(dl);
1103 if (error == ERESTART) {
1104 waited = B_TRUE;
1105 dmu_tx_wait(tx);
1106 dmu_tx_abort(tx);
1107 zrele(zp);
1108 if (xzp)
1109 zrele(xzp);
1110 goto top;
1111 }
1112 if (realnmp)
1113 pn_free(realnmp);
1114 dmu_tx_abort(tx);
1115 zrele(zp);
1116 if (xzp)
1117 zrele(xzp);
1118 zfs_exit(zfsvfs, FTAG);
1119 return (error);
1120 }
1121
1122 /*
1123 * Remove the directory entry.
1124 */
1125 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1126
1127 if (error) {
1128 dmu_tx_commit(tx);
1129 goto out;
1130 }
1131
1132 if (unlinked) {
1133 /*
1134 * Hold z_lock so that we can make sure that the ACL obj
1135 * hasn't changed. Could have been deleted due to
1136 * zfs_sa_upgrade().
1137 */
1138 mutex_enter(&zp->z_lock);
1139 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1140 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1141 delete_now = may_delete_now && !toobig &&
1142 atomic_read(&ZTOI(zp)->i_count) == 1 &&
1143 !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1144 xattr_obj == xattr_obj_unlinked &&
1145 zfs_external_acl(zp) == acl_obj;
1146 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1147 }
1148
1149 if (delete_now) {
1150 if (xattr_obj_unlinked) {
1151 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1152 mutex_enter(&xzp->z_lock);
1153 xzp->z_unlinked = B_TRUE;
1154 clear_nlink(ZTOI(xzp));
1155 links = 0;
1156 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1157 &links, sizeof (links), tx);
1158 ASSERT3U(error, ==, 0);
1159 mutex_exit(&xzp->z_lock);
1160 zfs_unlinked_add(xzp, tx);
1161
1162 if (zp->z_is_sa)
1163 error = sa_remove(zp->z_sa_hdl,
1164 SA_ZPL_XATTR(zfsvfs), tx);
1165 else
1166 error = sa_update(zp->z_sa_hdl,
1167 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1168 sizeof (uint64_t), tx);
1169 ASSERT0(error);
1170 }
1171 /*
1172 * Add to the unlinked set because a new reference could be
1173 * taken concurrently resulting in a deferred destruction.
1174 */
1175 zfs_unlinked_add(zp, tx);
1176 mutex_exit(&zp->z_lock);
1177 } else if (unlinked) {
1178 mutex_exit(&zp->z_lock);
1179 zfs_unlinked_add(zp, tx);
1180 }
1181
1182 txtype = TX_REMOVE;
1183 if (flags & FIGNORECASE)
1184 txtype |= TX_CI;
1185 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1186
1187 dmu_tx_commit(tx);
1188 out:
1189 if (realnmp)
1190 pn_free(realnmp);
1191
1192 zfs_dirent_unlock(dl);
1193 zfs_znode_update_vfs(dzp);
1194 zfs_znode_update_vfs(zp);
1195
1196 if (delete_now)
1197 zrele(zp);
1198 else
1199 zfs_zrele_async(zp);
1200
1201 if (xzp) {
1202 zfs_znode_update_vfs(xzp);
1203 zfs_zrele_async(xzp);
1204 }
1205
1206 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1207 error = zil_commit(zilog, 0);
1208
1209 zfs_exit(zfsvfs, FTAG);
1210 return (error);
1211 }
1212
1213 /*
1214 * Create a new directory and insert it into dzp using the name
1215 * provided. Return a pointer to the inserted directory.
1216 *
1217 * IN: dzp - znode of directory to add subdir to.
1218 * dirname - name of new directory.
1219 * vap - attributes of new directory.
1220 * cr - credentials of caller.
1221 * flags - case flags.
1222 * vsecp - ACL to be set
1223 * mnt_ns - user namespace of the mount
1224 *
1225 * OUT: zpp - znode of created directory.
1226 *
1227 * RETURN: 0 if success
1228 * error code if failure
1229 *
1230 * Timestamps:
1231 * dzp - ctime|mtime updated
1232 * zpp - ctime|mtime|atime updated
1233 */
1234 int
zfs_mkdir(znode_t * dzp,char * dirname,vattr_t * vap,znode_t ** zpp,cred_t * cr,int flags,vsecattr_t * vsecp,zidmap_t * mnt_ns)1235 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1236 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1237 {
1238 znode_t *zp;
1239 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1240 zilog_t *zilog;
1241 zfs_dirlock_t *dl;
1242 uint64_t txtype;
1243 dmu_tx_t *tx;
1244 int error;
1245 int zf = ZNEW;
1246 uid_t uid;
1247 gid_t gid = crgetgid(cr);
1248 zfs_acl_ids_t acl_ids;
1249 boolean_t fuid_dirtied;
1250 boolean_t waited = B_FALSE;
1251
1252 ASSERT(S_ISDIR(vap->va_mode));
1253
1254 /*
1255 * If we have an ephemeral id, ACL, or XVATTR then
1256 * make sure file system is at proper version
1257 */
1258
1259 uid = crgetuid(cr);
1260 if (zfsvfs->z_use_fuids == B_FALSE &&
1261 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1262 return (SET_ERROR(EINVAL));
1263
1264 if (dirname == NULL)
1265 return (SET_ERROR(EINVAL));
1266
1267 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1268 return (error);
1269 zilog = zfsvfs->z_log;
1270
1271 if (dzp->z_pflags & ZFS_XATTR) {
1272 zfs_exit(zfsvfs, FTAG);
1273 return (SET_ERROR(EINVAL));
1274 }
1275
1276 if (zfsvfs->z_utf8 && u8_validate(dirname,
1277 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1278 zfs_exit(zfsvfs, FTAG);
1279 return (SET_ERROR(EILSEQ));
1280 }
1281 if (flags & FIGNORECASE)
1282 zf |= ZCILOOK;
1283
1284 if (vap->va_mask & ATTR_XVATTR) {
1285 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1286 crgetuid(cr), cr, vap->va_mode)) != 0) {
1287 zfs_exit(zfsvfs, FTAG);
1288 return (error);
1289 }
1290 }
1291
1292 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1293 vsecp, &acl_ids, mnt_ns)) != 0) {
1294 zfs_exit(zfsvfs, FTAG);
1295 return (error);
1296 }
1297 /*
1298 * First make sure the new directory doesn't exist.
1299 *
1300 * Existence is checked first to make sure we don't return
1301 * EACCES instead of EEXIST which can cause some applications
1302 * to fail.
1303 */
1304 top:
1305 *zpp = NULL;
1306
1307 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1308 NULL, NULL))) {
1309 zfs_acl_ids_free(&acl_ids);
1310 zfs_exit(zfsvfs, FTAG);
1311 return (error);
1312 }
1313
1314 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1315 mnt_ns))) {
1316 zfs_acl_ids_free(&acl_ids);
1317 zfs_dirent_unlock(dl);
1318 zfs_exit(zfsvfs, FTAG);
1319 return (error);
1320 }
1321
1322 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1323 zfs_acl_ids_free(&acl_ids);
1324 zfs_dirent_unlock(dl);
1325 zfs_exit(zfsvfs, FTAG);
1326 return (SET_ERROR(EDQUOT));
1327 }
1328
1329 /*
1330 * Add a new entry to the directory.
1331 */
1332 tx = dmu_tx_create(zfsvfs->z_os);
1333 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1334 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1335 fuid_dirtied = zfsvfs->z_fuid_dirty;
1336 if (fuid_dirtied)
1337 zfs_fuid_txhold(zfsvfs, tx);
1338 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1339 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1340 acl_ids.z_aclp->z_acl_bytes);
1341 }
1342
1343 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1344 ZFS_SA_BASE_ATTR_SIZE);
1345
1346 error = dmu_tx_assign(tx,
1347 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1348 if (error) {
1349 zfs_dirent_unlock(dl);
1350 if (error == ERESTART) {
1351 waited = B_TRUE;
1352 dmu_tx_wait(tx);
1353 dmu_tx_abort(tx);
1354 goto top;
1355 }
1356 zfs_acl_ids_free(&acl_ids);
1357 dmu_tx_abort(tx);
1358 zfs_exit(zfsvfs, FTAG);
1359 return (error);
1360 }
1361
1362 /*
1363 * Create new node.
1364 */
1365 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1366
1367 /*
1368 * Now put new name in parent dir.
1369 */
1370 error = zfs_link_create(dl, zp, tx, ZNEW);
1371 if (error != 0) {
1372 zfs_znode_delete(zp, tx);
1373 remove_inode_hash(ZTOI(zp));
1374 goto out;
1375 }
1376
1377 if (fuid_dirtied)
1378 zfs_fuid_sync(zfsvfs, tx);
1379
1380 *zpp = zp;
1381
1382 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1383 if (flags & FIGNORECASE)
1384 txtype |= TX_CI;
1385 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1386 acl_ids.z_fuidp, vap);
1387
1388 out:
1389 zfs_acl_ids_free(&acl_ids);
1390
1391 dmu_tx_commit(tx);
1392
1393 zfs_dirent_unlock(dl);
1394
1395 if (error != 0) {
1396 zrele(zp);
1397 } else {
1398 zfs_znode_update_vfs(dzp);
1399 zfs_znode_update_vfs(zp);
1400
1401 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1402 error = zil_commit(zilog, 0);
1403
1404 }
1405 zfs_exit(zfsvfs, FTAG);
1406 return (error);
1407 }
1408
1409 /*
1410 * Remove a directory subdir entry. If the current working
1411 * directory is the same as the subdir to be removed, the
1412 * remove will fail.
1413 *
1414 * IN: dzp - znode of directory to remove from.
1415 * name - name of directory to be removed.
1416 * cwd - inode of current working directory.
1417 * cr - credentials of caller.
1418 * flags - case flags
1419 *
1420 * RETURN: 0 on success, error code on failure.
1421 *
1422 * Timestamps:
1423 * dzp - ctime|mtime updated
1424 */
1425 int
zfs_rmdir(znode_t * dzp,char * name,znode_t * cwd,cred_t * cr,int flags)1426 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1427 int flags)
1428 {
1429 znode_t *zp;
1430 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1431 zilog_t *zilog;
1432 zfs_dirlock_t *dl;
1433 dmu_tx_t *tx;
1434 int error;
1435 int zflg = ZEXISTS;
1436 boolean_t waited = B_FALSE;
1437
1438 if (name == NULL)
1439 return (SET_ERROR(EINVAL));
1440
1441 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1442 return (error);
1443 zilog = zfsvfs->z_log;
1444
1445 if (flags & FIGNORECASE)
1446 zflg |= ZCILOOK;
1447 top:
1448 zp = NULL;
1449
1450 /*
1451 * Attempt to lock directory; fail if entry doesn't exist.
1452 */
1453 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1454 NULL, NULL))) {
1455 zfs_exit(zfsvfs, FTAG);
1456 return (error);
1457 }
1458
1459 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1460 goto out;
1461 }
1462
1463 if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1464 error = SET_ERROR(ENOTDIR);
1465 goto out;
1466 }
1467
1468 if (zp == cwd) {
1469 error = SET_ERROR(EINVAL);
1470 goto out;
1471 }
1472
1473 /*
1474 * Grab a lock on the directory to make sure that no one is
1475 * trying to add (or lookup) entries while we are removing it.
1476 */
1477 rw_enter(&zp->z_name_lock, RW_WRITER);
1478
1479 /*
1480 * Grab a lock on the parent pointer to make sure we play well
1481 * with the treewalk and directory rename code.
1482 */
1483 rw_enter(&zp->z_parent_lock, RW_WRITER);
1484
1485 tx = dmu_tx_create(zfsvfs->z_os);
1486 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1487 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1488 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1489 zfs_sa_upgrade_txholds(tx, zp);
1490 zfs_sa_upgrade_txholds(tx, dzp);
1491 dmu_tx_mark_netfree(tx);
1492 error = dmu_tx_assign(tx,
1493 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1494 if (error) {
1495 rw_exit(&zp->z_parent_lock);
1496 rw_exit(&zp->z_name_lock);
1497 zfs_dirent_unlock(dl);
1498 if (error == ERESTART) {
1499 waited = B_TRUE;
1500 dmu_tx_wait(tx);
1501 dmu_tx_abort(tx);
1502 zrele(zp);
1503 goto top;
1504 }
1505 dmu_tx_abort(tx);
1506 zrele(zp);
1507 zfs_exit(zfsvfs, FTAG);
1508 return (error);
1509 }
1510
1511 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1512
1513 if (error == 0) {
1514 uint64_t txtype = TX_RMDIR;
1515 if (flags & FIGNORECASE)
1516 txtype |= TX_CI;
1517 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1518 B_FALSE);
1519 }
1520
1521 dmu_tx_commit(tx);
1522
1523 rw_exit(&zp->z_parent_lock);
1524 rw_exit(&zp->z_name_lock);
1525 out:
1526 zfs_dirent_unlock(dl);
1527
1528 zfs_znode_update_vfs(dzp);
1529 zfs_znode_update_vfs(zp);
1530 zrele(zp);
1531
1532 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1533 error = zil_commit(zilog, 0);
1534
1535 zfs_exit(zfsvfs, FTAG);
1536 return (error);
1537 }
1538
1539 /*
1540 * Read directory entries from the given directory cursor position and emit
1541 * name and position for each entry.
1542 *
1543 * IN: ip - inode of directory to read.
1544 * ctx - directory entry context.
1545 * cr - credentials of caller.
1546 *
1547 * RETURN: 0 if success
1548 * error code if failure
1549 *
1550 * Timestamps:
1551 * ip - atime updated
1552 *
1553 * Note that the low 4 bits of the cookie returned by zap is always zero.
1554 * This allows us to use the low range for "special" directory entries:
1555 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
1556 * we use the offset 2 for the '.zfs' directory.
1557 */
1558 int
zfs_readdir(struct inode * ip,struct dir_context * ctx,cred_t * cr)1559 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
1560 {
1561 (void) cr;
1562 znode_t *zp = ITOZ(ip);
1563 zfsvfs_t *zfsvfs = ITOZSB(ip);
1564 objset_t *os;
1565 zap_cursor_t zc;
1566 zap_attribute_t *zap;
1567 int error;
1568 uint8_t prefetch;
1569 uint8_t type;
1570 int done = 0;
1571 uint64_t parent;
1572 uint64_t offset; /* must be unsigned; checks for < 1 */
1573
1574 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1575 return (error);
1576
1577 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1578 &parent, sizeof (parent))) != 0)
1579 goto out;
1580
1581 /*
1582 * Quit if directory has been removed (posix)
1583 */
1584 if (zp->z_unlinked)
1585 goto out;
1586
1587 error = 0;
1588 os = zfsvfs->z_os;
1589 offset = ctx->pos;
1590 prefetch = zp->z_zn_prefetch;
1591 zap = zap_attribute_long_alloc();
1592
1593 /*
1594 * Initialize the iterator cursor.
1595 */
1596 if (offset <= 3) {
1597 /*
1598 * Start iteration from the beginning of the directory.
1599 */
1600 zap_cursor_init(&zc, os, zp->z_id);
1601 } else {
1602 /*
1603 * The offset is a serialized cursor.
1604 */
1605 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1606 }
1607
1608 /*
1609 * Transform to file-system independent format
1610 */
1611 while (!done) {
1612 uint64_t objnum;
1613 /*
1614 * Special case `.', `..', and `.zfs'.
1615 */
1616 if (offset == 0) {
1617 (void) strcpy(zap->za_name, ".");
1618 zap->za_normalization_conflict = 0;
1619 objnum = zp->z_id;
1620 type = DT_DIR;
1621 } else if (offset == 1) {
1622 (void) strcpy(zap->za_name, "..");
1623 zap->za_normalization_conflict = 0;
1624 objnum = parent;
1625 type = DT_DIR;
1626 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1627 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1628 zap->za_normalization_conflict = 0;
1629 objnum = ZFSCTL_INO_ROOT;
1630 type = DT_DIR;
1631 } else {
1632 /*
1633 * Grab next entry.
1634 */
1635 if ((error = zap_cursor_retrieve(&zc, zap))) {
1636 if (error == ENOENT)
1637 break;
1638 else
1639 goto update;
1640 }
1641
1642 /*
1643 * Allow multiple entries provided the first entry is
1644 * the object id. Non-zpl consumers may safely make
1645 * use of the additional space.
1646 *
1647 * XXX: This should be a feature flag for compatibility
1648 */
1649 if (zap->za_integer_length != 8 ||
1650 zap->za_num_integers == 0) {
1651 cmn_err(CE_WARN, "zap_readdir: bad directory "
1652 "entry, obj = %lld, offset = %lld, "
1653 "length = %d, num = %lld\n",
1654 (u_longlong_t)zp->z_id,
1655 (u_longlong_t)offset,
1656 zap->za_integer_length,
1657 (u_longlong_t)zap->za_num_integers);
1658 error = SET_ERROR(ENXIO);
1659 goto update;
1660 }
1661
1662 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1663 type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1664 }
1665
1666 done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
1667 objnum, type);
1668 if (done)
1669 break;
1670
1671 if (prefetch)
1672 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1673
1674 /*
1675 * Move to the next entry, fill in the previous offset.
1676 */
1677 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1678 zap_cursor_advance(&zc);
1679 offset = zap_cursor_serialize(&zc);
1680 } else {
1681 offset += 1;
1682 }
1683 ctx->pos = offset;
1684 }
1685 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1686
1687 update:
1688 zap_cursor_fini(&zc);
1689 zap_attribute_free(zap);
1690 if (error == ENOENT)
1691 error = 0;
1692 out:
1693 zfs_exit(zfsvfs, FTAG);
1694
1695 return (error);
1696 }
1697
1698 /*
1699 * Get the basic file attributes and place them in the provided kstat
1700 * structure. The inode is assumed to be the authoritative source
1701 * for most of the attributes. However, the znode currently has the
1702 * authoritative atime, blksize, and block count.
1703 *
1704 * IN: ip - inode of file.
1705 *
1706 * OUT: sp - kstat values.
1707 *
1708 * RETURN: 0 (always succeeds)
1709 */
1710 int
1711 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
zfs_getattr_fast(zidmap_t * user_ns,u32 request_mask,struct inode * ip,struct kstat * sp)1712 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
1713 struct kstat *sp)
1714 #else
1715 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1716 #endif
1717 {
1718 znode_t *zp = ITOZ(ip);
1719 zfsvfs_t *zfsvfs = ITOZSB(ip);
1720 uint32_t blksize;
1721 u_longlong_t nblocks;
1722 int error;
1723
1724 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1725 return (error);
1726
1727 mutex_enter(&zp->z_lock);
1728
1729 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1730 zpl_generic_fillattr(user_ns, request_mask, ip, sp);
1731 #else
1732 zpl_generic_fillattr(user_ns, ip, sp);
1733 #endif
1734 /*
1735 * +1 link count for root inode with visible '.zfs' directory.
1736 */
1737 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1738 if (sp->nlink < ZFS_LINK_MAX)
1739 sp->nlink++;
1740
1741 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1742 sp->blksize = blksize;
1743 sp->blocks = nblocks;
1744
1745 if (unlikely(zp->z_blksz == 0)) {
1746 /*
1747 * Block size hasn't been set; suggest maximal I/O transfers.
1748 */
1749 sp->blksize = zfsvfs->z_max_blksz;
1750 }
1751
1752 mutex_exit(&zp->z_lock);
1753
1754 /*
1755 * Required to prevent NFS client from detecting different inode
1756 * numbers of snapshot root dentry before and after snapshot mount.
1757 */
1758 if (zfsvfs->z_issnap) {
1759 if (ip->i_sb->s_root->d_inode == ip)
1760 sp->ino = ZFSCTL_INO_SNAPDIRS -
1761 dmu_objset_id(zfsvfs->z_os);
1762 }
1763
1764 zfs_exit(zfsvfs, FTAG);
1765
1766 return (0);
1767 }
1768
1769 /*
1770 * For the operation of changing file's user/group/project, we need to
1771 * handle not only the main object that is assigned to the file directly,
1772 * but also the ones that are used by the file via hidden xattr directory.
1773 *
1774 * Because the xattr directory may contains many EA entries, as to it may
1775 * be impossible to change all of them via the transaction of changing the
1776 * main object's user/group/project attributes. Then we have to change them
1777 * via other multiple independent transactions one by one. It may be not good
1778 * solution, but we have no better idea yet.
1779 */
1780 static int
zfs_setattr_dir(znode_t * dzp)1781 zfs_setattr_dir(znode_t *dzp)
1782 {
1783 struct inode *dxip = ZTOI(dzp);
1784 struct inode *xip = NULL;
1785 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1786 objset_t *os = zfsvfs->z_os;
1787 zap_cursor_t zc;
1788 zap_attribute_t *zap;
1789 zfs_dirlock_t *dl;
1790 znode_t *zp = NULL;
1791 dmu_tx_t *tx = NULL;
1792 uint64_t uid, gid;
1793 sa_bulk_attr_t bulk[4];
1794 int count;
1795 int err;
1796
1797 zap = zap_attribute_alloc();
1798 zap_cursor_init(&zc, os, dzp->z_id);
1799 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
1800 count = 0;
1801 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
1802 err = ENXIO;
1803 break;
1804 }
1805
1806 err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
1807 ZEXISTS, NULL, NULL);
1808 if (err == ENOENT)
1809 goto next;
1810 if (err)
1811 break;
1812
1813 xip = ZTOI(zp);
1814 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1815 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1816 zp->z_projid == dzp->z_projid)
1817 goto next;
1818
1819 tx = dmu_tx_create(os);
1820 if (!(zp->z_pflags & ZFS_PROJID))
1821 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1822 else
1823 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1824
1825 err = dmu_tx_assign(tx, DMU_TX_WAIT);
1826 if (err)
1827 break;
1828
1829 mutex_enter(&dzp->z_lock);
1830
1831 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1832 xip->i_uid = dxip->i_uid;
1833 uid = zfs_uid_read(dxip);
1834 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1835 &uid, sizeof (uid));
1836 }
1837
1838 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1839 xip->i_gid = dxip->i_gid;
1840 gid = zfs_gid_read(dxip);
1841 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1842 &gid, sizeof (gid));
1843 }
1844
1845
1846 uint64_t projid = dzp->z_projid;
1847 if (zp->z_projid != projid) {
1848 if (!(zp->z_pflags & ZFS_PROJID)) {
1849 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
1850 if (unlikely(err == EEXIST)) {
1851 err = 0;
1852 } else if (err != 0) {
1853 goto sa_add_projid_err;
1854 } else {
1855 projid = ZFS_INVALID_PROJID;
1856 }
1857 }
1858
1859 if (projid != ZFS_INVALID_PROJID) {
1860 zp->z_projid = projid;
1861 SA_ADD_BULK_ATTR(bulk, count,
1862 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
1863 sizeof (zp->z_projid));
1864 }
1865 }
1866
1867 sa_add_projid_err:
1868 mutex_exit(&dzp->z_lock);
1869
1870 if (likely(count > 0)) {
1871 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1872 dmu_tx_commit(tx);
1873 } else if (projid == ZFS_INVALID_PROJID) {
1874 dmu_tx_commit(tx);
1875 } else {
1876 dmu_tx_abort(tx);
1877 }
1878 tx = NULL;
1879 if (err != 0 && err != ENOENT)
1880 break;
1881
1882 next:
1883 if (zp) {
1884 zrele(zp);
1885 zp = NULL;
1886 zfs_dirent_unlock(dl);
1887 }
1888 zap_cursor_advance(&zc);
1889 }
1890
1891 if (tx)
1892 dmu_tx_abort(tx);
1893 if (zp) {
1894 zrele(zp);
1895 zfs_dirent_unlock(dl);
1896 }
1897 zap_cursor_fini(&zc);
1898 zap_attribute_free(zap);
1899
1900 return (err == ENOENT ? 0 : err);
1901 }
1902
1903 /*
1904 * Set the file attributes to the values contained in the
1905 * vattr structure.
1906 *
1907 * IN: zp - znode of file to be modified.
1908 * vap - new attribute values.
1909 * If ATTR_XVATTR set, then optional attrs are being set
1910 * flags - ATTR_UTIME set if non-default time values provided.
1911 * - ATTR_NOACLCHECK (CIFS context only).
1912 * cr - credentials of caller.
1913 * mnt_ns - user namespace of the mount
1914 *
1915 * RETURN: 0 if success
1916 * error code if failure
1917 *
1918 * Timestamps:
1919 * ip - ctime updated, mtime updated if size changed.
1920 */
1921 int
zfs_setattr(znode_t * zp,vattr_t * vap,int flags,cred_t * cr,zidmap_t * mnt_ns)1922 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1923 {
1924 struct inode *ip;
1925 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1926 objset_t *os;
1927 zilog_t *zilog;
1928 dmu_tx_t *tx;
1929 vattr_t oldva;
1930 xvattr_t *tmpxvattr;
1931 uint_t mask = vap->va_mask;
1932 uint_t saved_mask = 0;
1933 int trim_mask = 0;
1934 uint64_t new_mode;
1935 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1936 uint64_t xattr_obj;
1937 uint64_t mtime[2], ctime[2], atime[2];
1938 uint64_t projid = ZFS_INVALID_PROJID;
1939 znode_t *attrzp;
1940 int need_policy = FALSE;
1941 int err, err2 = 0;
1942 zfs_fuid_info_t *fuidp = NULL;
1943 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
1944 xoptattr_t *xoap;
1945 zfs_acl_t *aclp;
1946 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1947 boolean_t fuid_dirtied = B_FALSE;
1948 boolean_t handle_eadir = B_FALSE;
1949 sa_bulk_attr_t *bulk, *xattr_bulk;
1950 int count = 0, xattr_count = 0, bulks = 8;
1951
1952 if (mask == 0)
1953 return (0);
1954
1955 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1956 return (err);
1957 ip = ZTOI(zp);
1958 os = zfsvfs->z_os;
1959
1960 /*
1961 * If this is a xvattr_t, then get a pointer to the structure of
1962 * optional attributes. If this is NULL, then we have a vattr_t.
1963 */
1964 xoap = xva_getxoptattr(xvap);
1965 if (xoap != NULL && (mask & ATTR_XVATTR)) {
1966 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1967 if (!dmu_objset_projectquota_enabled(os) ||
1968 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1969 zfs_exit(zfsvfs, FTAG);
1970 return (SET_ERROR(ENOTSUP));
1971 }
1972
1973 projid = xoap->xoa_projid;
1974 if (unlikely(projid == ZFS_INVALID_PROJID)) {
1975 zfs_exit(zfsvfs, FTAG);
1976 return (SET_ERROR(EINVAL));
1977 }
1978
1979 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1980 projid = ZFS_INVALID_PROJID;
1981 else
1982 need_policy = TRUE;
1983 }
1984
1985 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1986 (xoap->xoa_projinherit !=
1987 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1988 (!dmu_objset_projectquota_enabled(os) ||
1989 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1990 zfs_exit(zfsvfs, FTAG);
1991 return (SET_ERROR(ENOTSUP));
1992 }
1993 }
1994
1995 zilog = zfsvfs->z_log;
1996
1997 /*
1998 * Make sure that if we have ephemeral uid/gid or xvattr specified
1999 * that file system is at proper version level
2000 */
2001
2002 if (zfsvfs->z_use_fuids == B_FALSE &&
2003 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2004 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2005 (mask & ATTR_XVATTR))) {
2006 zfs_exit(zfsvfs, FTAG);
2007 return (SET_ERROR(EINVAL));
2008 }
2009
2010 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2011 zfs_exit(zfsvfs, FTAG);
2012 return (SET_ERROR(EISDIR));
2013 }
2014
2015 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2016 zfs_exit(zfsvfs, FTAG);
2017 return (SET_ERROR(EINVAL));
2018 }
2019
2020 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2021 xva_init(tmpxvattr);
2022
2023 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2024 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2025
2026 /*
2027 * Immutable files can only alter immutable bit and atime
2028 */
2029 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2030 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2031 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2032 err = SET_ERROR(EPERM);
2033 goto out3;
2034 }
2035
2036 /* ZFS_READONLY will be handled in zfs_zaccess() */
2037
2038 /*
2039 * Verify timestamps doesn't overflow 32 bits.
2040 * ZFS can handle large timestamps, but 32bit syscalls can't
2041 * handle times greater than 2039. This check should be removed
2042 * once large timestamps are fully supported.
2043 */
2044 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2045 if (((mask & ATTR_ATIME) &&
2046 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2047 ((mask & ATTR_MTIME) &&
2048 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2049 err = SET_ERROR(EOVERFLOW);
2050 goto out3;
2051 }
2052 }
2053
2054 top:
2055 attrzp = NULL;
2056 aclp = NULL;
2057
2058 /* Can this be moved to before the top label? */
2059 if (zfs_is_readonly(zfsvfs)) {
2060 err = SET_ERROR(EROFS);
2061 goto out3;
2062 }
2063
2064 /*
2065 * First validate permissions
2066 */
2067
2068 if (mask & ATTR_SIZE) {
2069 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2070 mnt_ns);
2071 if (err)
2072 goto out3;
2073
2074 /*
2075 * XXX - Note, we are not providing any open
2076 * mode flags here (like FNDELAY), so we may
2077 * block if there are locks present... this
2078 * should be addressed in openat().
2079 */
2080 /* XXX - would it be OK to generate a log record here? */
2081 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2082 if (err)
2083 goto out3;
2084 }
2085
2086 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2087 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2088 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2089 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2090 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2091 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2092 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2093 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2094 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2095 skipaclchk, cr, mnt_ns);
2096 }
2097
2098 if (mask & (ATTR_UID|ATTR_GID)) {
2099 int idmask = (mask & (ATTR_UID|ATTR_GID));
2100 int take_owner;
2101 int take_group;
2102 uid_t uid;
2103 gid_t gid;
2104
2105 /*
2106 * NOTE: even if a new mode is being set,
2107 * we may clear S_ISUID/S_ISGID bits.
2108 */
2109
2110 if (!(mask & ATTR_MODE))
2111 vap->va_mode = zp->z_mode;
2112
2113 /*
2114 * Take ownership or chgrp to group we are a member of
2115 */
2116
2117 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2118 vap->va_uid);
2119 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2120 vap->va_gid);
2121 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2122 take_group = (mask & ATTR_GID) &&
2123 zfs_groupmember(zfsvfs, gid, cr);
2124
2125 /*
2126 * If both ATTR_UID and ATTR_GID are set then take_owner and
2127 * take_group must both be set in order to allow taking
2128 * ownership.
2129 *
2130 * Otherwise, send the check through secpolicy_vnode_setattr()
2131 *
2132 */
2133
2134 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2135 take_owner && take_group) ||
2136 ((idmask == ATTR_UID) && take_owner) ||
2137 ((idmask == ATTR_GID) && take_group)) {
2138 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2139 skipaclchk, cr, mnt_ns) == 0) {
2140 /*
2141 * Remove setuid/setgid for non-privileged users
2142 */
2143 (void) secpolicy_setid_clear(vap, cr);
2144 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2145 } else {
2146 need_policy = TRUE;
2147 }
2148 } else {
2149 need_policy = TRUE;
2150 }
2151 }
2152
2153 mutex_enter(&zp->z_lock);
2154 oldva.va_mode = zp->z_mode;
2155 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2156 if (mask & ATTR_XVATTR) {
2157 /*
2158 * Update xvattr mask to include only those attributes
2159 * that are actually changing.
2160 *
2161 * the bits will be restored prior to actually setting
2162 * the attributes so the caller thinks they were set.
2163 */
2164 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2165 if (xoap->xoa_appendonly !=
2166 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2167 need_policy = TRUE;
2168 } else {
2169 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2170 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2171 }
2172 }
2173
2174 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2175 if (xoap->xoa_projinherit !=
2176 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2177 need_policy = TRUE;
2178 } else {
2179 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2180 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2181 }
2182 }
2183
2184 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2185 if (xoap->xoa_nounlink !=
2186 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2187 need_policy = TRUE;
2188 } else {
2189 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2190 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2191 }
2192 }
2193
2194 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2195 if (xoap->xoa_immutable !=
2196 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2197 need_policy = TRUE;
2198 } else {
2199 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2200 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2201 }
2202 }
2203
2204 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2205 if (xoap->xoa_nodump !=
2206 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2207 need_policy = TRUE;
2208 } else {
2209 XVA_CLR_REQ(xvap, XAT_NODUMP);
2210 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2211 }
2212 }
2213
2214 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2215 if (xoap->xoa_av_modified !=
2216 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2217 need_policy = TRUE;
2218 } else {
2219 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2220 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2221 }
2222 }
2223
2224 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2225 if ((!S_ISREG(ip->i_mode) &&
2226 xoap->xoa_av_quarantined) ||
2227 xoap->xoa_av_quarantined !=
2228 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2229 need_policy = TRUE;
2230 } else {
2231 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2232 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2233 }
2234 }
2235
2236 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2237 mutex_exit(&zp->z_lock);
2238 err = SET_ERROR(EPERM);
2239 goto out3;
2240 }
2241
2242 if (need_policy == FALSE &&
2243 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2244 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2245 need_policy = TRUE;
2246 }
2247 }
2248
2249 mutex_exit(&zp->z_lock);
2250
2251 if (mask & ATTR_MODE) {
2252 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2253 mnt_ns) == 0) {
2254 err = secpolicy_setid_setsticky_clear(ip, vap,
2255 &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2256 if (err)
2257 goto out3;
2258 trim_mask |= ATTR_MODE;
2259 } else {
2260 need_policy = TRUE;
2261 }
2262 }
2263
2264 if (need_policy) {
2265 /*
2266 * If trim_mask is set then take ownership
2267 * has been granted or write_acl is present and user
2268 * has the ability to modify mode. In that case remove
2269 * UID|GID and or MODE from mask so that
2270 * secpolicy_vnode_setattr() doesn't revoke it.
2271 */
2272
2273 if (trim_mask) {
2274 saved_mask = vap->va_mask;
2275 vap->va_mask &= ~trim_mask;
2276 }
2277 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2278 zfs_zaccess_unix, zp);
2279 if (err)
2280 goto out3;
2281
2282 if (trim_mask)
2283 vap->va_mask |= saved_mask;
2284 }
2285
2286 /*
2287 * secpolicy_vnode_setattr, or take ownership may have
2288 * changed va_mask
2289 */
2290 mask = vap->va_mask;
2291
2292 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2293 handle_eadir = B_TRUE;
2294 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2295 &xattr_obj, sizeof (xattr_obj));
2296
2297 if (err == 0 && xattr_obj) {
2298 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2299 if (err)
2300 goto out2;
2301 }
2302 if (mask & ATTR_UID) {
2303 new_kuid = zfs_fuid_create(zfsvfs,
2304 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2305 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2306 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2307 new_kuid)) {
2308 if (attrzp)
2309 zrele(attrzp);
2310 err = SET_ERROR(EDQUOT);
2311 goto out2;
2312 }
2313 }
2314
2315 if (mask & ATTR_GID) {
2316 new_kgid = zfs_fuid_create(zfsvfs,
2317 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2318 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2319 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2320 new_kgid)) {
2321 if (attrzp)
2322 zrele(attrzp);
2323 err = SET_ERROR(EDQUOT);
2324 goto out2;
2325 }
2326 }
2327
2328 if (projid != ZFS_INVALID_PROJID &&
2329 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2330 if (attrzp)
2331 zrele(attrzp);
2332 err = EDQUOT;
2333 goto out2;
2334 }
2335 }
2336 tx = dmu_tx_create(os);
2337
2338 if (mask & ATTR_MODE) {
2339 uint64_t pmode = zp->z_mode;
2340 uint64_t acl_obj;
2341 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2342
2343 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2344 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2345 err = EPERM;
2346 goto out;
2347 }
2348
2349 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2350 goto out;
2351
2352 mutex_enter(&zp->z_lock);
2353 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2354 /*
2355 * Are we upgrading ACL from old V0 format
2356 * to V1 format?
2357 */
2358 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2359 zfs_znode_acl_version(zp) ==
2360 ZFS_ACL_VERSION_INITIAL) {
2361 dmu_tx_hold_free(tx, acl_obj, 0,
2362 DMU_OBJECT_END);
2363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2364 0, aclp->z_acl_bytes);
2365 } else {
2366 dmu_tx_hold_write(tx, acl_obj, 0,
2367 aclp->z_acl_bytes);
2368 }
2369 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2371 0, aclp->z_acl_bytes);
2372 }
2373 mutex_exit(&zp->z_lock);
2374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2375 } else {
2376 if (((mask & ATTR_XVATTR) &&
2377 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2378 (projid != ZFS_INVALID_PROJID &&
2379 !(zp->z_pflags & ZFS_PROJID)))
2380 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2381 else
2382 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2383 }
2384
2385 if (attrzp) {
2386 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2387 }
2388
2389 fuid_dirtied = zfsvfs->z_fuid_dirty;
2390 if (fuid_dirtied)
2391 zfs_fuid_txhold(zfsvfs, tx);
2392
2393 zfs_sa_upgrade_txholds(tx, zp);
2394
2395 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2396 if (err)
2397 goto out;
2398
2399 count = 0;
2400 /*
2401 * Set each attribute requested.
2402 * We group settings according to the locks they need to acquire.
2403 *
2404 * Note: you cannot set ctime directly, although it will be
2405 * updated as a side-effect of calling this function.
2406 */
2407
2408 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2409 /*
2410 * For the existed object that is upgraded from old system,
2411 * its on-disk layout has no slot for the project ID attribute.
2412 * But quota accounting logic needs to access related slots by
2413 * offset directly. So we need to adjust old objects' layout
2414 * to make the project ID to some unified and fixed offset.
2415 */
2416 if (attrzp)
2417 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2418 if (err == 0)
2419 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2420
2421 if (unlikely(err == EEXIST))
2422 err = 0;
2423 else if (err != 0)
2424 goto out;
2425 else
2426 projid = ZFS_INVALID_PROJID;
2427 }
2428
2429 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2430 mutex_enter(&zp->z_acl_lock);
2431 mutex_enter(&zp->z_lock);
2432
2433 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2434 &zp->z_pflags, sizeof (zp->z_pflags));
2435
2436 if (attrzp) {
2437 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2438 mutex_enter(&attrzp->z_acl_lock);
2439 mutex_enter(&attrzp->z_lock);
2440 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2441 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2442 sizeof (attrzp->z_pflags));
2443 if (projid != ZFS_INVALID_PROJID) {
2444 attrzp->z_projid = projid;
2445 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2446 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2447 sizeof (attrzp->z_projid));
2448 }
2449 }
2450
2451 if (mask & (ATTR_UID|ATTR_GID)) {
2452
2453 if (mask & ATTR_UID) {
2454 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2455 new_uid = zfs_uid_read(ZTOI(zp));
2456 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2457 &new_uid, sizeof (new_uid));
2458 if (attrzp) {
2459 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2460 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2461 sizeof (new_uid));
2462 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2463 }
2464 }
2465
2466 if (mask & ATTR_GID) {
2467 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2468 new_gid = zfs_gid_read(ZTOI(zp));
2469 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2470 NULL, &new_gid, sizeof (new_gid));
2471 if (attrzp) {
2472 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2473 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2474 sizeof (new_gid));
2475 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2476 }
2477 }
2478 if (!(mask & ATTR_MODE)) {
2479 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2480 NULL, &new_mode, sizeof (new_mode));
2481 new_mode = zp->z_mode;
2482 }
2483 err = zfs_acl_chown_setattr(zp);
2484 ASSERT0(err);
2485 if (attrzp) {
2486 err = zfs_acl_chown_setattr(attrzp);
2487 ASSERT0(err);
2488 }
2489 }
2490
2491 if (mask & ATTR_MODE) {
2492 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2493 &new_mode, sizeof (new_mode));
2494 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2495 ASSERT3P(aclp, !=, NULL);
2496 err = zfs_aclset_common(zp, aclp, cr, tx);
2497 ASSERT0(err);
2498 if (zp->z_acl_cached)
2499 zfs_acl_free(zp->z_acl_cached);
2500 zp->z_acl_cached = aclp;
2501 aclp = NULL;
2502 }
2503
2504 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2505 zp->z_atime_dirty = B_FALSE;
2506 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
2507 ZFS_TIME_ENCODE(&tmp_atime, atime);
2508 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2509 &atime, sizeof (atime));
2510 }
2511
2512 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2513 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2514 zpl_inode_set_mtime_to_ts(ZTOI(zp),
2515 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
2516
2517 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2518 mtime, sizeof (mtime));
2519 }
2520
2521 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2522 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2523 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2524 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2525 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2526 ctime, sizeof (ctime));
2527 }
2528
2529 if (projid != ZFS_INVALID_PROJID) {
2530 zp->z_projid = projid;
2531 SA_ADD_BULK_ATTR(bulk, count,
2532 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2533 sizeof (zp->z_projid));
2534 }
2535
2536 if (attrzp && mask) {
2537 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2538 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2539 sizeof (ctime));
2540 }
2541
2542 /*
2543 * Do this after setting timestamps to prevent timestamp
2544 * update from toggling bit
2545 */
2546
2547 if (xoap && (mask & ATTR_XVATTR)) {
2548
2549 /*
2550 * restore trimmed off masks
2551 * so that return masks can be set for caller.
2552 */
2553
2554 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2555 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2556 }
2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2558 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2559 }
2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2561 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2562 }
2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2564 XVA_SET_REQ(xvap, XAT_NODUMP);
2565 }
2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2567 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2568 }
2569 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2570 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2571 }
2572 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2573 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2574 }
2575
2576 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2577 ASSERT(S_ISREG(ip->i_mode));
2578
2579 zfs_xvattr_set(zp, xvap, tx);
2580 }
2581
2582 if (fuid_dirtied)
2583 zfs_fuid_sync(zfsvfs, tx);
2584
2585 if (mask != 0) {
2586 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2587 /*
2588 * Ensure that the z_seq is always incremented on setattr
2589 * operation. This is required for change accounting for
2590 * NFS clients.
2591 *
2592 * ATTR_MODE already increments via zfs_acl_chmod_setattr.
2593 * ATTR_SIZE already increments via zfs_freesp.
2594 */
2595 if (!(mask & (ATTR_MODE | ATTR_SIZE)))
2596 zp->z_seq++;
2597 }
2598
2599 mutex_exit(&zp->z_lock);
2600 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2601 mutex_exit(&zp->z_acl_lock);
2602
2603 if (attrzp) {
2604 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2605 mutex_exit(&attrzp->z_acl_lock);
2606 mutex_exit(&attrzp->z_lock);
2607 }
2608 out:
2609 if (err == 0 && xattr_count > 0) {
2610 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2611 xattr_count, tx);
2612 ASSERT0(err2);
2613 }
2614
2615 if (aclp)
2616 zfs_acl_free(aclp);
2617
2618 if (fuidp) {
2619 zfs_fuid_info_free(fuidp);
2620 fuidp = NULL;
2621 }
2622
2623 if (err) {
2624 dmu_tx_abort(tx);
2625 if (attrzp)
2626 zrele(attrzp);
2627 if (err == ERESTART)
2628 goto top;
2629 } else {
2630 if (count > 0)
2631 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2632 dmu_tx_commit(tx);
2633 if (attrzp) {
2634 if (err2 == 0 && handle_eadir)
2635 err = zfs_setattr_dir(attrzp);
2636 zrele(attrzp);
2637 }
2638 zfs_znode_update_vfs(zp);
2639 }
2640
2641 out2:
2642 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
2643 err = zil_commit(zilog, 0);
2644
2645 out3:
2646 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2647 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2648 kmem_free(tmpxvattr, sizeof (xvattr_t));
2649 zfs_exit(zfsvfs, FTAG);
2650 return (err);
2651 }
2652
/*
 * One node in the singly linked list of parent locks taken while
 * validating a directory rename.  zfs_rename_lock() pushes a node for
 * every lock it acquires and zfs_rename_unlock() pops and frees them.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held (NULL if none) */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2658
2659 /*
2660 * Drop locks and release vnodes that were held by zfs_rename_lock().
2661 */
2662 static void
zfs_rename_unlock(zfs_zlock_t ** zlpp)2663 zfs_rename_unlock(zfs_zlock_t **zlpp)
2664 {
2665 zfs_zlock_t *zl;
2666
2667 while ((zl = *zlpp) != NULL) {
2668 if (zl->zl_znode != NULL)
2669 zfs_zrele_async(zl->zl_znode);
2670 rw_exit(zl->zl_rwlock);
2671 *zlpp = zl->zl_next;
2672 kmem_free(zl, sizeof (*zl));
2673 }
2674 }
2675
2676 /*
2677 * Search back through the directory tree, using the ".." entries.
2678 * Lock each directory in the chain to prevent concurrent renames.
2679 * Fail any attempt to move a directory into one of its own descendants.
2680 * XXX - z_parent_lock can overlap with map or grow locks
2681 */
2682 static int
zfs_rename_lock(znode_t * szp,znode_t * tdzp,znode_t * sdzp,zfs_zlock_t ** zlpp)2683 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2684 {
2685 zfs_zlock_t *zl;
2686 znode_t *zp = tdzp;
2687 uint64_t rootid = ZTOZSB(zp)->z_root;
2688 uint64_t oidp = zp->z_id;
2689 krwlock_t *rwlp = &szp->z_parent_lock;
2690 krw_t rw = RW_WRITER;
2691
2692 /*
2693 * First pass write-locks szp and compares to zp->z_id.
2694 * Later passes read-lock zp and compare to zp->z_parent.
2695 */
2696 do {
2697 if (!rw_tryenter(rwlp, rw)) {
2698 /*
2699 * Another thread is renaming in this path.
2700 * Note that if we are a WRITER, we don't have any
2701 * parent_locks held yet.
2702 */
2703 if (rw == RW_READER && zp->z_id > szp->z_id) {
2704 /*
2705 * Drop our locks and restart
2706 */
2707 zfs_rename_unlock(&zl);
2708 *zlpp = NULL;
2709 zp = tdzp;
2710 oidp = zp->z_id;
2711 rwlp = &szp->z_parent_lock;
2712 rw = RW_WRITER;
2713 continue;
2714 } else {
2715 /*
2716 * Wait for other thread to drop its locks
2717 */
2718 rw_enter(rwlp, rw);
2719 }
2720 }
2721
2722 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2723 zl->zl_rwlock = rwlp;
2724 zl->zl_znode = NULL;
2725 zl->zl_next = *zlpp;
2726 *zlpp = zl;
2727
2728 if (oidp == szp->z_id) /* We're a descendant of szp */
2729 return (SET_ERROR(EINVAL));
2730
2731 if (oidp == rootid) /* We've hit the top */
2732 return (0);
2733
2734 if (rw == RW_READER) { /* i.e. not the first pass */
2735 int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2736 if (error)
2737 return (error);
2738 zl->zl_znode = zp;
2739 }
2740 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2741 &oidp, sizeof (oidp));
2742 rwlp = &zp->z_parent_lock;
2743 rw = RW_READER;
2744
2745 } while (zp->z_id != sdzp->z_id);
2746
2747 return (0);
2748 }
2749
2750 /*
2751 * Move an entry from the provided source directory to the target
2752 * directory. Change the entry name as indicated.
2753 *
2754 * IN: sdzp - Source directory containing the "old entry".
2755 * snm - Old entry name.
2756 * tdzp - Target directory to contain the "new entry".
2757 * tnm - New entry name.
2758 * cr - credentials of caller.
2759 * flags - case flags
2760 * rflags - RENAME_* flags
2761 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0).
2762 * mnt_ns - user namespace of the mount
2763 *
2764 * RETURN: 0 on success, error code on failure.
2765 *
2766 * Timestamps:
2767 * sdzp,tdzp - ctime|mtime updated
2768 */
2769 int
zfs_rename(znode_t * sdzp,char * snm,znode_t * tdzp,char * tnm,cred_t * cr,int flags,uint64_t rflags,vattr_t * wo_vap,zidmap_t * mnt_ns)2770 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2771 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2772 {
2773 znode_t *szp, *tzp;
2774 zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
2775 zilog_t *zilog;
2776 zfs_dirlock_t *sdl, *tdl;
2777 dmu_tx_t *tx;
2778 zfs_zlock_t *zl;
2779 int cmp, serr, terr;
2780 int error = 0;
2781 int zflg = 0;
2782 boolean_t waited = B_FALSE;
2783 /* Needed for whiteout inode creation. */
2784 boolean_t fuid_dirtied;
2785 zfs_acl_ids_t acl_ids;
2786 boolean_t have_acl = B_FALSE;
2787 znode_t *wzp = NULL;
2788
2789
2790 if (snm == NULL || tnm == NULL)
2791 return (SET_ERROR(EINVAL));
2792
2793 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2794 return (SET_ERROR(EINVAL));
2795
2796 /* Already checked by Linux VFS, but just to make sure. */
2797 if (rflags & RENAME_EXCHANGE &&
2798 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2799 return (SET_ERROR(EINVAL));
2800
2801 /*
2802 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2803 * right kind of vattr_t for the whiteout file. These are set
2804 * internally by ZFS so should never be incorrect.
2805 */
2806 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2807 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2808 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2809
2810 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2811 return (error);
2812 zilog = zfsvfs->z_log;
2813
2814 if ((error = zfs_verify_zp(tdzp)) != 0) {
2815 zfs_exit(zfsvfs, FTAG);
2816 return (error);
2817 }
2818
2819 /*
2820 * We check i_sb because snapshots and the ctldir must have different
2821 * super blocks.
2822 */
2823 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2824 zfsctl_is_node(ZTOI(tdzp))) {
2825 zfs_exit(zfsvfs, FTAG);
2826 return (SET_ERROR(EXDEV));
2827 }
2828
2829 if (zfsvfs->z_utf8 && u8_validate(tnm,
2830 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2831 zfs_exit(zfsvfs, FTAG);
2832 return (SET_ERROR(EILSEQ));
2833 }
2834
2835 if (flags & FIGNORECASE)
2836 zflg |= ZCILOOK;
2837
2838 top:
2839 szp = NULL;
2840 tzp = NULL;
2841 zl = NULL;
2842
2843 /*
2844 * This is to prevent the creation of links into attribute space
2845 * by renaming a linked file into/outof an attribute directory.
2846 * See the comment in zfs_link() for why this is considered bad.
2847 */
2848 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2849 zfs_exit(zfsvfs, FTAG);
2850 return (SET_ERROR(EINVAL));
2851 }
2852
2853 /*
2854 * Lock source and target directory entries. To prevent deadlock,
2855 * a lock ordering must be defined. We lock the directory with
2856 * the smallest object id first, or if it's a tie, the one with
2857 * the lexically first name.
2858 */
2859 if (sdzp->z_id < tdzp->z_id) {
2860 cmp = -1;
2861 } else if (sdzp->z_id > tdzp->z_id) {
2862 cmp = 1;
2863 } else {
2864 /*
2865 * First compare the two name arguments without
2866 * considering any case folding.
2867 */
2868 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2869
2870 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2871 ASSERT(error == 0 || !zfsvfs->z_utf8);
2872 if (cmp == 0) {
2873 /*
2874 * POSIX: "If the old argument and the new argument
2875 * both refer to links to the same existing file,
2876 * the rename() function shall return successfully
2877 * and perform no other action."
2878 */
2879 zfs_exit(zfsvfs, FTAG);
2880 return (0);
2881 }
2882 /*
2883 * If the file system is case-folding, then we may
2884 * have some more checking to do. A case-folding file
2885 * system is either supporting mixed case sensitivity
2886 * access or is completely case-insensitive. Note
2887 * that the file system is always case preserving.
2888 *
2889 * In mixed sensitivity mode case sensitive behavior
2890 * is the default. FIGNORECASE must be used to
2891 * explicitly request case insensitive behavior.
2892 *
2893 * If the source and target names provided differ only
2894 * by case (e.g., a request to rename 'tim' to 'Tim'),
2895 * we will treat this as a special case in the
2896 * case-insensitive mode: as long as the source name
2897 * is an exact match, we will allow this to proceed as
2898 * a name-change request.
2899 */
2900 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2901 (zfsvfs->z_case == ZFS_CASE_MIXED &&
2902 flags & FIGNORECASE)) &&
2903 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2904 &error) == 0) {
2905 /*
2906 * case preserving rename request, require exact
2907 * name matches
2908 */
2909 zflg |= ZCIEXACT;
2910 zflg &= ~ZCILOOK;
2911 }
2912 }
2913
2914 /*
2915 * If the source and destination directories are the same, we should
2916 * grab the z_name_lock of that directory only once.
2917 */
2918 if (sdzp == tdzp) {
2919 zflg |= ZHAVELOCK;
2920 rw_enter(&sdzp->z_name_lock, RW_READER);
2921 }
2922
2923 if (cmp < 0) {
2924 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2925 ZEXISTS | zflg, NULL, NULL);
2926 terr = zfs_dirent_lock(&tdl,
2927 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2928 } else {
2929 terr = zfs_dirent_lock(&tdl,
2930 tdzp, tnm, &tzp, zflg, NULL, NULL);
2931 serr = zfs_dirent_lock(&sdl,
2932 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2933 NULL, NULL);
2934 }
2935
2936 if (serr) {
2937 /*
2938 * Source entry invalid or not there.
2939 */
2940 if (!terr) {
2941 zfs_dirent_unlock(tdl);
2942 if (tzp)
2943 zrele(tzp);
2944 }
2945
2946 if (sdzp == tdzp)
2947 rw_exit(&sdzp->z_name_lock);
2948
2949 if (strcmp(snm, "..") == 0)
2950 serr = EINVAL;
2951 zfs_exit(zfsvfs, FTAG);
2952 return (serr);
2953 }
2954 if (terr) {
2955 zfs_dirent_unlock(sdl);
2956 zrele(szp);
2957
2958 if (sdzp == tdzp)
2959 rw_exit(&sdzp->z_name_lock);
2960
2961 if (strcmp(tnm, "..") == 0)
2962 terr = EINVAL;
2963 zfs_exit(zfsvfs, FTAG);
2964 return (terr);
2965 }
2966
2967 /*
2968 * If we are using project inheritance, means if the directory has
2969 * ZFS_PROJINHERIT set, then its descendant directories will inherit
2970 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2971 * such case, we only allow renames into our tree when the project
2972 * IDs are the same.
2973 */
2974 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2975 tdzp->z_projid != szp->z_projid) {
2976 error = SET_ERROR(EXDEV);
2977 goto out;
2978 }
2979
2980 /*
2981 * Must have write access at the source to remove the old entry
2982 * and write access at the target to create the new entry.
2983 * Note that if target and source are the same, this can be
2984 * done in a single check.
2985 */
2986 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2987 goto out;
2988
2989 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2990 /*
2991 * Check to make sure rename is valid.
2992 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2993 */
2994 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2995 goto out;
2996 }
2997
2998 /*
2999 * Does target exist?
3000 */
3001 if (tzp) {
3002 if (rflags & RENAME_NOREPLACE) {
3003 error = SET_ERROR(EEXIST);
3004 goto out;
3005 }
3006 /*
3007 * Source and target must be the same type (unless exchanging).
3008 */
3009 if (!(rflags & RENAME_EXCHANGE)) {
3010 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
3011 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
3012
3013 if (s_is_dir != t_is_dir) {
3014 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
3015 goto out;
3016 }
3017 }
3018 /*
3019 * POSIX dictates that when the source and target
3020 * entries refer to the same file object, rename
3021 * must do nothing and exit without error.
3022 */
3023 if (szp->z_id == tzp->z_id) {
3024 error = 0;
3025 goto out;
3026 }
3027 } else if (rflags & RENAME_EXCHANGE) {
3028 /* Target must exist for RENAME_EXCHANGE. */
3029 error = SET_ERROR(ENOENT);
3030 goto out;
3031 }
3032
3033 /* Set up inode creation for RENAME_WHITEOUT. */
3034 if (rflags & RENAME_WHITEOUT) {
3035 /*
3036 * Whiteout files are not regular files or directories, so to
3037 * match zfs_create() we do not inherit the project id.
3038 */
3039 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
3040
3041 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
3042 if (error)
3043 goto out;
3044
3045 if (!have_acl) {
3046 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
3047 &acl_ids, mnt_ns);
3048 if (error)
3049 goto out;
3050 have_acl = B_TRUE;
3051 }
3052
3053 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
3054 error = SET_ERROR(EDQUOT);
3055 goto out;
3056 }
3057 }
3058
3059 tx = dmu_tx_create(zfsvfs->z_os);
3060 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3061 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3062 dmu_tx_hold_zap(tx, sdzp->z_id,
3063 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
3064 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3065 if (sdzp != tdzp) {
3066 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3067 zfs_sa_upgrade_txholds(tx, tdzp);
3068 }
3069 if (tzp) {
3070 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3071 zfs_sa_upgrade_txholds(tx, tzp);
3072 }
3073 if (rflags & RENAME_WHITEOUT) {
3074 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3075 ZFS_SA_BASE_ATTR_SIZE);
3076
3077 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
3078 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3079 if (!zfsvfs->z_use_sa &&
3080 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3081 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3082 0, acl_ids.z_aclp->z_acl_bytes);
3083 }
3084 }
3085 fuid_dirtied = zfsvfs->z_fuid_dirty;
3086 if (fuid_dirtied)
3087 zfs_fuid_txhold(zfsvfs, tx);
3088 zfs_sa_upgrade_txholds(tx, szp);
3089 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3090 error = dmu_tx_assign(tx,
3091 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3092 if (error) {
3093 if (zl != NULL)
3094 zfs_rename_unlock(&zl);
3095 zfs_dirent_unlock(sdl);
3096 zfs_dirent_unlock(tdl);
3097
3098 if (sdzp == tdzp)
3099 rw_exit(&sdzp->z_name_lock);
3100
3101 if (error == ERESTART) {
3102 waited = B_TRUE;
3103 dmu_tx_wait(tx);
3104 dmu_tx_abort(tx);
3105 zrele(szp);
3106 if (tzp)
3107 zrele(tzp);
3108 goto top;
3109 }
3110 dmu_tx_abort(tx);
3111 zrele(szp);
3112 if (tzp)
3113 zrele(tzp);
3114 zfs_exit(zfsvfs, FTAG);
3115 return (error);
3116 }
3117
3118 /*
3119 * Unlink the source.
3120 */
3121 szp->z_pflags |= ZFS_AV_MODIFIED;
3122 if (tdzp->z_pflags & ZFS_PROJINHERIT)
3123 szp->z_pflags |= ZFS_PROJINHERIT;
3124
3125 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3126 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3127 VERIFY0(error);
3128
3129 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3130 if (error)
3131 goto commit;
3132
3133 /*
3134 * Unlink the target.
3135 */
3136 if (tzp) {
3137 int tzflg = zflg;
3138
3139 if (rflags & RENAME_EXCHANGE) {
3140 /* This inode will be re-linked soon. */
3141 tzflg |= ZRENAMING;
3142
3143 tzp->z_pflags |= ZFS_AV_MODIFIED;
3144 if (sdzp->z_pflags & ZFS_PROJINHERIT)
3145 tzp->z_pflags |= ZFS_PROJINHERIT;
3146
3147 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3148 (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3149 ASSERT0(error);
3150 }
3151 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3152 if (error)
3153 goto commit_link_szp;
3154 }
3155
3156 /*
3157 * Create the new target links:
3158 * * We always link the target.
3159 * * RENAME_EXCHANGE: Link the old target to the source.
3160 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3161 */
3162 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3163 if (error) {
3164 /*
3165 * If we have removed the existing target, a subsequent call to
3166 * zfs_link_create() to add back the same entry, but with a new
3167 * dnode (szp), should not fail.
3168 */
3169 ASSERT0P(tzp);
3170 goto commit_link_tzp;
3171 }
3172
3173 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3174 case RENAME_EXCHANGE:
3175 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3176 /*
3177 * The same argument as zfs_link_create() failing for
3178 * szp applies here, since the source directory must
3179 * have had an entry we are replacing.
3180 */
3181 ASSERT0(error);
3182 if (error)
3183 goto commit_unlink_td_szp;
3184 break;
3185 case RENAME_WHITEOUT:
3186 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3187 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3188 if (error) {
3189 zfs_znode_delete(wzp, tx);
3190 remove_inode_hash(ZTOI(wzp));
3191 goto commit_unlink_td_szp;
3192 }
3193 break;
3194 }
3195
3196 if (fuid_dirtied)
3197 zfs_fuid_sync(zfsvfs, tx);
3198
3199 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3200 case RENAME_EXCHANGE:
3201 zfs_log_rename_exchange(zilog, tx,
3202 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3203 tdzp, tdl->dl_name, szp);
3204 break;
3205 case RENAME_WHITEOUT:
3206 zfs_log_rename_whiteout(zilog, tx,
3207 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3208 tdzp, tdl->dl_name, szp, wzp);
3209 break;
3210 default:
3211 ASSERT0(rflags & ~RENAME_NOREPLACE);
3212 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3213 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3214 break;
3215 }
3216
3217 commit:
3218 dmu_tx_commit(tx);
3219 out:
3220 if (have_acl)
3221 zfs_acl_ids_free(&acl_ids);
3222
3223 zfs_znode_update_vfs(sdzp);
3224 if (sdzp == tdzp)
3225 rw_exit(&sdzp->z_name_lock);
3226
3227 if (sdzp != tdzp)
3228 zfs_znode_update_vfs(tdzp);
3229
3230 zfs_znode_update_vfs(szp);
3231 zrele(szp);
3232 if (wzp) {
3233 zfs_znode_update_vfs(wzp);
3234 zrele(wzp);
3235 }
3236 if (tzp) {
3237 zfs_znode_update_vfs(tzp);
3238 zrele(tzp);
3239 }
3240
3241 if (zl != NULL)
3242 zfs_rename_unlock(&zl);
3243
3244 zfs_dirent_unlock(sdl);
3245 zfs_dirent_unlock(tdl);
3246
3247 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3248 error = zil_commit(zilog, 0);
3249
3250 zfs_exit(zfsvfs, FTAG);
3251 return (error);
3252
3253 /*
3254 * Clean-up path for broken link state.
3255 *
3256 * At this point we are in a (very) bad state, so we need to do our
3257 * best to correct the state. In particular, all of the nlinks are
3258 * wrong because we were destroying and creating links with ZRENAMING.
3259 *
3260 * In some form, all of these operations have to resolve the state:
3261 *
3262 * * link_destroy() *must* succeed. Fortunately, this is very likely
3263 * since we only just created it.
3264 *
3265 * * link_create()s are allowed to fail (though they shouldn't because
3266 * we only just unlinked them and are putting the entries back
3267 * during clean-up). But if they fail, we can just forcefully drop
3268 * the nlink value to (at the very least) avoid broken nlink values
3269 * -- though in the case of non-empty directories we will have to
3270 * panic (otherwise we'd have a leaked directory with a broken ..).
3271 */
3272 commit_unlink_td_szp:
3273 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3274 commit_link_tzp:
3275 if (tzp) {
3276 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3277 VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3278 }
3279 commit_link_szp:
3280 if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3281 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3282 goto commit;
3283 }
3284
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dzp	- Directory to contain new symbolic link.
 *		name	- Name of directory entry in dzp.
 *		vap	- Attributes of new entry.
 *		link	- Contents (target path) of the new symlink.
 *		cr	- credentials of caller.
 *		flags	- case flags
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	zpp	- Znode for new symbolic link.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
3303 int
zfs_symlink(znode_t * dzp,char * name,vattr_t * vap,char * link,znode_t ** zpp,cred_t * cr,int flags,zidmap_t * mnt_ns)3304 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3305 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3306 {
3307 znode_t *zp;
3308 zfs_dirlock_t *dl;
3309 dmu_tx_t *tx;
3310 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
3311 zilog_t *zilog;
3312 uint64_t len = strlen(link);
3313 int error;
3314 int zflg = ZNEW;
3315 zfs_acl_ids_t acl_ids;
3316 boolean_t fuid_dirtied;
3317 uint64_t txtype = TX_SYMLINK;
3318 boolean_t waited = B_FALSE;
3319
3320 ASSERT(S_ISLNK(vap->va_mode));
3321
3322 if (name == NULL)
3323 return (SET_ERROR(EINVAL));
3324
3325 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3326 return (error);
3327 zilog = zfsvfs->z_log;
3328
3329 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3330 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3331 zfs_exit(zfsvfs, FTAG);
3332 return (SET_ERROR(EILSEQ));
3333 }
3334 if (flags & FIGNORECASE)
3335 zflg |= ZCILOOK;
3336
3337 if (len > MAXPATHLEN) {
3338 zfs_exit(zfsvfs, FTAG);
3339 return (SET_ERROR(ENAMETOOLONG));
3340 }
3341
3342 if ((error = zfs_acl_ids_create(dzp, 0,
3343 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3344 zfs_exit(zfsvfs, FTAG);
3345 return (error);
3346 }
3347 top:
3348 *zpp = NULL;
3349
3350 /*
3351 * Attempt to lock directory; fail if entry already exists.
3352 */
3353 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3354 if (error) {
3355 zfs_acl_ids_free(&acl_ids);
3356 zfs_exit(zfsvfs, FTAG);
3357 return (error);
3358 }
3359
3360 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3361 zfs_acl_ids_free(&acl_ids);
3362 zfs_dirent_unlock(dl);
3363 zfs_exit(zfsvfs, FTAG);
3364 return (error);
3365 }
3366
3367 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3368 zfs_acl_ids_free(&acl_ids);
3369 zfs_dirent_unlock(dl);
3370 zfs_exit(zfsvfs, FTAG);
3371 return (SET_ERROR(EDQUOT));
3372 }
3373 tx = dmu_tx_create(zfsvfs->z_os);
3374 fuid_dirtied = zfsvfs->z_fuid_dirty;
3375 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3376 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3377 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3378 ZFS_SA_BASE_ATTR_SIZE + len);
3379 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3380 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3381 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3382 acl_ids.z_aclp->z_acl_bytes);
3383 }
3384 if (fuid_dirtied)
3385 zfs_fuid_txhold(zfsvfs, tx);
3386 error = dmu_tx_assign(tx,
3387 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3388 if (error) {
3389 zfs_dirent_unlock(dl);
3390 if (error == ERESTART) {
3391 waited = B_TRUE;
3392 dmu_tx_wait(tx);
3393 dmu_tx_abort(tx);
3394 goto top;
3395 }
3396 zfs_acl_ids_free(&acl_ids);
3397 dmu_tx_abort(tx);
3398 zfs_exit(zfsvfs, FTAG);
3399 return (error);
3400 }
3401
3402 /*
3403 * Create a new object for the symlink.
3404 * for version 4 ZPL datasets the symlink will be an SA attribute
3405 */
3406 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3407
3408 if (fuid_dirtied)
3409 zfs_fuid_sync(zfsvfs, tx);
3410
3411 mutex_enter(&zp->z_lock);
3412 if (zp->z_is_sa)
3413 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3414 link, len, tx);
3415 else
3416 zfs_sa_symlink(zp, link, len, tx);
3417 mutex_exit(&zp->z_lock);
3418
3419 zp->z_size = len;
3420 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3421 &zp->z_size, sizeof (zp->z_size), tx);
3422 /*
3423 * Insert the new object into the directory.
3424 */
3425 error = zfs_link_create(dl, zp, tx, ZNEW);
3426 if (error != 0) {
3427 zfs_znode_delete(zp, tx);
3428 remove_inode_hash(ZTOI(zp));
3429 } else {
3430 if (flags & FIGNORECASE)
3431 txtype |= TX_CI;
3432 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3433
3434 zfs_znode_update_vfs(dzp);
3435 zfs_znode_update_vfs(zp);
3436 }
3437
3438 zfs_acl_ids_free(&acl_ids);
3439
3440 dmu_tx_commit(tx);
3441
3442 zfs_dirent_unlock(dl);
3443
3444 if (error == 0) {
3445 *zpp = zp;
3446
3447 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3448 error = zil_commit(zilog, 0);
3449 } else {
3450 zrele(zp);
3451 }
3452
3453 zfs_exit(zfsvfs, FTAG);
3454 return (error);
3455 }
3456
3457 /*
3458 * Return, in the buffer contained in the provided uio structure,
3459 * the symbolic path referred to by ip.
3460 *
3461 * IN: ip - inode of symbolic link
3462 * uio - structure to contain the link path.
3463 * cr - credentials of caller.
3464 *
3465 * RETURN: 0 if success
3466 * error code if failure
3467 *
3468 * Timestamps:
3469 * ip - atime updated
3470 */
3471 int
zfs_readlink(struct inode * ip,zfs_uio_t * uio,cred_t * cr)3472 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3473 {
3474 (void) cr;
3475 znode_t *zp = ITOZ(ip);
3476 zfsvfs_t *zfsvfs = ITOZSB(ip);
3477 int error;
3478
3479 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3480 return (error);
3481
3482 mutex_enter(&zp->z_lock);
3483 if (zp->z_is_sa)
3484 error = sa_lookup_uio(zp->z_sa_hdl,
3485 SA_ZPL_SYMLINK(zfsvfs), uio);
3486 else
3487 error = zfs_sa_readlink(zp, uio);
3488 mutex_exit(&zp->z_lock);
3489
3490 zfs_exit(zfsvfs, FTAG);
3491 return (error);
3492 }
3493
3494 /*
3495 * Insert a new entry into directory tdzp referencing szp.
3496 *
3497 * IN: tdzp - Directory to contain new entry.
3498 * szp - znode of new entry.
3499 * name - name of new entry.
3500 * cr - credentials of caller.
3501 * flags - case flags.
3502 *
3503 * RETURN: 0 if success
3504 * error code if failure
3505 *
3506 * Timestamps:
3507 * tdzp - ctime|mtime updated
3508 * szp - ctime updated
3509 */
3510 int
zfs_link(znode_t * tdzp,znode_t * szp,char * name,cred_t * cr,int flags)3511 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3512 int flags)
3513 {
3514 struct inode *sip = ZTOI(szp);
3515 znode_t *tzp;
3516 zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
3517 zilog_t *zilog;
3518 zfs_dirlock_t *dl;
3519 dmu_tx_t *tx;
3520 int error;
3521 int zf = ZNEW;
3522 uint64_t parent;
3523 uid_t owner;
3524 boolean_t waited = B_FALSE;
3525 boolean_t is_tmpfile = 0;
3526 uint64_t txg;
3527
3528 is_tmpfile = (sip->i_nlink == 0 &&
3529 (inode_state_read_once(sip) & I_LINKABLE));
3530
3531 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3532
3533 if (name == NULL)
3534 return (SET_ERROR(EINVAL));
3535
3536 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3537 return (error);
3538 zilog = zfsvfs->z_log;
3539
3540 /*
3541 * POSIX dictates that we return EPERM here.
3542 * Better choices include ENOTSUP or EISDIR.
3543 */
3544 if (S_ISDIR(sip->i_mode)) {
3545 zfs_exit(zfsvfs, FTAG);
3546 return (SET_ERROR(EPERM));
3547 }
3548
3549 if ((error = zfs_verify_zp(szp)) != 0) {
3550 zfs_exit(zfsvfs, FTAG);
3551 return (error);
3552 }
3553
3554 /*
3555 * If we are using project inheritance, means if the directory has
3556 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3557 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3558 * such case, we only allow hard link creation in our tree when the
3559 * project IDs are the same.
3560 */
3561 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3562 tdzp->z_projid != szp->z_projid) {
3563 zfs_exit(zfsvfs, FTAG);
3564 return (SET_ERROR(EXDEV));
3565 }
3566
3567 /*
3568 * We check i_sb because snapshots and the ctldir must have different
3569 * super blocks.
3570 */
3571 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3572 zfs_exit(zfsvfs, FTAG);
3573 return (SET_ERROR(EXDEV));
3574 }
3575
3576 /* Prevent links to .zfs/shares files */
3577
3578 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3579 &parent, sizeof (uint64_t))) != 0) {
3580 zfs_exit(zfsvfs, FTAG);
3581 return (error);
3582 }
3583 if (parent == zfsvfs->z_shares_dir) {
3584 zfs_exit(zfsvfs, FTAG);
3585 return (SET_ERROR(EPERM));
3586 }
3587
3588 if (zfsvfs->z_utf8 && u8_validate(name,
3589 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3590 zfs_exit(zfsvfs, FTAG);
3591 return (SET_ERROR(EILSEQ));
3592 }
3593 if (flags & FIGNORECASE)
3594 zf |= ZCILOOK;
3595
3596 /*
3597 * We do not support links between attributes and non-attributes
3598 * because of the potential security risk of creating links
3599 * into "normal" file space in order to circumvent restrictions
3600 * imposed in attribute space.
3601 */
3602 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3603 zfs_exit(zfsvfs, FTAG);
3604 return (SET_ERROR(EINVAL));
3605 }
3606
3607 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3608 cr, ZFS_OWNER);
3609 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3610 zfs_exit(zfsvfs, FTAG);
3611 return (SET_ERROR(EPERM));
3612 }
3613
3614 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3615 zfs_init_idmap))) {
3616 zfs_exit(zfsvfs, FTAG);
3617 return (error);
3618 }
3619
3620 top:
3621 /*
3622 * Attempt to lock directory; fail if entry already exists.
3623 */
3624 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3625 if (error) {
3626 zfs_exit(zfsvfs, FTAG);
3627 return (error);
3628 }
3629
3630 tx = dmu_tx_create(zfsvfs->z_os);
3631 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3632 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3633 if (is_tmpfile)
3634 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3635
3636 zfs_sa_upgrade_txholds(tx, szp);
3637 zfs_sa_upgrade_txholds(tx, tdzp);
3638 error = dmu_tx_assign(tx,
3639 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3640 if (error) {
3641 zfs_dirent_unlock(dl);
3642 if (error == ERESTART) {
3643 waited = B_TRUE;
3644 dmu_tx_wait(tx);
3645 dmu_tx_abort(tx);
3646 goto top;
3647 }
3648 dmu_tx_abort(tx);
3649 zfs_exit(zfsvfs, FTAG);
3650 return (error);
3651 }
3652 /* unmark z_unlinked so zfs_link_create will not reject */
3653 if (is_tmpfile)
3654 szp->z_unlinked = B_FALSE;
3655 error = zfs_link_create(dl, szp, tx, 0);
3656
3657 if (error == 0) {
3658 uint64_t txtype = TX_LINK;
3659 /*
3660 * tmpfile is created to be in z_unlinkedobj, so remove it.
3661 * Also, we don't log in ZIL, because all previous file
3662 * operation on the tmpfile are ignored by ZIL. Instead we
3663 * always wait for txg to sync to make sure all previous
3664 * operation are sync safe.
3665 */
3666 if (is_tmpfile) {
3667 VERIFY0(zap_remove_int(zfsvfs->z_os,
3668 zfsvfs->z_unlinkedobj, szp->z_id, tx));
3669 } else {
3670 if (flags & FIGNORECASE)
3671 txtype |= TX_CI;
3672 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3673 }
3674 } else if (is_tmpfile) {
3675 /* restore z_unlinked since when linking failed */
3676 szp->z_unlinked = B_TRUE;
3677 }
3678 txg = dmu_tx_get_txg(tx);
3679 dmu_tx_commit(tx);
3680
3681 zfs_dirent_unlock(dl);
3682
3683 if (error == 0) {
3684 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3685 error = zil_commit(zilog, 0);
3686
3687 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3688 txg_wait_flag_t wait_flags =
3689 spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
3690 ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
3691 error = txg_wait_synced_flags(
3692 dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
3693 if (error != 0) {
3694 ASSERT3U(error, ==, ESHUTDOWN);
3695 error = SET_ERROR(EIO);
3696 }
3697 }
3698 }
3699
3700 zfs_znode_update_vfs(tdzp);
3701 zfs_znode_update_vfs(szp);
3702 zfs_exit(zfsvfs, FTAG);
3703 return (error);
3704 }
3705
/*
 * Complete writeback of page pp.  A non-zero err marks the page dirty
 * again before the writeback state is ended.
 */
static inline void
zfs_page_writeback_done(struct page *pp, int err)
{
	if (err) {
		/*
		 * The write did not make it out.  The page was undirtied
		 * before the IO was issued (in zfs_putpage() or
		 * write_cache_pages()), and the kernel only considers
		 * dirty pages for writeback.  Without re-dirtying it here
		 * the page could be evicted without ever being written,
		 * which must not happen.
		 */
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
		__set_page_dirty_nobuffers(pp);
#endif
	}

	/* Clear the error flag and end writeback in both cases. */
	ClearPageError(pp);
	end_page_writeback(pp);
}
3728
/*
 * Page writeback callback for the ZIL.  zfs_putpage() hands it to
 * zfs_log_write() for syncing writes.  It runs once the ZIL itx has been
 * written to the log or the whole txg has synced, and also if the ZIL
 * crashes or the pool suspends; `err` carries any failure.  `arg` is the
 * page that was passed to zfs_log_write().
 */
static void
zfs_putpage_commit_cb(void *arg, int err)
{
	struct page *pp = arg;

	zfs_page_writeback_done(pp, err);
}
3740
/*
 * Push a page out to disk.  Once the page is on stable storage the
 * registered commit callback is run as notification of completion.
 *
 *	IN:	ip	 - inode the page is mapped for.
 *		pp	 - page to push (page is locked on entry).
 *		wbc	 - writeback control data.
 *		for_sync - does the caller intend to wait synchronously for
 *			   the page writeback to complete?
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - the inode's current mtime and ctime are written to the SA
 */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
    boolean_t for_sync)
{
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	loff_t		offset;
	loff_t		pgoff;
	unsigned int	pglen;
	dmu_tx_t	*tx;
	caddr_t		va;
	int		err = 0;
	uint64_t	mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t	bulk[3];
	int		cnt = 0;
	struct address_space *mapping;

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);

	ASSERT(PageLocked(pp));

	pgoff = page_offset(pp);	/* Page byte-offset in file */
	offset = i_size_read(ip);	/* File length in bytes */
	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);

	/* Page is beyond end of file: nothing to write, just unlock it */
	if (pgoff >= offset) {
		unlock_page(pp);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Truncate page length to end of file */
	if (pgoff + pglen > offset)
		pglen = offset - pgoff;

#if 0
	/*
	 * FIXME: Allow mmap writes past its quota.  The correct fix
	 * is to register a page_mkwrite() handler to count the page
	 * against its quota when it is about to be dirtied.
	 */
	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
	    KUID_TO_SUID(ip->i_uid)) ||
	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
	    KGID_TO_SGID(ip->i_gid)) ||
	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
	    zp->z_projid))) {
		err = EDQUOT;
	}
#endif

	/*
	 * The ordering here is critical and must adhere to the following
	 * rules in order to avoid deadlocking in either zfs_read() or
	 * zfs_free_range() due to a lock inversion.
	 *
	 * 1) The page must be unlocked prior to acquiring the range lock.
	 *    This is critical because zfs_read() calls find_lock_page()
	 *    which may block on the page lock while holding the range lock.
	 *
	 * 2) Before setting or clearing writeback on a page the range lock
	 *    must be held in order to prevent a lock inversion with the
	 *    zfs_free_range() function.
	 *
	 * This presents a problem because upon entering this function the
	 * page lock is already held.  To safely acquire the range lock the
	 * page lock must be dropped.  This creates a window where another
	 * process could truncate, invalidate, dirty, or write out the page.
	 *
	 * Therefore, after successfully reacquiring the range and page locks
	 * the current page state is checked.  In the common case everything
	 * will be as expected and the page can be written out.  However, if
	 * the page state has changed it must be handled accordingly.
	 */
	mapping = pp->mapping;
	redirty_page_for_writepage(wbc, pp);
	unlock_page(pp);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    pgoff, pglen, RL_WRITER);
	lock_page(pp);

	/* Page mapping changed or it was no longer dirty, we're done */
	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Another process already started writeback of this page.  Drop
	 * our locks and, for a syncing writeback, block until it is done.
	 */
	if (PageWriteback(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);

		if (wbc->sync_mode != WB_SYNC_NONE) {
			if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
				folio_wait_bit(page_folio(pp), PG_writeback);
#else
				wait_on_page_bit(pp, PG_writeback);
#endif
		}

		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Clear the dirty flag now that the required locks are held */
	if (!clear_page_dirty_for_io(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Counterpart for redirty_page_for_writepage() above.  This page
	 * was in fact not skipped and should not be counted as if it were.
	 */
	wbc->pages_skipped--;
	set_page_writeback(pp);
	unlock_page(pp);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		/* Passing the error re-dirties the page and ends writeback */
		zfs_page_writeback_done(pp, err);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);

		/*
		 * Don't return error for an async writeback; we've re-dirtied
		 * the page so it will be tried again some other time.
		 */
		return (for_sync ? err : 0);
	}

	/* Copy the page contents into the DMU under this transaction */
	va = kmap(pp);
	ASSERT3U(pglen, <=, PAGE_SIZE);
	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx,
	    DMU_READ_PREFETCH);
	kunmap(pp);

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/* Preserve the mtime and ctime provided by the inode */
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	zp->z_atime_dirty = B_FALSE;
	zp->z_seq++;

	/*
	 * NOTE(review): this status is what the function returns when
	 * need_commit is false; when need_commit is true it is overwritten
	 * by the zil_commit_flags() result below.
	 */
	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

	/*
	 * A note about for_sync vs wbc->sync_mode.
	 *
	 * for_sync indicates that this is a syncing writeback, that is, the
	 * kernel caller expects the data to be durably stored before being
	 * notified.  Often, but not always, the call was triggered by a
	 * userspace syncing op (eg fsync(), msync(MS_SYNC)).  For our
	 * purposes, for_sync==TRUE means that the page should remain
	 * "locked" (in the writeback state) until it is definitely on disk
	 * (ie zil_commit() or spa_sync()).  Otherwise, we can unlock and
	 * return as soon as it is on the in-memory ZIL.
	 *
	 * wbc->sync_mode has similar meaning.  wbc is passed from the kernel
	 * to zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
	 * indicates this is a regular async writeback (eg a cache eviction)
	 * and so does not need a durability guarantee, while WB_SYNC_ALL
	 * indicates a syncing op that must be waited on (by convention, we
	 * test for !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer
	 * durability over performance should there ever be a new mode that
	 * we have not yet added support for).
	 *
	 * So, why a separate for_sync field?  This is because
	 * zpl_writepages() calls zfs_putpage() multiple times for a single
	 * "logical" operation.  It wants all the individual pages to be
	 * for_sync==TRUE, ie only unlocked once durably stored, but it only
	 * wants one call to zil_commit() at the very end, once all the pages
	 * are synced.  So, it repurposes sync_mode slightly to indicate who
	 * will issue and wait for the IO: for NONE, the caller of
	 * zfs_putpage() will do it, while for ALL, zfs_putpage() should do
	 * it.
	 *
	 * Summary:
	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
	 *   sync_mode: NONE=caller will commit; ALL=we will commit
	 */
	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);

	/*
	 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
	 * because it is a policy flag that indicates "someone will call
	 * zil_commit() soon".  for_sync=TRUE means exactly that; the only
	 * question is whether it will be us, or zpl_writepages().
	 */
	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);

	if (!for_sync) {
		/*
		 * Async writeback is logged and written to the DMU, so page
		 * can now be unlocked.
		 */
		zfs_page_writeback_done(pp, 0);
	}

	dmu_tx_commit(tx);

	zfs_rangelock_exit(lr);

	if (need_commit) {
		err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
		if (err != 0) {
			/* Commit failed: return without updating kstats */
			zfs_exit(zfsvfs, FTAG);
			return (err);
		}
	}

	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);

	zfs_exit(zfsvfs, FTAG);
	return (err);
}
3996
3997 /*
3998 * Update the system attributes when the inode has been dirtied. For the
3999 * moment we only update the mode, atime, mtime, and ctime.
4000 */
4001 int
zfs_dirty_inode(struct inode * ip,int flags)4002 zfs_dirty_inode(struct inode *ip, int flags)
4003 {
4004 znode_t *zp = ITOZ(ip);
4005 zfsvfs_t *zfsvfs = ITOZSB(ip);
4006 dmu_tx_t *tx;
4007 uint64_t mode, atime[2], mtime[2], ctime[2];
4008 inode_timespec_t tmp_ts;
4009 sa_bulk_attr_t bulk[4];
4010 int error = 0;
4011 int cnt = 0;
4012
4013 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4014 return (0);
4015
4016 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4017 return (error);
4018
4019 #ifdef I_DIRTY_TIME
4020 /*
4021 * This is the lazytime semantic introduced in Linux 4.0
4022 * This flag will only be called from update_time when lazytime is set.
4023 * (Note, I_DIRTY_SYNC will also set if not lazytime)
4024 * Fortunately mtime and ctime are managed within ZFS itself, so we
4025 * only need to dirty atime.
4026 */
4027 if (flags == I_DIRTY_TIME) {
4028 zp->z_atime_dirty = B_TRUE;
4029 goto out;
4030 }
4031 #endif
4032
4033 tx = dmu_tx_create(zfsvfs->z_os);
4034
4035 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4036 zfs_sa_upgrade_txholds(tx, zp);
4037
4038 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4039 if (error) {
4040 dmu_tx_abort(tx);
4041 goto out;
4042 }
4043
4044 mutex_enter(&zp->z_lock);
4045 zp->z_atime_dirty = B_FALSE;
4046
4047 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4048 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4049 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4050 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4051
4052 /* Preserve the mode, mtime and ctime provided by the inode */
4053 tmp_ts = zpl_inode_get_atime(ip);
4054 ZFS_TIME_ENCODE(&tmp_ts, atime);
4055 tmp_ts = zpl_inode_get_mtime(ip);
4056 ZFS_TIME_ENCODE(&tmp_ts, mtime);
4057 tmp_ts = zpl_inode_get_ctime(ip);
4058 ZFS_TIME_ENCODE(&tmp_ts, ctime);
4059 mode = ip->i_mode;
4060
4061 zp->z_mode = mode;
4062
4063 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4064 mutex_exit(&zp->z_lock);
4065
4066 dmu_tx_commit(tx);
4067 out:
4068 zfs_exit(zfsvfs, FTAG);
4069 return (error);
4070 }
4071
4072 void
zfs_inactive(struct inode * ip)4073 zfs_inactive(struct inode *ip)
4074 {
4075 znode_t *zp = ITOZ(ip);
4076 zfsvfs_t *zfsvfs = ITOZSB(ip);
4077 uint64_t atime[2];
4078 int error;
4079 int need_unlock = 0;
4080
4081 /* Only read lock if we haven't already write locked, e.g. rollback */
4082 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
4083 need_unlock = 1;
4084 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4085 }
4086 if (zp->z_sa_hdl == NULL) {
4087 if (need_unlock)
4088 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4089 return;
4090 }
4091
4092 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
4093 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4094
4095 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4096 zfs_sa_upgrade_txholds(tx, zp);
4097 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4098 if (error) {
4099 dmu_tx_abort(tx);
4100 } else {
4101 inode_timespec_t tmp_atime;
4102 tmp_atime = zpl_inode_get_atime(ip);
4103 ZFS_TIME_ENCODE(&tmp_atime, atime);
4104 mutex_enter(&zp->z_lock);
4105 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4106 (void *)&atime, sizeof (atime), tx);
4107 zp->z_atime_dirty = B_FALSE;
4108 mutex_exit(&zp->z_lock);
4109 dmu_tx_commit(tx);
4110 }
4111 }
4112
4113 zfs_zinactive(zp);
4114 if (need_unlock)
4115 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4116 }
4117
4118 /*
4119 * Fill pages with data from the disk.
4120 */
4121 static int
zfs_fillpage(struct inode * ip,struct page * pp)4122 zfs_fillpage(struct inode *ip, struct page *pp)
4123 {
4124 znode_t *zp = ITOZ(ip);
4125 zfsvfs_t *zfsvfs = ITOZSB(ip);
4126 loff_t i_size = i_size_read(ip);
4127 u_offset_t io_off = page_offset(pp);
4128 size_t io_len = PAGE_SIZE;
4129
4130 ASSERT3U(io_off, <, i_size);
4131
4132 if (io_off + io_len > i_size)
4133 io_len = i_size - io_off;
4134
4135 void *va = kmap(pp);
4136 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
4137 io_len, va, DMU_READ_PREFETCH);
4138 if (io_len != PAGE_SIZE)
4139 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
4140 kunmap(pp);
4141
4142 if (error) {
4143 /* convert checksum errors into IO errors */
4144 if (error == ECKSUM)
4145 error = SET_ERROR(EIO);
4146
4147 SetPageError(pp);
4148 ClearPageUptodate(pp);
4149 } else {
4150 ClearPageError(pp);
4151 SetPageUptodate(pp);
4152 }
4153
4154 return (error);
4155 }
4156
4157 /*
4158 * Uses zfs_fillpage to read data from the file and fill the page.
4159 *
4160 * IN: ip - inode of file to get data from.
4161 * pp - page to read
4162 *
4163 * RETURN: 0 on success, error code on failure.
4164 *
4165 * Timestamps:
4166 * vp - atime updated
4167 */
4168 int
zfs_getpage(struct inode * ip,struct page * pp)4169 zfs_getpage(struct inode *ip, struct page *pp)
4170 {
4171 zfsvfs_t *zfsvfs = ITOZSB(ip);
4172 znode_t *zp = ITOZ(ip);
4173 int error;
4174 loff_t i_size = i_size_read(ip);
4175 u_offset_t io_off = page_offset(pp);
4176 size_t io_len = PAGE_SIZE;
4177
4178 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4179 return (error);
4180
4181 ASSERT3U(io_off, <, i_size);
4182
4183 if (io_off + io_len > i_size)
4184 io_len = i_size - io_off;
4185
4186 /*
4187 * It is important to hold the rangelock here because it is possible
4188 * a Direct I/O write or block clone might be taking place at the same
4189 * time that a page is being faulted in through filemap_fault(). With
4190 * Direct I/O writes and block cloning db->db_data will be set to NULL
4191 * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the
4192 * rangelock is not held, then there is a race between faulting in a
4193 * page and writing out a Direct I/O write or block cloning. Without
4194 * the rangelock a NULL pointer dereference can occur in
4195 * dmu_read_impl() for db->db_data during the mempcy operation when
4196 * zfs_fillpage() calls dmu_read().
4197 */
4198 zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
4199 io_off, io_len, RL_READER);
4200 if (lr == NULL) {
4201 /*
4202 * It is important to drop the page lock before grabbing the
4203 * rangelock to avoid another deadlock between here and
4204 * zfs_write() -> update_pages(). update_pages() holds both the
4205 * rangelock and the page lock.
4206 */
4207 get_page(pp);
4208 unlock_page(pp);
4209 lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
4210 io_len, RL_READER);
4211 lock_page(pp);
4212 put_page(pp);
4213 }
4214 error = zfs_fillpage(ip, pp);
4215 zfs_rangelock_exit(lr);
4216
4217 if (error == 0)
4218 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
4219
4220 zfs_exit(zfsvfs, FTAG);
4221
4222 return (error);
4223 }
4224
4225 /*
4226 * Check ZFS specific permissions to memory map a section of a file.
4227 *
4228 * IN: ip - inode of the file to mmap
4229 * off - file offset
4230 * addrp - start address in memory region
4231 * len - length of memory region
4232 * vm_flags- address flags
4233 *
4234 * RETURN: 0 if success
4235 * error code if failure
4236 */
4237 int
zfs_map(struct inode * ip,offset_t off,caddr_t * addrp,size_t len,unsigned long vm_flags)4238 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4239 unsigned long vm_flags)
4240 {
4241 (void) addrp;
4242 znode_t *zp = ITOZ(ip);
4243 zfsvfs_t *zfsvfs = ITOZSB(ip);
4244 int error;
4245
4246 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4247 return (error);
4248
4249 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
4250 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4251 zfs_exit(zfsvfs, FTAG);
4252 return (SET_ERROR(EPERM));
4253 }
4254
4255 if ((vm_flags & (VM_READ | VM_EXEC)) &&
4256 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4257 zfs_exit(zfsvfs, FTAG);
4258 return (SET_ERROR(EACCES));
4259 }
4260
4261 if (off < 0 || len > MAXOFFSET_T - off) {
4262 zfs_exit(zfsvfs, FTAG);
4263 return (SET_ERROR(ENXIO));
4264 }
4265
4266 zfs_exit(zfsvfs, FTAG);
4267 return (0);
4268 }
4269
4270 /*
4271 * Free or allocate space in a file. Currently, this function only
4272 * supports the `F_FREESP' command. However, this command is somewhat
4273 * misnamed, as its functionality includes the ability to allocate as
4274 * well as free space.
4275 *
4276 * IN: zp - znode of file to free data in.
4277 * cmd - action to take (only F_FREESP supported).
4278 * bfp - section of file to free/alloc.
4279 * flag - current file open mode flags.
4280 * offset - current file offset.
4281 * cr - credentials of caller.
4282 *
4283 * RETURN: 0 on success, error code on failure.
4284 *
4285 * Timestamps:
4286 * zp - ctime|mtime updated
4287 */
4288 int
zfs_space(znode_t * zp,int cmd,flock64_t * bfp,int flag,offset_t offset,cred_t * cr)4289 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4290 offset_t offset, cred_t *cr)
4291 {
4292 (void) offset;
4293 zfsvfs_t *zfsvfs = ZTOZSB(zp);
4294 uint64_t off, len;
4295 int error;
4296
4297 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4298 return (error);
4299
4300 if (cmd != F_FREESP) {
4301 zfs_exit(zfsvfs, FTAG);
4302 return (SET_ERROR(EINVAL));
4303 }
4304
4305 /*
4306 * Callers might not be able to detect properly that we are read-only,
4307 * so check it explicitly here.
4308 */
4309 if (zfs_is_readonly(zfsvfs)) {
4310 zfs_exit(zfsvfs, FTAG);
4311 return (SET_ERROR(EROFS));
4312 }
4313
4314 if (bfp->l_len < 0) {
4315 zfs_exit(zfsvfs, FTAG);
4316 return (SET_ERROR(EINVAL));
4317 }
4318
4319 /*
4320 * Permissions aren't checked on Solaris because on this OS
4321 * zfs_space() can only be called with an opened file handle.
4322 * On Linux we can get here through truncate_range() which
4323 * operates directly on inodes, so we need to check access rights.
4324 */
4325 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4326 zfs_init_idmap))) {
4327 zfs_exit(zfsvfs, FTAG);
4328 return (error);
4329 }
4330
4331 off = bfp->l_start;
4332 len = bfp->l_len; /* 0 means from off to end of file */
4333
4334 error = zfs_freesp(zp, off, len, flag, TRUE);
4335
4336 zfs_exit(zfsvfs, FTAG);
4337 return (error);
4338 }
4339
4340 int
zfs_fid(struct inode * ip,fid_t * fidp)4341 zfs_fid(struct inode *ip, fid_t *fidp)
4342 {
4343 znode_t *zp = ITOZ(ip);
4344 zfsvfs_t *zfsvfs = ITOZSB(ip);
4345 uint32_t gen;
4346 uint64_t gen64;
4347 uint64_t object = zp->z_id;
4348 zfid_short_t *zfid;
4349 int size, i, error;
4350
4351 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4352 return (error);
4353
4354 if (fidp->fid_len < SHORT_FID_LEN) {
4355 fidp->fid_len = SHORT_FID_LEN;
4356 zfs_exit(zfsvfs, FTAG);
4357 return (SET_ERROR(ENOSPC));
4358 }
4359
4360 if ((error = zfs_verify_zp(zp)) != 0) {
4361 zfs_exit(zfsvfs, FTAG);
4362 return (error);
4363 }
4364
4365 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4366 &gen64, sizeof (uint64_t))) != 0) {
4367 zfs_exit(zfsvfs, FTAG);
4368 return (error);
4369 }
4370
4371 gen = (uint32_t)gen64;
4372
4373 size = SHORT_FID_LEN;
4374
4375 zfid = (zfid_short_t *)fidp;
4376
4377 zfid->zf_len = size;
4378
4379 for (i = 0; i < sizeof (zfid->zf_object); i++)
4380 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4381
4382 /* Must have a non-zero generation number to distinguish from .zfs */
4383 if (gen == 0)
4384 gen = 1;
4385 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4386 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4387
4388 zfs_exit(zfsvfs, FTAG);
4389 return (0);
4390 }
4391
#if defined(_KERNEL)
/* Symbols exported for use outside this translation unit (kernel only). */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/* zfs_delete_blocks is a ulong module parameter with mode 0644. */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif
4419