1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2025, Klara, Inc.
29 */
30
31 /* Portions Copyright 2007 Jeremy Teo */
32 /* Portions Copyright 2010 Robert Milkowski */
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/time.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/file.h>
40 #include <sys/stat.h>
41 #include <sys/kmem.h>
42 #include <sys/taskq.h>
43 #include <sys/uio.h>
44 #include <sys/vmsystm.h>
45 #include <sys/atomic.h>
46 #include <sys/pathname.h>
47 #include <sys/cmn_err.h>
48 #include <sys/errno.h>
49 #include <sys/zfs_dir.h>
50 #include <sys/zfs_acl.h>
51 #include <sys/zfs_ioctl.h>
52 #include <sys/fs/zfs.h>
53 #include <sys/dmu.h>
54 #include <sys/dmu_objset.h>
55 #include <sys/spa.h>
56 #include <sys/txg.h>
57 #include <sys/dbuf.h>
58 #include <sys/zap.h>
59 #include <sys/sa.h>
60 #include <sys/policy.h>
61 #include <sys/sunddi.h>
62 #include <sys/sid.h>
63 #include <sys/zfs_ctldir.h>
64 #include <sys/zfs_fuid.h>
65 #include <sys/zfs_quota.h>
66 #include <sys/zfs_sa.h>
67 #include <sys/zfs_vnops.h>
68 #include <sys/zfs_rlock.h>
69 #include <sys/cred.h>
70 #include <sys/zpl.h>
71 #include <sys/zil.h>
72 #include <sys/sa_impl.h>
73 #include <linux/mm_compat.h>
74
75 /*
76 * Programming rules.
77 *
78 * Each vnode op performs some logical unit of work. To do this, the ZPL must
79 * properly lock its in-core state, create a DMU transaction, do the work,
80 * record this work in the intent log (ZIL), commit the DMU transaction,
81 * and wait for the intent log to commit if it is a synchronous operation.
82 * Moreover, the vnode ops must work in both normal and log replay context.
83 * The ordering of events is important to avoid deadlocks and references
84 * to freed memory. The example below illustrates the following Big Rules:
85 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using zfs_enter(), or with
 *	zfs_enter_verify_zp(zfsvfs, zp, FTAG) which also verifies the znode.
 *	A zfs_exit(zfsvfs, FTAG) is needed before all returns made after a
 *	successful enter.  Any other znodes must be checked with
 *	zfs_verify_zp(zp).  These calls return an error code; the caller
 *	must check it and return it itself.
91 *
92 * (2) zrele() should always be the last thing except for zil_commit() (if
93 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
94 * last reference, the vnode/znode can be freed, so the zp may point to
95 * freed memory. Second, the last reference will call zfs_zinactive(),
96 * which may induce a lot of work -- pushing cached pages (which acquires
97 * range locks) and syncing out cached atime changes. Third,
98 * zfs_zinactive() may require a new tx, which could deadlock the system
99 * if you were already holding one. This deadlock occurs because the tx
100 * currently being operated on prevents a txg from syncing, which
101 * prevents the new tx from progressing, resulting in a deadlock. If you
102 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
103 * is a synonym for zrele().
104 *
105 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
106 * as they can span dmu_tx_assign() calls.
107 *
108 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
109 * dmu_tx_assign(). This is critical because we don't want to block
110 * while holding locks.
111 *
112 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
113 * This reduces lock contention and CPU usage when we must wait (note
114 * that if throughput is constrained by the storage, nearly every
115 * transaction must wait).
116 *
117 * Note, in particular, that if a lock is sometimes acquired before
118 * the tx assigns, and sometimes after (e.g. z_lock), then failing
119 * to use a non-blocking assign can deadlock the system. The scenario:
120 *
121 * Thread A has grabbed a lock before calling dmu_tx_assign().
122 * Thread B is in an already-assigned tx, and blocks for this lock.
123 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
124 * txg_wait_open() forever, because the previous txg can't quiesce
125 * until B's tx commits.
126 *
127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
128 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
129 * again. On subsequent calls to dmu_tx_assign(), pass
130 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
131 * this operation has already called dmu_tx_wait(). This will ensure
132 * that we don't retry forever, waiting a short bit each time.
133 *
134 * (5) If the operation succeeded, generate the intent log entry for it
135 * before dropping locks. This ensures that the ordering of events
136 * in the intent log matches the order in which they actually occurred.
137 * During ZIL replay the zfs_log_* functions will update the sequence
138 * number to indicate the zil transaction has replayed.
139 *
140 * (6) At the end of each vnode op, the DMU tx must always commit,
141 * regardless of whether there were any errors.
142 *
143 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
144 * to ensure that synchronous semantics are provided when necessary.
145 *
146 * In general, this is how things should be ordered in each vnode op:
147 *
148 * zfs_enter(zfsvfs); // exit if unmounted
149 * top:
150 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
151 * rw_enter(...); // grab any other locks you need
152 * tx = dmu_tx_create(...); // get DMU tx
153 * dmu_tx_hold_*(); // hold each object you might modify
154 * error = dmu_tx_assign(tx,
155 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
156 * if (error) {
157 * rw_exit(...); // drop locks
158 * zfs_dirent_unlock(dl); // unlock directory entry
159 * zrele(...); // release held znodes
160 * if (error == ERESTART) {
161 * waited = B_TRUE;
162 * dmu_tx_wait(tx);
163 * dmu_tx_abort(tx);
164 * goto top;
165 * }
166 * dmu_tx_abort(tx); // abort DMU tx
167 * zfs_exit(zfsvfs); // finished in zfs
168 * return (error); // really out of space
169 * }
170 * error = do_real_work(); // do whatever this VOP does
171 * if (error == 0)
172 * zfs_log_*(...); // on success, make ZIL entry
173 * dmu_tx_commit(tx); // commit DMU tx -- error or not
174 * rw_exit(...); // drop locks
175 * zfs_dirent_unlock(dl); // unlock directory entry
176 * zrele(...); // release held znodes
177 * zil_commit(zilog, foid); // synchronous when necessary
178 * zfs_exit(zfsvfs); // finished in zfs
179 * return (error); // done, report error
180 */
181 int
zfs_open(struct inode * ip,int mode,int flag,cred_t * cr)182 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
183 {
184 (void) cr;
185 znode_t *zp = ITOZ(ip);
186 zfsvfs_t *zfsvfs = ITOZSB(ip);
187 int error;
188
189 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
190 return (error);
191
192 /* Honor ZFS_APPENDONLY file attribute */
193 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
194 ((flag & O_APPEND) == 0)) {
195 zfs_exit(zfsvfs, FTAG);
196 return (SET_ERROR(EPERM));
197 }
198
199 /*
200 * Keep a count of the synchronous opens in the znode. On first
201 * synchronous open we must convert all previous async transactions
202 * into sync to keep correct ordering.
203 * Skip it for snapshot, as it won't have any transactions.
204 */
205 if (!zfsvfs->z_issnap && (flag & O_SYNC)) {
206 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
207 zil_async_to_sync(zfsvfs->z_log, zp->z_id);
208 }
209
210 zfs_exit(zfsvfs, FTAG);
211 return (0);
212 }
213
214 int
zfs_close(struct inode * ip,int flag,cred_t * cr)215 zfs_close(struct inode *ip, int flag, cred_t *cr)
216 {
217 (void) cr;
218 znode_t *zp = ITOZ(ip);
219 zfsvfs_t *zfsvfs = ITOZSB(ip);
220 int error;
221
222 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
223 return (error);
224
225 /* Decrement the synchronous opens in the znode */
226 if (!zfsvfs->z_issnap && (flag & O_SYNC))
227 atomic_dec_32(&zp->z_sync_cnt);
228
229 zfs_exit(zfsvfs, FTAG);
230 return (0);
231 }
232
233 #if defined(_KERNEL)
234
/*
 * Forward declaration: mappedread() below calls zfs_fillpage() to fill a
 * page cache page that it found locked but not up to date.
 */
static int zfs_fillpage(struct inode *ip, struct page *pp);
236
237 /*
238 * When a file is memory mapped, we must keep the IO data synchronized
239 * between the DMU cache and the memory mapped pages. Update all mapped
240 * pages with the contents of the coresponding dmu buffer.
241 */
242 void
update_pages(znode_t * zp,int64_t start,int len,objset_t * os)243 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
244 {
245 struct address_space *mp = ZTOI(zp)->i_mapping;
246 int64_t off = start & (PAGE_SIZE - 1);
247
248 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
249 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
250
251 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
252 if (pp) {
253 if (mapping_writably_mapped(mp))
254 flush_dcache_page(pp);
255
256 void *pb = kmap(pp);
257 int error = dmu_read(os, zp->z_id, start + off,
258 nbytes, pb + off, DMU_READ_PREFETCH);
259 kunmap(pp);
260
261 if (error) {
262 SetPageError(pp);
263 ClearPageUptodate(pp);
264 } else {
265 ClearPageError(pp);
266 SetPageUptodate(pp);
267
268 if (mapping_writably_mapped(mp))
269 flush_dcache_page(pp);
270
271 mark_page_accessed(pp);
272 }
273
274 unlock_page(pp);
275 put_page(pp);
276 }
277
278 len -= nbytes;
279 off = 0;
280 }
281 }
282
283 /*
284 * When a file is memory mapped, we must keep the I/O data synchronized
285 * between the DMU cache and the memory mapped pages. Preferentially read
286 * from memory mapped pages, otherwise fallback to reading through the dmu.
287 */
288 int
mappedread(znode_t * zp,int nbytes,zfs_uio_t * uio)289 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
290 {
291 struct inode *ip = ZTOI(zp);
292 struct address_space *mp = ip->i_mapping;
293 int64_t start = uio->uio_loffset;
294 int64_t off = start & (PAGE_SIZE - 1);
295 int len = nbytes;
296 int error = 0;
297
298 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
299 uint64_t bytes = MIN(PAGE_SIZE - off, len);
300
301 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
302 if (pp) {
303
304 /*
305 * If filemap_fault() retries there exists a window
306 * where the page will be unlocked and not up to date.
307 * In this case we must try and fill the page.
308 */
309 if (unlikely(!PageUptodate(pp))) {
310 error = zfs_fillpage(ip, pp);
311 if (error) {
312 unlock_page(pp);
313 put_page(pp);
314 return (error);
315 }
316 }
317
318 ASSERT(PageUptodate(pp) || PageDirty(pp));
319
320 unlock_page(pp);
321
322 void *pb = kmap(pp);
323 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
324 kunmap(pp);
325
326 if (mapping_writably_mapped(mp))
327 flush_dcache_page(pp);
328
329 mark_page_accessed(pp);
330 put_page(pp);
331 } else {
332 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
333 uio, bytes, DMU_READ_PREFETCH);
334 }
335
336 len -= bytes;
337 off = 0;
338
339 if (error)
340 break;
341 }
342
343 return (error);
344 }
345 #endif /* _KERNEL */
346
/*
 * Block-count threshold used by zfs_remove(): a file whose z_size exceeds
 * z_blksz * zfs_delete_blocks is considered "too big", so only a token
 * range is held for freeing and the znode is not deleted immediately.
 * Initialized to DMU_MAX_DELETEBLKCNT.
 */
static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
348
349 /*
350 * Write the bytes to a file.
351 *
352 * IN: zp - znode of file to be written to
353 * data - bytes to write
354 * len - number of bytes to write
355 * pos - offset to start writing at
356 *
357 * OUT: resid - remaining bytes to write
358 *
359 * RETURN: 0 if success
360 * positive error code if failure. EIO is returned
361 * for a short write when residp isn't provided.
362 *
363 * Timestamps:
364 * zp - ctime|mtime updated if byte count > 0
365 */
366 int
zfs_write_simple(znode_t * zp,const void * data,size_t len,loff_t pos,size_t * residp)367 zfs_write_simple(znode_t *zp, const void *data, size_t len,
368 loff_t pos, size_t *residp)
369 {
370 fstrans_cookie_t cookie;
371 int error;
372
373 struct iovec iov;
374 iov.iov_base = (void *)data;
375 iov.iov_len = len;
376
377 zfs_uio_t uio;
378 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
379
380 cookie = spl_fstrans_mark();
381 error = zfs_write(zp, &uio, 0, kcred);
382 spl_fstrans_unmark(cookie);
383
384 if (error == 0) {
385 if (residp != NULL)
386 *residp = zfs_uio_resid(&uio);
387 else if (zfs_uio_resid(&uio) != 0)
388 error = SET_ERROR(EIO);
389 }
390
391 return (error);
392 }
393
/*
 * Taskq callback dispatched by zfs_zrele_async(): drops the inode
 * reference that could not be released inline.
 *
 *	IN:	arg	- the struct inode to iput().
 */
static void
zfs_rele_async_task(void *arg)
{
	struct inode *ip = arg;

	iput(ip);
}
399
400 void
zfs_zrele_async(znode_t * zp)401 zfs_zrele_async(znode_t *zp)
402 {
403 struct inode *ip = ZTOI(zp);
404 objset_t *os = ITOZSB(ip)->z_os;
405
406 ASSERT(atomic_read(&ip->i_count) > 0);
407 ASSERT(os != NULL);
408
409 /*
410 * If decrementing the count would put us at 0, we can't do it inline
411 * here, because that would be synchronous. Instead, dispatch an iput
412 * to run later.
413 *
414 * For more information on the dangers of a synchronous iput, see the
415 * header comment of this file.
416 */
417 if (!atomic_add_unless(&ip->i_count, -1, 1)) {
418 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
419 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
420 }
421 }
422
423
424 /*
425 * Lookup an entry in a directory, or an extended attribute directory.
426 * If it exists, return a held inode reference for it.
427 *
428 * IN: zdp - znode of directory to search.
429 * nm - name of entry to lookup.
430 * flags - LOOKUP_XATTR set if looking for an attribute.
431 * cr - credentials of caller.
432 * direntflags - directory lookup flags
433 * realpnp - returned pathname.
434 *
435 * OUT: zpp - znode of located entry, NULL if not found.
436 *
437 * RETURN: 0 on success, error code on failure.
438 *
439 * Timestamps:
440 * NA
441 */
442 int
zfs_lookup(znode_t * zdp,char * nm,znode_t ** zpp,int flags,cred_t * cr,int * direntflags,pathname_t * realpnp)443 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
444 int *direntflags, pathname_t *realpnp)
445 {
446 zfsvfs_t *zfsvfs = ZTOZSB(zdp);
447 int error = 0;
448
449 /*
450 * Fast path lookup, however we must skip DNLC lookup
451 * for case folding or normalizing lookups because the
452 * DNLC code only stores the passed in name. This means
453 * creating 'a' and removing 'A' on a case insensitive
454 * file system would work, but DNLC still thinks 'a'
455 * exists and won't let you create it again on the next
456 * pass through fast path.
457 */
458 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
459
460 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
461 return (SET_ERROR(ENOTDIR));
462 } else if (zdp->z_sa_hdl == NULL) {
463 return (SET_ERROR(EIO));
464 }
465
466 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
467 error = zfs_fastaccesschk_execute(zdp, cr);
468 if (!error) {
469 *zpp = zdp;
470 zhold(*zpp);
471 return (0);
472 }
473 return (error);
474 }
475 }
476
477 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
478 return (error);
479
480 *zpp = NULL;
481
482 if (flags & LOOKUP_XATTR) {
483 /*
484 * We don't allow recursive attributes..
485 * Maybe someday we will.
486 */
487 if (zdp->z_pflags & ZFS_XATTR) {
488 zfs_exit(zfsvfs, FTAG);
489 return (SET_ERROR(EINVAL));
490 }
491
492 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
493 zfs_exit(zfsvfs, FTAG);
494 return (error);
495 }
496
497 /*
498 * Do we have permission to get into attribute directory?
499 */
500
501 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
502 B_TRUE, cr, zfs_init_idmap))) {
503 zrele(*zpp);
504 *zpp = NULL;
505 }
506
507 zfs_exit(zfsvfs, FTAG);
508 return (error);
509 }
510
511 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
512 zfs_exit(zfsvfs, FTAG);
513 return (SET_ERROR(ENOTDIR));
514 }
515
516 /*
517 * Check accessibility of directory.
518 */
519
520 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
521 zfs_init_idmap))) {
522 zfs_exit(zfsvfs, FTAG);
523 return (error);
524 }
525
526 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
527 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
528 zfs_exit(zfsvfs, FTAG);
529 return (SET_ERROR(EILSEQ));
530 }
531
532 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
533 if ((error == 0) && (*zpp))
534 zfs_znode_update_vfs(*zpp);
535
536 zfs_exit(zfsvfs, FTAG);
537 return (error);
538 }
539
540 /*
541 * Perform a linear search in directory for the name of specific inode.
542 * Note we don't pass in the buffer size of name because it's hardcoded to
543 * NAME_MAX+1(256) in Linux.
544 *
545 * IN: dzp - znode of directory to search.
546 * zp - znode of the target
547 *
548 * OUT: name - dentry name of the target
549 *
550 * RETURN: 0 on success, error code on failure.
551 */
552 int
zfs_get_name(znode_t * dzp,char * name,znode_t * zp)553 zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
554 {
555 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
556 int error = 0;
557
558 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
559 return (error);
560
561 if ((error = zfs_verify_zp(zp)) != 0) {
562 zfs_exit(zfsvfs, FTAG);
563 return (error);
564 }
565
566 /* ctldir should have got their name in zfs_vget */
567 if (dzp->z_is_ctldir || zp->z_is_ctldir) {
568 zfs_exit(zfsvfs, FTAG);
569 return (ENOENT);
570 }
571
572 /* buffer len is hardcoded to 256 in Linux kernel */
573 error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
574 ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);
575
576 zfs_exit(zfsvfs, FTAG);
577 return (error);
578 }
579
580 /*
581 * Attempt to create a new entry in a directory. If the entry
582 * already exists, truncate the file if permissible, else return
583 * an error. Return the ip of the created or trunc'd file.
584 *
585 * IN: dzp - znode of directory to put new file entry in.
586 * name - name of new file entry.
587 * vap - attributes of new file.
588 * excl - flag indicating exclusive or non-exclusive mode.
589 * mode - mode to open file with.
590 * cr - credentials of caller.
591 * flag - file flag.
592 * vsecp - ACL to be set
593 * mnt_ns - user namespace of the mount
594 *
595 * OUT: zpp - znode of created or trunc'd entry.
596 *
597 * RETURN: 0 on success, error code on failure.
598 *
599 * Timestamps:
600 * dzp - ctime|mtime updated if new entry created
601 * zp - ctime|mtime always, atime if new
602 */
603 int
zfs_create(znode_t * dzp,char * name,vattr_t * vap,int excl,int mode,znode_t ** zpp,cred_t * cr,int flag,vsecattr_t * vsecp,zidmap_t * mnt_ns)604 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
605 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
606 zidmap_t *mnt_ns)
607 {
608 znode_t *zp;
609 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
610 zilog_t *zilog;
611 objset_t *os;
612 zfs_dirlock_t *dl;
613 dmu_tx_t *tx;
614 int error;
615 uid_t uid;
616 gid_t gid;
617 zfs_acl_ids_t acl_ids;
618 boolean_t fuid_dirtied;
619 boolean_t have_acl = B_FALSE;
620 boolean_t waited = B_FALSE;
621 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
622
623 /*
624 * If we have an ephemeral id, ACL, or XVATTR then
625 * make sure file system is at proper version
626 */
627
628 gid = crgetgid(cr);
629 uid = crgetuid(cr);
630
631 if (zfsvfs->z_use_fuids == B_FALSE &&
632 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
633 return (SET_ERROR(EINVAL));
634
635 if (name == NULL)
636 return (SET_ERROR(EINVAL));
637
638 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
639 return (error);
640 os = zfsvfs->z_os;
641 zilog = zfsvfs->z_log;
642
643 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
644 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
645 zfs_exit(zfsvfs, FTAG);
646 return (SET_ERROR(EILSEQ));
647 }
648
649 if (vap->va_mask & ATTR_XVATTR) {
650 if ((error = secpolicy_xvattr((xvattr_t *)vap,
651 crgetuid(cr), cr, vap->va_mode)) != 0) {
652 zfs_exit(zfsvfs, FTAG);
653 return (error);
654 }
655 }
656
657 top:
658 *zpp = NULL;
659 if (*name == '\0') {
660 /*
661 * Null component name refers to the directory itself.
662 */
663 zhold(dzp);
664 zp = dzp;
665 dl = NULL;
666 error = 0;
667 } else {
668 /* possible igrab(zp) */
669 int zflg = 0;
670
671 if (flag & FIGNORECASE)
672 zflg |= ZCILOOK;
673
674 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
675 NULL, NULL);
676 if (error) {
677 if (have_acl)
678 zfs_acl_ids_free(&acl_ids);
679 if (strcmp(name, "..") == 0)
680 error = SET_ERROR(EISDIR);
681 zfs_exit(zfsvfs, FTAG);
682 return (error);
683 }
684 }
685
686 if (zp == NULL) {
687 uint64_t txtype;
688 uint64_t projid = ZFS_DEFAULT_PROJID;
689
690 /*
691 * Create a new file object and update the directory
692 * to reference it.
693 */
694 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
695 mnt_ns))) {
696 if (have_acl)
697 zfs_acl_ids_free(&acl_ids);
698 goto out;
699 }
700
701 /*
702 * We only support the creation of regular files in
703 * extended attribute directories.
704 */
705
706 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
707 if (have_acl)
708 zfs_acl_ids_free(&acl_ids);
709 error = SET_ERROR(EINVAL);
710 goto out;
711 }
712
713 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
714 cr, vsecp, &acl_ids, mnt_ns)) != 0)
715 goto out;
716 have_acl = B_TRUE;
717
718 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
719 projid = zfs_inherit_projid(dzp);
720 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
721 zfs_acl_ids_free(&acl_ids);
722 error = SET_ERROR(EDQUOT);
723 goto out;
724 }
725
726 tx = dmu_tx_create(os);
727
728 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
729 ZFS_SA_BASE_ATTR_SIZE);
730
731 fuid_dirtied = zfsvfs->z_fuid_dirty;
732 if (fuid_dirtied)
733 zfs_fuid_txhold(zfsvfs, tx);
734 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
735 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
736 if (!zfsvfs->z_use_sa &&
737 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
738 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
739 0, acl_ids.z_aclp->z_acl_bytes);
740 }
741
742 error = dmu_tx_assign(tx,
743 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
744 if (error) {
745 zfs_dirent_unlock(dl);
746 if (error == ERESTART) {
747 waited = B_TRUE;
748 dmu_tx_wait(tx);
749 dmu_tx_abort(tx);
750 goto top;
751 }
752 zfs_acl_ids_free(&acl_ids);
753 dmu_tx_abort(tx);
754 zfs_exit(zfsvfs, FTAG);
755 return (error);
756 }
757 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
758
759 error = zfs_link_create(dl, zp, tx, ZNEW);
760 if (error != 0) {
761 /*
762 * Since, we failed to add the directory entry for it,
763 * delete the newly created dnode.
764 */
765 zfs_znode_delete(zp, tx);
766 remove_inode_hash(ZTOI(zp));
767 zfs_acl_ids_free(&acl_ids);
768 dmu_tx_commit(tx);
769 goto out;
770 }
771
772 if (fuid_dirtied)
773 zfs_fuid_sync(zfsvfs, tx);
774
775 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
776 if (flag & FIGNORECASE)
777 txtype |= TX_CI;
778 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
779 vsecp, acl_ids.z_fuidp, vap);
780 zfs_acl_ids_free(&acl_ids);
781 dmu_tx_commit(tx);
782 } else {
783 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
784
785 if (have_acl)
786 zfs_acl_ids_free(&acl_ids);
787
788 /*
789 * A directory entry already exists for this name.
790 */
791 /*
792 * Can't truncate an existing file if in exclusive mode.
793 */
794 if (excl) {
795 error = SET_ERROR(EEXIST);
796 goto out;
797 }
798 /*
799 * Can't open a directory for writing.
800 */
801 if (S_ISDIR(ZTOI(zp)->i_mode)) {
802 error = SET_ERROR(EISDIR);
803 goto out;
804 }
805 /*
806 * Verify requested access to file.
807 */
808 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
809 mnt_ns))) {
810 goto out;
811 }
812
813 mutex_enter(&dzp->z_lock);
814 dzp->z_seq++;
815 mutex_exit(&dzp->z_lock);
816
817 /*
818 * Truncate regular files if requested.
819 */
820 if (S_ISREG(ZTOI(zp)->i_mode) &&
821 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
822 /* we can't hold any locks when calling zfs_freesp() */
823 if (dl) {
824 zfs_dirent_unlock(dl);
825 dl = NULL;
826 }
827 error = zfs_freesp(zp, 0, 0, mode, TRUE);
828 }
829 }
830 out:
831
832 if (dl)
833 zfs_dirent_unlock(dl);
834
835 if (error) {
836 if (zp)
837 zrele(zp);
838 } else {
839 zfs_znode_update_vfs(dzp);
840 zfs_znode_update_vfs(zp);
841 *zpp = zp;
842 }
843
844 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
845 error = zil_commit(zilog, 0);
846
847 zfs_exit(zfsvfs, FTAG);
848 return (error);
849 }
850
851 int
zfs_tmpfile(struct inode * dip,vattr_t * vap,int excl,int mode,struct inode ** ipp,cred_t * cr,int flag,vsecattr_t * vsecp,zidmap_t * mnt_ns)852 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
853 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
854 zidmap_t *mnt_ns)
855 {
856 (void) excl, (void) mode, (void) flag;
857 znode_t *zp = NULL, *dzp = ITOZ(dip);
858 zfsvfs_t *zfsvfs = ITOZSB(dip);
859 objset_t *os;
860 dmu_tx_t *tx;
861 int error;
862 uid_t uid;
863 gid_t gid;
864 zfs_acl_ids_t acl_ids;
865 uint64_t projid = ZFS_DEFAULT_PROJID;
866 boolean_t fuid_dirtied;
867 boolean_t have_acl = B_FALSE;
868 boolean_t waited = B_FALSE;
869
870 /*
871 * If we have an ephemeral id, ACL, or XVATTR then
872 * make sure file system is at proper version
873 */
874
875 gid = crgetgid(cr);
876 uid = crgetuid(cr);
877
878 if (zfsvfs->z_use_fuids == B_FALSE &&
879 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
880 return (SET_ERROR(EINVAL));
881
882 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
883 return (error);
884 os = zfsvfs->z_os;
885
886 if (vap->va_mask & ATTR_XVATTR) {
887 if ((error = secpolicy_xvattr((xvattr_t *)vap,
888 crgetuid(cr), cr, vap->va_mode)) != 0) {
889 zfs_exit(zfsvfs, FTAG);
890 return (error);
891 }
892 }
893
894 top:
895 *ipp = NULL;
896
897 /*
898 * Create a new file object and update the directory
899 * to reference it.
900 */
901 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
902 if (have_acl)
903 zfs_acl_ids_free(&acl_ids);
904 goto out;
905 }
906
907 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
908 cr, vsecp, &acl_ids, mnt_ns)) != 0)
909 goto out;
910 have_acl = B_TRUE;
911
912 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
913 projid = zfs_inherit_projid(dzp);
914 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
915 zfs_acl_ids_free(&acl_ids);
916 error = SET_ERROR(EDQUOT);
917 goto out;
918 }
919
920 tx = dmu_tx_create(os);
921
922 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
923 ZFS_SA_BASE_ATTR_SIZE);
924 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
925
926 fuid_dirtied = zfsvfs->z_fuid_dirty;
927 if (fuid_dirtied)
928 zfs_fuid_txhold(zfsvfs, tx);
929 if (!zfsvfs->z_use_sa &&
930 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
931 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
932 0, acl_ids.z_aclp->z_acl_bytes);
933 }
934 error = dmu_tx_assign(tx,
935 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
936 if (error) {
937 if (error == ERESTART) {
938 waited = B_TRUE;
939 dmu_tx_wait(tx);
940 dmu_tx_abort(tx);
941 goto top;
942 }
943 zfs_acl_ids_free(&acl_ids);
944 dmu_tx_abort(tx);
945 zfs_exit(zfsvfs, FTAG);
946 return (error);
947 }
948 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
949
950 if (fuid_dirtied)
951 zfs_fuid_sync(zfsvfs, tx);
952
953 /* Add to unlinked set */
954 zp->z_unlinked = B_TRUE;
955 zfs_unlinked_add(zp, tx);
956 zfs_acl_ids_free(&acl_ids);
957 dmu_tx_commit(tx);
958 out:
959
960 if (error) {
961 if (zp)
962 zrele(zp);
963 } else {
964 zfs_znode_update_vfs(dzp);
965 zfs_znode_update_vfs(zp);
966 *ipp = ZTOI(zp);
967 }
968
969 zfs_exit(zfsvfs, FTAG);
970 return (error);
971 }
972
973 /*
974 * Remove an entry from a directory.
975 *
976 * IN: dzp - znode of directory to remove entry from.
977 * name - name of entry to remove.
978 * cr - credentials of caller.
979 * flags - case flags.
980 *
981 * RETURN: 0 if success
982 * error code if failure
983 *
984 * Timestamps:
985 * dzp - ctime|mtime
986 * ip - ctime (if nlink > 0)
987 */
988
989 static uint64_t null_xattr = 0;
990
991 int
zfs_remove(znode_t * dzp,char * name,cred_t * cr,int flags)992 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
993 {
994 znode_t *zp;
995 znode_t *xzp;
996 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
997 zilog_t *zilog;
998 uint64_t acl_obj, xattr_obj;
999 uint64_t xattr_obj_unlinked = 0;
1000 uint64_t obj = 0;
1001 uint64_t links;
1002 zfs_dirlock_t *dl;
1003 dmu_tx_t *tx;
1004 boolean_t may_delete_now, delete_now = FALSE;
1005 boolean_t unlinked, toobig = FALSE;
1006 uint64_t txtype;
1007 pathname_t *realnmp = NULL;
1008 pathname_t realnm;
1009 int error;
1010 int zflg = ZEXISTS;
1011 boolean_t waited = B_FALSE;
1012
1013 if (name == NULL)
1014 return (SET_ERROR(EINVAL));
1015
1016 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1017 return (error);
1018 zilog = zfsvfs->z_log;
1019
1020 if (flags & FIGNORECASE) {
1021 zflg |= ZCILOOK;
1022 pn_alloc(&realnm);
1023 realnmp = &realnm;
1024 }
1025
1026 top:
1027 xattr_obj = 0;
1028 xzp = NULL;
1029 /*
1030 * Attempt to lock directory; fail if entry doesn't exist.
1031 */
1032 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1033 NULL, realnmp))) {
1034 if (realnmp)
1035 pn_free(realnmp);
1036 zfs_exit(zfsvfs, FTAG);
1037 return (error);
1038 }
1039
1040 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1041 goto out;
1042 }
1043
1044 /*
1045 * Need to use rmdir for removing directories.
1046 */
1047 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1048 error = SET_ERROR(EPERM);
1049 goto out;
1050 }
1051
1052 mutex_enter(&zp->z_lock);
1053 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1054 !zn_has_cached_data(zp, 0, LLONG_MAX);
1055 mutex_exit(&zp->z_lock);
1056
1057 /*
1058 * We may delete the znode now, or we may put it in the unlinked set;
1059 * it depends on whether we're the last link, and on whether there are
1060 * other holds on the inode. So we dmu_tx_hold() the right things to
1061 * allow for either case.
1062 */
1063 obj = zp->z_id;
1064 tx = dmu_tx_create(zfsvfs->z_os);
1065 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1066 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1067 zfs_sa_upgrade_txholds(tx, zp);
1068 zfs_sa_upgrade_txholds(tx, dzp);
1069 if (may_delete_now) {
1070 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1071 /* if the file is too big, only hold_free a token amount */
1072 dmu_tx_hold_free(tx, zp->z_id, 0,
1073 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1074 }
1075
1076 /* are there any extended attributes? */
1077 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1078 &xattr_obj, sizeof (xattr_obj));
1079 if (error == 0 && xattr_obj) {
1080 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1081 ASSERT0(error);
1082 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1083 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1084 }
1085
1086 mutex_enter(&zp->z_lock);
1087 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1088 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1089 mutex_exit(&zp->z_lock);
1090
1091 /* charge as an update -- would be nice not to charge at all */
1092 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1093
1094 /*
1095 * Mark this transaction as typically resulting in a net free of space
1096 */
1097 dmu_tx_mark_netfree(tx);
1098
1099 error = dmu_tx_assign(tx,
1100 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1101 if (error) {
1102 zfs_dirent_unlock(dl);
1103 if (error == ERESTART) {
1104 waited = B_TRUE;
1105 dmu_tx_wait(tx);
1106 dmu_tx_abort(tx);
1107 zrele(zp);
1108 if (xzp)
1109 zrele(xzp);
1110 goto top;
1111 }
1112 if (realnmp)
1113 pn_free(realnmp);
1114 dmu_tx_abort(tx);
1115 zrele(zp);
1116 if (xzp)
1117 zrele(xzp);
1118 zfs_exit(zfsvfs, FTAG);
1119 return (error);
1120 }
1121
1122 /*
1123 * Remove the directory entry.
1124 */
1125 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1126
1127 if (error) {
1128 dmu_tx_commit(tx);
1129 goto out;
1130 }
1131
1132 if (unlinked) {
1133 /*
1134 * Hold z_lock so that we can make sure that the ACL obj
1135 * hasn't changed. Could have been deleted due to
1136 * zfs_sa_upgrade().
1137 */
1138 mutex_enter(&zp->z_lock);
1139 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1140 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1141 delete_now = may_delete_now && !toobig &&
1142 atomic_read(&ZTOI(zp)->i_count) == 1 &&
1143 !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1144 xattr_obj == xattr_obj_unlinked &&
1145 zfs_external_acl(zp) == acl_obj;
1146 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1147 }
1148
1149 if (delete_now) {
1150 if (xattr_obj_unlinked) {
1151 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1152 mutex_enter(&xzp->z_lock);
1153 xzp->z_unlinked = B_TRUE;
1154 clear_nlink(ZTOI(xzp));
1155 links = 0;
1156 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1157 &links, sizeof (links), tx);
1158 ASSERT3U(error, ==, 0);
1159 mutex_exit(&xzp->z_lock);
1160 zfs_unlinked_add(xzp, tx);
1161
1162 if (zp->z_is_sa)
1163 error = sa_remove(zp->z_sa_hdl,
1164 SA_ZPL_XATTR(zfsvfs), tx);
1165 else
1166 error = sa_update(zp->z_sa_hdl,
1167 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1168 sizeof (uint64_t), tx);
1169 ASSERT0(error);
1170 }
1171 /*
1172 * Add to the unlinked set because a new reference could be
1173 * taken concurrently resulting in a deferred destruction.
1174 */
1175 zfs_unlinked_add(zp, tx);
1176 mutex_exit(&zp->z_lock);
1177 } else if (unlinked) {
1178 mutex_exit(&zp->z_lock);
1179 zfs_unlinked_add(zp, tx);
1180 }
1181
1182 txtype = TX_REMOVE;
1183 if (flags & FIGNORECASE)
1184 txtype |= TX_CI;
1185 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1186
1187 dmu_tx_commit(tx);
1188 out:
1189 if (realnmp)
1190 pn_free(realnmp);
1191
1192 zfs_dirent_unlock(dl);
1193 zfs_znode_update_vfs(dzp);
1194 zfs_znode_update_vfs(zp);
1195
1196 if (delete_now)
1197 zrele(zp);
1198 else
1199 zfs_zrele_async(zp);
1200
1201 if (xzp) {
1202 zfs_znode_update_vfs(xzp);
1203 zfs_zrele_async(xzp);
1204 }
1205
1206 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1207 error = zil_commit(zilog, 0);
1208
1209 zfs_exit(zfsvfs, FTAG);
1210 return (error);
1211 }
1212
1213 /*
1214 * Create a new directory and insert it into dzp using the name
1215 * provided. Return a pointer to the inserted directory.
1216 *
1217 * IN: dzp - znode of directory to add subdir to.
1218 * dirname - name of new directory.
1219 * vap - attributes of new directory.
1220 * cr - credentials of caller.
1221 * flags - case flags.
1222 * vsecp - ACL to be set
1223 * mnt_ns - user namespace of the mount
1224 *
1225 * OUT: zpp - znode of created directory.
1226 *
1227 * RETURN: 0 if success
1228 * error code if failure
1229 *
1230 * Timestamps:
1231 * dzp - ctime|mtime updated
1232 * zpp - ctime|mtime|atime updated
1233 */
1234 int
zfs_mkdir(znode_t * dzp,char * dirname,vattr_t * vap,znode_t ** zpp,cred_t * cr,int flags,vsecattr_t * vsecp,zidmap_t * mnt_ns)1235 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1236 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1237 {
1238 znode_t *zp;
1239 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1240 zilog_t *zilog;
1241 zfs_dirlock_t *dl;
1242 uint64_t txtype;
1243 dmu_tx_t *tx;
1244 int error;
1245 int zf = ZNEW;
1246 uid_t uid;
1247 gid_t gid = crgetgid(cr);
1248 zfs_acl_ids_t acl_ids;
1249 boolean_t fuid_dirtied;
1250 boolean_t waited = B_FALSE;
1251
1252 ASSERT(S_ISDIR(vap->va_mode));
1253
1254 /*
1255 * If we have an ephemeral id, ACL, or XVATTR then
1256 * make sure file system is at proper version
1257 */
1258
1259 uid = crgetuid(cr);
1260 if (zfsvfs->z_use_fuids == B_FALSE &&
1261 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1262 return (SET_ERROR(EINVAL));
1263
1264 if (dirname == NULL)
1265 return (SET_ERROR(EINVAL));
1266
1267 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1268 return (error);
1269 zilog = zfsvfs->z_log;
1270
1271 if (dzp->z_pflags & ZFS_XATTR) {
1272 zfs_exit(zfsvfs, FTAG);
1273 return (SET_ERROR(EINVAL));
1274 }
1275
1276 if (zfsvfs->z_utf8 && u8_validate(dirname,
1277 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1278 zfs_exit(zfsvfs, FTAG);
1279 return (SET_ERROR(EILSEQ));
1280 }
1281 if (flags & FIGNORECASE)
1282 zf |= ZCILOOK;
1283
1284 if (vap->va_mask & ATTR_XVATTR) {
1285 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1286 crgetuid(cr), cr, vap->va_mode)) != 0) {
1287 zfs_exit(zfsvfs, FTAG);
1288 return (error);
1289 }
1290 }
1291
1292 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1293 vsecp, &acl_ids, mnt_ns)) != 0) {
1294 zfs_exit(zfsvfs, FTAG);
1295 return (error);
1296 }
1297 /*
1298 * First make sure the new directory doesn't exist.
1299 *
1300 * Existence is checked first to make sure we don't return
1301 * EACCES instead of EEXIST which can cause some applications
1302 * to fail.
1303 */
1304 top:
1305 *zpp = NULL;
1306
1307 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1308 NULL, NULL))) {
1309 zfs_acl_ids_free(&acl_ids);
1310 zfs_exit(zfsvfs, FTAG);
1311 return (error);
1312 }
1313
1314 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1315 mnt_ns))) {
1316 zfs_acl_ids_free(&acl_ids);
1317 zfs_dirent_unlock(dl);
1318 zfs_exit(zfsvfs, FTAG);
1319 return (error);
1320 }
1321
1322 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1323 zfs_acl_ids_free(&acl_ids);
1324 zfs_dirent_unlock(dl);
1325 zfs_exit(zfsvfs, FTAG);
1326 return (SET_ERROR(EDQUOT));
1327 }
1328
1329 /*
1330 * Add a new entry to the directory.
1331 */
1332 tx = dmu_tx_create(zfsvfs->z_os);
1333 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1334 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1335 fuid_dirtied = zfsvfs->z_fuid_dirty;
1336 if (fuid_dirtied)
1337 zfs_fuid_txhold(zfsvfs, tx);
1338 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1339 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1340 acl_ids.z_aclp->z_acl_bytes);
1341 }
1342
1343 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1344 ZFS_SA_BASE_ATTR_SIZE);
1345
1346 error = dmu_tx_assign(tx,
1347 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1348 if (error) {
1349 zfs_dirent_unlock(dl);
1350 if (error == ERESTART) {
1351 waited = B_TRUE;
1352 dmu_tx_wait(tx);
1353 dmu_tx_abort(tx);
1354 goto top;
1355 }
1356 zfs_acl_ids_free(&acl_ids);
1357 dmu_tx_abort(tx);
1358 zfs_exit(zfsvfs, FTAG);
1359 return (error);
1360 }
1361
1362 /*
1363 * Create new node.
1364 */
1365 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1366
1367 /*
1368 * Now put new name in parent dir.
1369 */
1370 error = zfs_link_create(dl, zp, tx, ZNEW);
1371 if (error != 0) {
1372 zfs_znode_delete(zp, tx);
1373 remove_inode_hash(ZTOI(zp));
1374 goto out;
1375 }
1376
1377 if (fuid_dirtied)
1378 zfs_fuid_sync(zfsvfs, tx);
1379
1380 *zpp = zp;
1381
1382 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1383 if (flags & FIGNORECASE)
1384 txtype |= TX_CI;
1385 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1386 acl_ids.z_fuidp, vap);
1387
1388 out:
1389 zfs_acl_ids_free(&acl_ids);
1390
1391 dmu_tx_commit(tx);
1392
1393 zfs_dirent_unlock(dl);
1394
1395 if (error != 0) {
1396 zrele(zp);
1397 } else {
1398 zfs_znode_update_vfs(dzp);
1399 zfs_znode_update_vfs(zp);
1400
1401 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1402 error = zil_commit(zilog, 0);
1403
1404 }
1405 zfs_exit(zfsvfs, FTAG);
1406 return (error);
1407 }
1408
1409 /*
1410 * Remove a directory subdir entry. If the current working
1411 * directory is the same as the subdir to be removed, the
1412 * remove will fail.
1413 *
1414 * IN: dzp - znode of directory to remove from.
1415 * name - name of directory to be removed.
1416 * cwd - inode of current working directory.
1417 * cr - credentials of caller.
1418 * flags - case flags
1419 *
1420 * RETURN: 0 on success, error code on failure.
1421 *
1422 * Timestamps:
1423 * dzp - ctime|mtime updated
1424 */
1425 int
zfs_rmdir(znode_t * dzp,char * name,znode_t * cwd,cred_t * cr,int flags)1426 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1427 int flags)
1428 {
1429 znode_t *zp;
1430 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1431 zilog_t *zilog;
1432 zfs_dirlock_t *dl;
1433 dmu_tx_t *tx;
1434 int error;
1435 int zflg = ZEXISTS;
1436 boolean_t waited = B_FALSE;
1437
1438 if (name == NULL)
1439 return (SET_ERROR(EINVAL));
1440
1441 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1442 return (error);
1443 zilog = zfsvfs->z_log;
1444
1445 if (flags & FIGNORECASE)
1446 zflg |= ZCILOOK;
1447 top:
1448 zp = NULL;
1449
1450 /*
1451 * Attempt to lock directory; fail if entry doesn't exist.
1452 */
1453 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1454 NULL, NULL))) {
1455 zfs_exit(zfsvfs, FTAG);
1456 return (error);
1457 }
1458
1459 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1460 goto out;
1461 }
1462
1463 if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1464 error = SET_ERROR(ENOTDIR);
1465 goto out;
1466 }
1467
1468 if (zp == cwd) {
1469 error = SET_ERROR(EINVAL);
1470 goto out;
1471 }
1472
1473 /*
1474 * Grab a lock on the directory to make sure that no one is
1475 * trying to add (or lookup) entries while we are removing it.
1476 */
1477 rw_enter(&zp->z_name_lock, RW_WRITER);
1478
1479 /*
1480 * Grab a lock on the parent pointer to make sure we play well
1481 * with the treewalk and directory rename code.
1482 */
1483 rw_enter(&zp->z_parent_lock, RW_WRITER);
1484
1485 tx = dmu_tx_create(zfsvfs->z_os);
1486 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1487 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1488 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1489 zfs_sa_upgrade_txholds(tx, zp);
1490 zfs_sa_upgrade_txholds(tx, dzp);
1491 dmu_tx_mark_netfree(tx);
1492 error = dmu_tx_assign(tx,
1493 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1494 if (error) {
1495 rw_exit(&zp->z_parent_lock);
1496 rw_exit(&zp->z_name_lock);
1497 zfs_dirent_unlock(dl);
1498 if (error == ERESTART) {
1499 waited = B_TRUE;
1500 dmu_tx_wait(tx);
1501 dmu_tx_abort(tx);
1502 zrele(zp);
1503 goto top;
1504 }
1505 dmu_tx_abort(tx);
1506 zrele(zp);
1507 zfs_exit(zfsvfs, FTAG);
1508 return (error);
1509 }
1510
1511 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1512
1513 if (error == 0) {
1514 uint64_t txtype = TX_RMDIR;
1515 if (flags & FIGNORECASE)
1516 txtype |= TX_CI;
1517 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1518 B_FALSE);
1519 }
1520
1521 dmu_tx_commit(tx);
1522
1523 rw_exit(&zp->z_parent_lock);
1524 rw_exit(&zp->z_name_lock);
1525 out:
1526 zfs_dirent_unlock(dl);
1527
1528 zfs_znode_update_vfs(dzp);
1529 zfs_znode_update_vfs(zp);
1530 zrele(zp);
1531
1532 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1533 error = zil_commit(zilog, 0);
1534
1535 zfs_exit(zfsvfs, FTAG);
1536 return (error);
1537 }
1538
1539 /*
1540 * Read directory entries from the given directory cursor position and emit
1541 * name and position for each entry.
1542 *
1543 * IN: ip - inode of directory to read.
1544 * ctx - directory entry context.
1545 * cr - credentials of caller.
1546 *
1547 * RETURN: 0 if success
1548 * error code if failure
1549 *
1550 * Timestamps:
1551 * ip - atime updated
1552 *
1553 * Note that the low 4 bits of the cookie returned by zap is always zero.
1554 * This allows us to use the low range for "special" directory entries:
1555 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
1556 * we use the offset 2 for the '.zfs' directory.
1557 */
1558 int
zfs_readdir(struct inode * ip,struct dir_context * ctx,cred_t * cr)1559 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
1560 {
1561 (void) cr;
1562 znode_t *zp = ITOZ(ip);
1563 zfsvfs_t *zfsvfs = ITOZSB(ip);
1564 objset_t *os;
1565 zap_cursor_t zc;
1566 zap_attribute_t *zap;
1567 int error;
1568 uint8_t prefetch;
1569 uint8_t type;
1570 int done = 0;
1571 uint64_t parent;
1572 uint64_t offset; /* must be unsigned; checks for < 1 */
1573
1574 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1575 return (error);
1576
1577 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1578 &parent, sizeof (parent))) != 0)
1579 goto out;
1580
1581 /*
1582 * Quit if directory has been removed (posix)
1583 */
1584 if (zp->z_unlinked)
1585 goto out;
1586
1587 error = 0;
1588 os = zfsvfs->z_os;
1589 offset = ctx->pos;
1590 prefetch = zp->z_zn_prefetch;
1591 zap = zap_attribute_long_alloc();
1592
1593 /*
1594 * Initialize the iterator cursor.
1595 */
1596 if (offset <= 3) {
1597 /*
1598 * Start iteration from the beginning of the directory.
1599 */
1600 zap_cursor_init(&zc, os, zp->z_id);
1601 } else {
1602 /*
1603 * The offset is a serialized cursor.
1604 */
1605 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1606 }
1607
1608 /*
1609 * Transform to file-system independent format
1610 */
1611 while (!done) {
1612 uint64_t objnum;
1613 /*
1614 * Special case `.', `..', and `.zfs'.
1615 */
1616 if (offset == 0) {
1617 (void) strcpy(zap->za_name, ".");
1618 zap->za_normalization_conflict = 0;
1619 objnum = zp->z_id;
1620 type = DT_DIR;
1621 } else if (offset == 1) {
1622 (void) strcpy(zap->za_name, "..");
1623 zap->za_normalization_conflict = 0;
1624 objnum = parent;
1625 type = DT_DIR;
1626 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1627 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1628 zap->za_normalization_conflict = 0;
1629 objnum = ZFSCTL_INO_ROOT;
1630 type = DT_DIR;
1631 } else {
1632 /*
1633 * Grab next entry.
1634 */
1635 if ((error = zap_cursor_retrieve(&zc, zap))) {
1636 if (error == ENOENT)
1637 break;
1638 else
1639 goto update;
1640 }
1641
1642 /*
1643 * Allow multiple entries provided the first entry is
1644 * the object id. Non-zpl consumers may safely make
1645 * use of the additional space.
1646 *
1647 * XXX: This should be a feature flag for compatibility
1648 */
1649 if (zap->za_integer_length != 8 ||
1650 zap->za_num_integers == 0) {
1651 cmn_err(CE_WARN, "zap_readdir: bad directory "
1652 "entry, obj = %lld, offset = %lld, "
1653 "length = %d, num = %lld\n",
1654 (u_longlong_t)zp->z_id,
1655 (u_longlong_t)offset,
1656 zap->za_integer_length,
1657 (u_longlong_t)zap->za_num_integers);
1658 error = SET_ERROR(ENXIO);
1659 goto update;
1660 }
1661
1662 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1663 type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1664 }
1665
1666 done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
1667 objnum, type);
1668 if (done)
1669 break;
1670
1671 if (prefetch)
1672 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1673
1674 /*
1675 * Move to the next entry, fill in the previous offset.
1676 */
1677 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1678 zap_cursor_advance(&zc);
1679 offset = zap_cursor_serialize(&zc);
1680 } else {
1681 offset += 1;
1682 }
1683 ctx->pos = offset;
1684 }
1685 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1686
1687 update:
1688 zap_cursor_fini(&zc);
1689 zap_attribute_free(zap);
1690 if (error == ENOENT)
1691 error = 0;
1692 out:
1693 zfs_exit(zfsvfs, FTAG);
1694
1695 return (error);
1696 }
1697
1698 /*
1699 * Get the basic file attributes and place them in the provided kstat
1700 * structure. The inode is assumed to be the authoritative source
1701 * for most of the attributes. However, the znode currently has the
1702 * authoritative atime, blksize, and block count.
1703 *
1704 * IN: ip - inode of file.
1705 *
1706 * OUT: sp - kstat values.
1707 *
1708 * RETURN: 0 (always succeeds)
1709 */
1710 int
1711 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
zfs_getattr_fast(zidmap_t * user_ns,u32 request_mask,struct inode * ip,struct kstat * sp)1712 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
1713 struct kstat *sp)
1714 #else
1715 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1716 #endif
1717 {
1718 znode_t *zp = ITOZ(ip);
1719 zfsvfs_t *zfsvfs = ITOZSB(ip);
1720 uint32_t blksize;
1721 u_longlong_t nblocks;
1722 int error;
1723
1724 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1725 return (error);
1726
1727 mutex_enter(&zp->z_lock);
1728
1729 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1730 zpl_generic_fillattr(user_ns, request_mask, ip, sp);
1731 #else
1732 zpl_generic_fillattr(user_ns, ip, sp);
1733 #endif
1734 /*
1735 * +1 link count for root inode with visible '.zfs' directory.
1736 */
1737 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1738 if (sp->nlink < ZFS_LINK_MAX)
1739 sp->nlink++;
1740
1741 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1742 sp->blksize = blksize;
1743 sp->blocks = nblocks;
1744
1745 if (unlikely(zp->z_blksz == 0)) {
1746 /*
1747 * Block size hasn't been set; suggest maximal I/O transfers.
1748 */
1749 sp->blksize = zfsvfs->z_max_blksz;
1750 }
1751
1752 mutex_exit(&zp->z_lock);
1753
1754 /*
1755 * Required to prevent NFS client from detecting different inode
1756 * numbers of snapshot root dentry before and after snapshot mount.
1757 */
1758 if (zfsvfs->z_issnap) {
1759 if (ip->i_sb->s_root->d_inode == ip)
1760 sp->ino = ZFSCTL_INO_SNAPDIRS -
1761 dmu_objset_id(zfsvfs->z_os);
1762 }
1763
1764 zfs_exit(zfsvfs, FTAG);
1765
1766 return (0);
1767 }
1768
1769 /*
1770 * For the operation of changing file's user/group/project, we need to
1771 * handle not only the main object that is assigned to the file directly,
1772 * but also the ones that are used by the file via hidden xattr directory.
1773 *
1774 * Because the xattr directory may contains many EA entries, as to it may
1775 * be impossible to change all of them via the transaction of changing the
1776 * main object's user/group/project attributes. Then we have to change them
1777 * via other multiple independent transactions one by one. It may be not good
1778 * solution, but we have no better idea yet.
1779 */
1780 static int
zfs_setattr_dir(znode_t * dzp)1781 zfs_setattr_dir(znode_t *dzp)
1782 {
1783 struct inode *dxip = ZTOI(dzp);
1784 struct inode *xip = NULL;
1785 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1786 objset_t *os = zfsvfs->z_os;
1787 zap_cursor_t zc;
1788 zap_attribute_t *zap;
1789 zfs_dirlock_t *dl;
1790 znode_t *zp = NULL;
1791 dmu_tx_t *tx = NULL;
1792 uint64_t uid, gid;
1793 sa_bulk_attr_t bulk[4];
1794 int count;
1795 int err;
1796
1797 zap = zap_attribute_alloc();
1798 zap_cursor_init(&zc, os, dzp->z_id);
1799 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
1800 count = 0;
1801 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
1802 err = ENXIO;
1803 break;
1804 }
1805
1806 err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
1807 ZEXISTS, NULL, NULL);
1808 if (err == ENOENT)
1809 goto next;
1810 if (err)
1811 break;
1812
1813 xip = ZTOI(zp);
1814 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1815 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1816 zp->z_projid == dzp->z_projid)
1817 goto next;
1818
1819 tx = dmu_tx_create(os);
1820 if (!(zp->z_pflags & ZFS_PROJID))
1821 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1822 else
1823 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1824
1825 err = dmu_tx_assign(tx, DMU_TX_WAIT);
1826 if (err)
1827 break;
1828
1829 mutex_enter(&dzp->z_lock);
1830
1831 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1832 xip->i_uid = dxip->i_uid;
1833 uid = zfs_uid_read(dxip);
1834 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1835 &uid, sizeof (uid));
1836 }
1837
1838 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1839 xip->i_gid = dxip->i_gid;
1840 gid = zfs_gid_read(dxip);
1841 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1842 &gid, sizeof (gid));
1843 }
1844
1845
1846 uint64_t projid = dzp->z_projid;
1847 if (zp->z_projid != projid) {
1848 if (!(zp->z_pflags & ZFS_PROJID)) {
1849 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
1850 if (unlikely(err == EEXIST)) {
1851 err = 0;
1852 } else if (err != 0) {
1853 goto sa_add_projid_err;
1854 } else {
1855 projid = ZFS_INVALID_PROJID;
1856 }
1857 }
1858
1859 if (projid != ZFS_INVALID_PROJID) {
1860 zp->z_projid = projid;
1861 SA_ADD_BULK_ATTR(bulk, count,
1862 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
1863 sizeof (zp->z_projid));
1864 }
1865 }
1866
1867 sa_add_projid_err:
1868 mutex_exit(&dzp->z_lock);
1869
1870 if (likely(count > 0)) {
1871 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1872 dmu_tx_commit(tx);
1873 } else if (projid == ZFS_INVALID_PROJID) {
1874 dmu_tx_commit(tx);
1875 } else {
1876 dmu_tx_abort(tx);
1877 }
1878 tx = NULL;
1879 if (err != 0 && err != ENOENT)
1880 break;
1881
1882 next:
1883 if (zp) {
1884 zrele(zp);
1885 zp = NULL;
1886 zfs_dirent_unlock(dl);
1887 }
1888 zap_cursor_advance(&zc);
1889 }
1890
1891 if (tx)
1892 dmu_tx_abort(tx);
1893 if (zp) {
1894 zrele(zp);
1895 zfs_dirent_unlock(dl);
1896 }
1897 zap_cursor_fini(&zc);
1898 zap_attribute_free(zap);
1899
1900 return (err == ENOENT ? 0 : err);
1901 }
1902
1903 /*
1904 * Set the file attributes to the values contained in the
1905 * vattr structure.
1906 *
1907 * IN: zp - znode of file to be modified.
1908 * vap - new attribute values.
1909 * If ATTR_XVATTR set, then optional attrs are being set
1910 * flags - ATTR_UTIME set if non-default time values provided.
1911 * - ATTR_NOACLCHECK (CIFS context only).
1912 * cr - credentials of caller.
1913 * mnt_ns - user namespace of the mount
1914 *
1915 * RETURN: 0 if success
1916 * error code if failure
1917 *
1918 * Timestamps:
1919 * ip - ctime updated, mtime updated if size changed.
1920 */
1921 int
zfs_setattr(znode_t * zp,vattr_t * vap,int flags,cred_t * cr,zidmap_t * mnt_ns)1922 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1923 {
1924 struct inode *ip;
1925 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1926 objset_t *os;
1927 zilog_t *zilog;
1928 dmu_tx_t *tx;
1929 vattr_t oldva;
1930 xvattr_t *tmpxvattr;
1931 uint_t mask = vap->va_mask;
1932 uint_t saved_mask = 0;
1933 int trim_mask = 0;
1934 uint64_t new_mode;
1935 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1936 uint64_t xattr_obj;
1937 uint64_t mtime[2], ctime[2], atime[2];
1938 uint64_t projid = ZFS_INVALID_PROJID;
1939 znode_t *attrzp;
1940 int need_policy = FALSE;
1941 int err, err2 = 0;
1942 zfs_fuid_info_t *fuidp = NULL;
1943 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
1944 xoptattr_t *xoap;
1945 zfs_acl_t *aclp;
1946 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1947 boolean_t fuid_dirtied = B_FALSE;
1948 boolean_t handle_eadir = B_FALSE;
1949 sa_bulk_attr_t *bulk, *xattr_bulk;
1950 int count = 0, xattr_count = 0, bulks = 8;
1951
1952 if (mask == 0)
1953 return (0);
1954
1955 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1956 return (err);
1957 ip = ZTOI(zp);
1958 os = zfsvfs->z_os;
1959
1960 /*
1961 * If this is a xvattr_t, then get a pointer to the structure of
1962 * optional attributes. If this is NULL, then we have a vattr_t.
1963 */
1964 xoap = xva_getxoptattr(xvap);
1965 if (xoap != NULL && (mask & ATTR_XVATTR)) {
1966 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1967 if (!dmu_objset_projectquota_enabled(os) ||
1968 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1969 zfs_exit(zfsvfs, FTAG);
1970 return (SET_ERROR(ENOTSUP));
1971 }
1972
1973 projid = xoap->xoa_projid;
1974 if (unlikely(projid == ZFS_INVALID_PROJID)) {
1975 zfs_exit(zfsvfs, FTAG);
1976 return (SET_ERROR(EINVAL));
1977 }
1978
1979 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1980 projid = ZFS_INVALID_PROJID;
1981 else
1982 need_policy = TRUE;
1983 }
1984
1985 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1986 (xoap->xoa_projinherit !=
1987 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1988 (!dmu_objset_projectquota_enabled(os) ||
1989 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1990 zfs_exit(zfsvfs, FTAG);
1991 return (SET_ERROR(ENOTSUP));
1992 }
1993 }
1994
1995 zilog = zfsvfs->z_log;
1996
1997 /*
1998 * Make sure that if we have ephemeral uid/gid or xvattr specified
1999 * that file system is at proper version level
2000 */
2001
2002 if (zfsvfs->z_use_fuids == B_FALSE &&
2003 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2004 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2005 (mask & ATTR_XVATTR))) {
2006 zfs_exit(zfsvfs, FTAG);
2007 return (SET_ERROR(EINVAL));
2008 }
2009
2010 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2011 zfs_exit(zfsvfs, FTAG);
2012 return (SET_ERROR(EISDIR));
2013 }
2014
2015 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2016 zfs_exit(zfsvfs, FTAG);
2017 return (SET_ERROR(EINVAL));
2018 }
2019
2020 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2021 xva_init(tmpxvattr);
2022
2023 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2024 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2025
2026 /*
2027 * Immutable files can only alter immutable bit and atime
2028 */
2029 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2030 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2031 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2032 err = SET_ERROR(EPERM);
2033 goto out3;
2034 }
2035
2036 /* ZFS_READONLY will be handled in zfs_zaccess() */
2037
	/*
	 * Verify that the timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32-bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
2044 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2045 if (((mask & ATTR_ATIME) &&
2046 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2047 ((mask & ATTR_MTIME) &&
2048 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2049 err = SET_ERROR(EOVERFLOW);
2050 goto out3;
2051 }
2052 }
2053
2054 top:
2055 attrzp = NULL;
2056 aclp = NULL;
2057
2058 /* Can this be moved to before the top label? */
2059 if (zfs_is_readonly(zfsvfs)) {
2060 err = SET_ERROR(EROFS);
2061 goto out3;
2062 }
2063
2064 /*
2065 * First validate permissions
2066 */
2067
2068 if (mask & ATTR_SIZE) {
2069 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2070 mnt_ns);
2071 if (err)
2072 goto out3;
2073
2074 /*
2075 * XXX - Note, we are not providing any open
2076 * mode flags here (like FNDELAY), so we may
2077 * block if there are locks present... this
2078 * should be addressed in openat().
2079 */
2080 /* XXX - would it be OK to generate a log record here? */
2081 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2082 if (err)
2083 goto out3;
2084 }
2085
2086 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2087 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2088 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2089 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2090 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2091 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2092 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2093 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2094 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2095 skipaclchk, cr, mnt_ns);
2096 }
2097
2098 if (mask & (ATTR_UID|ATTR_GID)) {
2099 int idmask = (mask & (ATTR_UID|ATTR_GID));
2100 int take_owner;
2101 int take_group;
2102 uid_t uid;
2103 gid_t gid;
2104
2105 /*
2106 * NOTE: even if a new mode is being set,
2107 * we may clear S_ISUID/S_ISGID bits.
2108 */
2109
2110 if (!(mask & ATTR_MODE))
2111 vap->va_mode = zp->z_mode;
2112
2113 /*
2114 * Take ownership or chgrp to group we are a member of
2115 */
2116
2117 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2118 vap->va_uid);
2119 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2120 vap->va_gid);
2121 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2122 take_group = (mask & ATTR_GID) &&
2123 zfs_groupmember(zfsvfs, gid, cr);
2124
2125 /*
2126 * If both ATTR_UID and ATTR_GID are set then take_owner and
2127 * take_group must both be set in order to allow taking
2128 * ownership.
2129 *
2130 * Otherwise, send the check through secpolicy_vnode_setattr()
2131 *
2132 */
2133
2134 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2135 take_owner && take_group) ||
2136 ((idmask == ATTR_UID) && take_owner) ||
2137 ((idmask == ATTR_GID) && take_group)) {
2138 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2139 skipaclchk, cr, mnt_ns) == 0) {
2140 /*
2141 * Remove setuid/setgid for non-privileged users
2142 */
2143 (void) secpolicy_setid_clear(vap, cr);
2144 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2145 } else {
2146 need_policy = TRUE;
2147 }
2148 } else {
2149 need_policy = TRUE;
2150 }
2151 }
2152
2153 mutex_enter(&zp->z_lock);
2154 oldva.va_mode = zp->z_mode;
2155 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2156 if (mask & ATTR_XVATTR) {
2157 /*
2158 * Update xvattr mask to include only those attributes
2159 * that are actually changing.
2160 *
2161 * the bits will be restored prior to actually setting
2162 * the attributes so the caller thinks they were set.
2163 */
2164 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2165 if (xoap->xoa_appendonly !=
2166 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2167 need_policy = TRUE;
2168 } else {
2169 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2170 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2171 }
2172 }
2173
2174 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2175 if (xoap->xoa_projinherit !=
2176 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2177 need_policy = TRUE;
2178 } else {
2179 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2180 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2181 }
2182 }
2183
2184 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2185 if (xoap->xoa_nounlink !=
2186 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2187 need_policy = TRUE;
2188 } else {
2189 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2190 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2191 }
2192 }
2193
2194 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2195 if (xoap->xoa_immutable !=
2196 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2197 need_policy = TRUE;
2198 } else {
2199 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2200 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2201 }
2202 }
2203
2204 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2205 if (xoap->xoa_nodump !=
2206 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2207 need_policy = TRUE;
2208 } else {
2209 XVA_CLR_REQ(xvap, XAT_NODUMP);
2210 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2211 }
2212 }
2213
2214 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2215 if (xoap->xoa_av_modified !=
2216 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2217 need_policy = TRUE;
2218 } else {
2219 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2220 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2221 }
2222 }
2223
2224 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2225 if ((!S_ISREG(ip->i_mode) &&
2226 xoap->xoa_av_quarantined) ||
2227 xoap->xoa_av_quarantined !=
2228 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2229 need_policy = TRUE;
2230 } else {
2231 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2232 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2233 }
2234 }
2235
2236 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2237 mutex_exit(&zp->z_lock);
2238 err = SET_ERROR(EPERM);
2239 goto out3;
2240 }
2241
2242 if (need_policy == FALSE &&
2243 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2244 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2245 need_policy = TRUE;
2246 }
2247 }
2248
2249 mutex_exit(&zp->z_lock);
2250
2251 if (mask & ATTR_MODE) {
2252 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2253 mnt_ns) == 0) {
2254 err = secpolicy_setid_setsticky_clear(ip, vap,
2255 &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2256 if (err)
2257 goto out3;
2258 trim_mask |= ATTR_MODE;
2259 } else {
2260 need_policy = TRUE;
2261 }
2262 }
2263
2264 if (need_policy) {
2265 /*
2266 * If trim_mask is set then take ownership
2267 * has been granted or write_acl is present and user
2268 * has the ability to modify mode. In that case remove
2269 * UID|GID and or MODE from mask so that
2270 * secpolicy_vnode_setattr() doesn't revoke it.
2271 */
2272
2273 if (trim_mask) {
2274 saved_mask = vap->va_mask;
2275 vap->va_mask &= ~trim_mask;
2276 }
2277 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2278 zfs_zaccess_unix, zp);
2279 if (err)
2280 goto out3;
2281
2282 if (trim_mask)
2283 vap->va_mask |= saved_mask;
2284 }
2285
2286 /*
2287 * secpolicy_vnode_setattr, or take ownership may have
2288 * changed va_mask
2289 */
2290 mask = vap->va_mask;
2291
2292 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2293 handle_eadir = B_TRUE;
2294 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2295 &xattr_obj, sizeof (xattr_obj));
2296
2297 if (err == 0 && xattr_obj) {
2298 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2299 if (err)
2300 goto out2;
2301 }
2302 if (mask & ATTR_UID) {
2303 new_kuid = zfs_fuid_create(zfsvfs,
2304 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2305 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2306 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2307 new_kuid)) {
2308 if (attrzp)
2309 zrele(attrzp);
2310 err = SET_ERROR(EDQUOT);
2311 goto out2;
2312 }
2313 }
2314
2315 if (mask & ATTR_GID) {
2316 new_kgid = zfs_fuid_create(zfsvfs,
2317 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2318 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2319 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2320 new_kgid)) {
2321 if (attrzp)
2322 zrele(attrzp);
2323 err = SET_ERROR(EDQUOT);
2324 goto out2;
2325 }
2326 }
2327
2328 if (projid != ZFS_INVALID_PROJID &&
2329 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2330 if (attrzp)
2331 zrele(attrzp);
2332 err = EDQUOT;
2333 goto out2;
2334 }
2335 }
2336 tx = dmu_tx_create(os);
2337
2338 if (mask & ATTR_MODE) {
2339 uint64_t pmode = zp->z_mode;
2340 uint64_t acl_obj;
2341 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2342
2343 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2344 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2345 err = EPERM;
2346 goto out;
2347 }
2348
2349 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2350 goto out;
2351
2352 mutex_enter(&zp->z_lock);
2353 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2354 /*
2355 * Are we upgrading ACL from old V0 format
2356 * to V1 format?
2357 */
2358 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2359 zfs_znode_acl_version(zp) ==
2360 ZFS_ACL_VERSION_INITIAL) {
2361 dmu_tx_hold_free(tx, acl_obj, 0,
2362 DMU_OBJECT_END);
2363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2364 0, aclp->z_acl_bytes);
2365 } else {
2366 dmu_tx_hold_write(tx, acl_obj, 0,
2367 aclp->z_acl_bytes);
2368 }
2369 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2371 0, aclp->z_acl_bytes);
2372 }
2373 mutex_exit(&zp->z_lock);
2374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2375 } else {
2376 if (((mask & ATTR_XVATTR) &&
2377 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2378 (projid != ZFS_INVALID_PROJID &&
2379 !(zp->z_pflags & ZFS_PROJID)))
2380 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2381 else
2382 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2383 }
2384
2385 if (attrzp) {
2386 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2387 }
2388
2389 fuid_dirtied = zfsvfs->z_fuid_dirty;
2390 if (fuid_dirtied)
2391 zfs_fuid_txhold(zfsvfs, tx);
2392
2393 zfs_sa_upgrade_txholds(tx, zp);
2394
2395 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2396 if (err)
2397 goto out;
2398
2399 count = 0;
2400 /*
2401 * Set each attribute requested.
2402 * We group settings according to the locks they need to acquire.
2403 *
2404 * Note: you cannot set ctime directly, although it will be
2405 * updated as a side-effect of calling this function.
2406 */
2407
2408 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2409 /*
2410 * For the existed object that is upgraded from old system,
2411 * its on-disk layout has no slot for the project ID attribute.
2412 * But quota accounting logic needs to access related slots by
2413 * offset directly. So we need to adjust old objects' layout
2414 * to make the project ID to some unified and fixed offset.
2415 */
2416 if (attrzp)
2417 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2418 if (err == 0)
2419 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2420
2421 if (unlikely(err == EEXIST))
2422 err = 0;
2423 else if (err != 0)
2424 goto out;
2425 else
2426 projid = ZFS_INVALID_PROJID;
2427 }
2428
2429 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2430 mutex_enter(&zp->z_acl_lock);
2431 mutex_enter(&zp->z_lock);
2432
2433 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2434 &zp->z_pflags, sizeof (zp->z_pflags));
2435
2436 if (attrzp) {
2437 /*
2438 * attrzp is zp's hidden xattr directory, so the second
2439 * znode lock acquisition is nested rather than recursive.
2440 */
2441 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2442 mutex_enter_nested(&attrzp->z_acl_lock, NESTED_SINGLE);
2443 mutex_enter_nested(&attrzp->z_lock, NESTED_SINGLE);
2444 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2445 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2446 sizeof (attrzp->z_pflags));
2447 if (projid != ZFS_INVALID_PROJID) {
2448 attrzp->z_projid = projid;
2449 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2450 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2451 sizeof (attrzp->z_projid));
2452 }
2453 }
2454
2455 if (mask & (ATTR_UID|ATTR_GID)) {
2456
2457 if (mask & ATTR_UID) {
2458 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2459 new_uid = zfs_uid_read(ZTOI(zp));
2460 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2461 &new_uid, sizeof (new_uid));
2462 if (attrzp) {
2463 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2464 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2465 sizeof (new_uid));
2466 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2467 }
2468 }
2469
2470 if (mask & ATTR_GID) {
2471 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2472 new_gid = zfs_gid_read(ZTOI(zp));
2473 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2474 NULL, &new_gid, sizeof (new_gid));
2475 if (attrzp) {
2476 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2477 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2478 sizeof (new_gid));
2479 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2480 }
2481 }
2482 if (!(mask & ATTR_MODE)) {
2483 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2484 NULL, &new_mode, sizeof (new_mode));
2485 new_mode = zp->z_mode;
2486 }
2487 err = zfs_acl_chown_setattr(zp);
2488 ASSERT0(err);
2489 if (attrzp) {
2490 err = zfs_acl_chown_setattr(attrzp);
2491 ASSERT0(err);
2492 }
2493 }
2494
2495 if (mask & ATTR_MODE) {
2496 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2497 &new_mode, sizeof (new_mode));
2498 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2499 ASSERT3P(aclp, !=, NULL);
2500 err = zfs_aclset_common(zp, aclp, cr, tx);
2501 ASSERT0(err);
2502 if (zp->z_acl_cached)
2503 zfs_acl_free(zp->z_acl_cached);
2504 zp->z_acl_cached = aclp;
2505 aclp = NULL;
2506 }
2507
2508 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2509 zp->z_atime_dirty = B_FALSE;
2510 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
2511 ZFS_TIME_ENCODE(&tmp_atime, atime);
2512 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2513 &atime, sizeof (atime));
2514 }
2515
2516 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2517 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2518 zpl_inode_set_mtime_to_ts(ZTOI(zp),
2519 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
2520
2521 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2522 mtime, sizeof (mtime));
2523 }
2524
2525 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2526 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2527 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2528 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2529 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2530 ctime, sizeof (ctime));
2531 }
2532
2533 if (projid != ZFS_INVALID_PROJID) {
2534 zp->z_projid = projid;
2535 SA_ADD_BULK_ATTR(bulk, count,
2536 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2537 sizeof (zp->z_projid));
2538 }
2539
2540 if (attrzp && mask) {
2541 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2542 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2543 sizeof (ctime));
2544 }
2545
2546 /*
2547 * Do this after setting timestamps to prevent timestamp
2548 * update from toggling bit
2549 */
2550
2551 if (xoap && (mask & ATTR_XVATTR)) {
2552
2553 /*
2554 * restore trimmed off masks
2555 * so that return masks can be set for caller.
2556 */
2557
2558 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2559 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2560 }
2561 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2562 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2563 }
2564 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2565 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2566 }
2567 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2568 XVA_SET_REQ(xvap, XAT_NODUMP);
2569 }
2570 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2571 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2572 }
2573 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2574 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2575 }
2576 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2577 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2578 }
2579
2580 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2581 ASSERT(S_ISREG(ip->i_mode));
2582
2583 zfs_xvattr_set(zp, xvap, tx);
2584 }
2585
2586 if (fuid_dirtied)
2587 zfs_fuid_sync(zfsvfs, tx);
2588
2589 if (mask != 0) {
2590 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2591 /*
2592 * Ensure that the z_seq is always incremented on setattr
2593 * operation. This is required for change accounting for
2594 * NFS clients.
2595 *
2596 * ATTR_MODE already increments via zfs_acl_chmod_setattr.
2597 * ATTR_SIZE already increments via zfs_freesp.
2598 */
2599 if (!(mask & (ATTR_MODE | ATTR_SIZE)))
2600 zp->z_seq++;
2601 }
2602
2603 mutex_exit(&zp->z_lock);
2604 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2605 mutex_exit(&zp->z_acl_lock);
2606
2607 if (attrzp) {
2608 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2609 mutex_exit(&attrzp->z_acl_lock);
2610 mutex_exit(&attrzp->z_lock);
2611 }
2612 out:
2613 if (err == 0 && xattr_count > 0) {
2614 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2615 xattr_count, tx);
2616 ASSERT0(err2);
2617 }
2618
2619 if (aclp)
2620 zfs_acl_free(aclp);
2621
2622 if (fuidp) {
2623 zfs_fuid_info_free(fuidp);
2624 fuidp = NULL;
2625 }
2626
2627 if (err) {
2628 dmu_tx_abort(tx);
2629 if (attrzp)
2630 zrele(attrzp);
2631 if (err == ERESTART)
2632 goto top;
2633 } else {
2634 if (count > 0)
2635 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2636 dmu_tx_commit(tx);
2637 if (attrzp) {
2638 if (err2 == 0 && handle_eadir)
2639 err = zfs_setattr_dir(attrzp);
2640 zrele(attrzp);
2641 }
2642 zfs_znode_update_vfs(zp);
2643 }
2644
2645 out2:
2646 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
2647 err = zil_commit(zilog, 0);
2648
2649 out3:
2650 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2651 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2652 kmem_free(tmpxvattr, sizeof (xvattr_t));
2653 zfs_exit(zfsvfs, FTAG);
2654 return (err);
2655 }
2656
/*
 * One entry in the singly linked list of parent locks built by
 * zfs_rename_lock() and torn down by zfs_rename_unlock().  New entries are
 * pushed on the front of the list.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held (NULL if none) */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2662
2663 /*
2664 * Drop locks and release vnodes that were held by zfs_rename_lock().
2665 */
2666 static void
zfs_rename_unlock(zfs_zlock_t ** zlpp)2667 zfs_rename_unlock(zfs_zlock_t **zlpp)
2668 {
2669 zfs_zlock_t *zl;
2670
2671 while ((zl = *zlpp) != NULL) {
2672 if (zl->zl_znode != NULL)
2673 zfs_zrele_async(zl->zl_znode);
2674 rw_exit(zl->zl_rwlock);
2675 *zlpp = zl->zl_next;
2676 kmem_free(zl, sizeof (*zl));
2677 }
2678 }
2679
/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed; its z_parent_lock is taken as
 *			  writer on the first pass.
 *		tdzp	- target directory; the upward walk starts here.
 *		sdzp	- source directory; the walk ends once the znode just
 *			  examined has the same object id as sdzp.
 *
 *	IN/OUT:	zlpp	- head of the list of acquired locks; every lock taken
 *			  is pushed on the front of this list.
 *
 *	RETURN:	0 if the walk reached sdzp or the filesystem root,
 *		EINVAL if szp was found among tdzp's ancestors (or is tdzp),
 *		otherwise the error returned by zfs_zget().
 *
 * Locks already pushed onto *zlpp are left in place on every return,
 * including the error returns, so the list must still be released with
 * zfs_rename_unlock() afterwards.
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;
	znode_t *zp = tdzp;
	uint64_t rootid = ZTOZSB(zp)->z_root;
	uint64_t oidp = zp->z_id;
	krwlock_t *rwlp = &szp->z_parent_lock;
	krw_t rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * zl is the entry pushed by the previous
				 * iteration, i.e. the current list head, so
				 * this releases the whole list.  The state is
				 * then reset to the first-pass values.  Note
				 * that "continue" in a do/while jumps to the
				 * loop condition, which is evaluated with
				 * zp == tdzp.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the front of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			/*
			 * Take a hold on the znode whose id is oidp and
			 * remember it in the list entry so that
			 * zfs_rename_unlock() releases it.
			 */
			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		/*
		 * Read zp's parent object id for the next pass; the return
		 * value of sa_lookup() is deliberately discarded.
		 */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
2753
2754 /*
2755 * Move an entry from the provided source directory to the target
2756 * directory. Change the entry name as indicated.
2757 *
2758 * IN: sdzp - Source directory containing the "old entry".
2759 * snm - Old entry name.
2760 * tdzp - Target directory to contain the "new entry".
2761 * tnm - New entry name.
2762 * cr - credentials of caller.
2763 * flags - case flags
2764 * rflags - RENAME_* flags
2765 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0).
2766 * mnt_ns - user namespace of the mount
2767 *
2768 * RETURN: 0 on success, error code on failure.
2769 *
2770 * Timestamps:
2771 * sdzp,tdzp - ctime|mtime updated
2772 */
2773 int
zfs_rename(znode_t * sdzp,char * snm,znode_t * tdzp,char * tnm,cred_t * cr,int flags,uint64_t rflags,vattr_t * wo_vap,zidmap_t * mnt_ns)2774 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2775 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2776 {
2777 znode_t *szp, *tzp;
2778 zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
2779 zilog_t *zilog;
2780 zfs_dirlock_t *sdl, *tdl;
2781 dmu_tx_t *tx;
2782 zfs_zlock_t *zl;
2783 int cmp, serr, terr;
2784 int error = 0;
2785 int zflg = 0;
2786 boolean_t waited = B_FALSE;
2787 /* Needed for whiteout inode creation. */
2788 boolean_t fuid_dirtied;
2789 zfs_acl_ids_t acl_ids;
2790 boolean_t have_acl = B_FALSE;
2791 znode_t *wzp = NULL;
2792
2793
2794 if (snm == NULL || tnm == NULL)
2795 return (SET_ERROR(EINVAL));
2796
2797 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2798 return (SET_ERROR(EINVAL));
2799
2800 /* Already checked by Linux VFS, but just to make sure. */
2801 if (rflags & RENAME_EXCHANGE &&
2802 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2803 return (SET_ERROR(EINVAL));
2804
2805 /*
2806 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2807 * right kind of vattr_t for the whiteout file. These are set
2808 * internally by ZFS so should never be incorrect.
2809 */
2810 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2811 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2812 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2813
2814 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2815 return (error);
2816 zilog = zfsvfs->z_log;
2817
2818 if ((error = zfs_verify_zp(tdzp)) != 0) {
2819 zfs_exit(zfsvfs, FTAG);
2820 return (error);
2821 }
2822
2823 /*
2824 * We check i_sb because snapshots and the ctldir must have different
2825 * super blocks.
2826 */
2827 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2828 zfsctl_is_node(ZTOI(tdzp))) {
2829 zfs_exit(zfsvfs, FTAG);
2830 return (SET_ERROR(EXDEV));
2831 }
2832
2833 if (zfsvfs->z_utf8 && u8_validate(tnm,
2834 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2835 zfs_exit(zfsvfs, FTAG);
2836 return (SET_ERROR(EILSEQ));
2837 }
2838
2839 if (flags & FIGNORECASE)
2840 zflg |= ZCILOOK;
2841
2842 top:
2843 szp = NULL;
2844 tzp = NULL;
2845 zl = NULL;
2846
2847 /*
2848 * This is to prevent the creation of links into attribute space
2849 * by renaming a linked file into/outof an attribute directory.
2850 * See the comment in zfs_link() for why this is considered bad.
2851 */
2852 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2853 zfs_exit(zfsvfs, FTAG);
2854 return (SET_ERROR(EINVAL));
2855 }
2856
2857 /*
2858 * Lock source and target directory entries. To prevent deadlock,
2859 * a lock ordering must be defined. We lock the directory with
2860 * the smallest object id first, or if it's a tie, the one with
2861 * the lexically first name.
2862 */
2863 if (sdzp->z_id < tdzp->z_id) {
2864 cmp = -1;
2865 } else if (sdzp->z_id > tdzp->z_id) {
2866 cmp = 1;
2867 } else {
2868 /*
2869 * First compare the two name arguments without
2870 * considering any case folding.
2871 */
2872 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2873
2874 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2875 ASSERT(error == 0 || !zfsvfs->z_utf8);
2876 if (cmp == 0) {
2877 /*
2878 * POSIX: "If the old argument and the new argument
2879 * both refer to links to the same existing file,
2880 * the rename() function shall return successfully
2881 * and perform no other action."
2882 */
2883 zfs_exit(zfsvfs, FTAG);
2884 return (0);
2885 }
2886 /*
2887 * If the file system is case-folding, then we may
2888 * have some more checking to do. A case-folding file
2889 * system is either supporting mixed case sensitivity
2890 * access or is completely case-insensitive. Note
2891 * that the file system is always case preserving.
2892 *
2893 * In mixed sensitivity mode case sensitive behavior
2894 * is the default. FIGNORECASE must be used to
2895 * explicitly request case insensitive behavior.
2896 *
2897 * If the source and target names provided differ only
2898 * by case (e.g., a request to rename 'tim' to 'Tim'),
2899 * we will treat this as a special case in the
2900 * case-insensitive mode: as long as the source name
2901 * is an exact match, we will allow this to proceed as
2902 * a name-change request.
2903 */
2904 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2905 (zfsvfs->z_case == ZFS_CASE_MIXED &&
2906 flags & FIGNORECASE)) &&
2907 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2908 &error) == 0) {
2909 /*
2910 * case preserving rename request, require exact
2911 * name matches
2912 */
2913 zflg |= ZCIEXACT;
2914 zflg &= ~ZCILOOK;
2915 }
2916 }
2917
2918 /*
2919 * If the source and destination directories are the same, we should
2920 * grab the z_name_lock of that directory only once.
2921 */
2922 if (sdzp == tdzp) {
2923 zflg |= ZHAVELOCK;
2924 rw_enter(&sdzp->z_name_lock, RW_READER);
2925 }
2926
2927 if (cmp < 0) {
2928 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2929 ZEXISTS | zflg, NULL, NULL);
2930 terr = zfs_dirent_lock(&tdl,
2931 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2932 } else {
2933 terr = zfs_dirent_lock(&tdl,
2934 tdzp, tnm, &tzp, zflg, NULL, NULL);
2935 serr = zfs_dirent_lock(&sdl,
2936 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2937 NULL, NULL);
2938 }
2939
2940 if (serr) {
2941 /*
2942 * Source entry invalid or not there.
2943 */
2944 if (!terr) {
2945 zfs_dirent_unlock(tdl);
2946 if (tzp)
2947 zrele(tzp);
2948 }
2949
2950 if (sdzp == tdzp)
2951 rw_exit(&sdzp->z_name_lock);
2952
2953 if (strcmp(snm, "..") == 0)
2954 serr = EINVAL;
2955 zfs_exit(zfsvfs, FTAG);
2956 return (serr);
2957 }
2958 if (terr) {
2959 zfs_dirent_unlock(sdl);
2960 zrele(szp);
2961
2962 if (sdzp == tdzp)
2963 rw_exit(&sdzp->z_name_lock);
2964
2965 if (strcmp(tnm, "..") == 0)
2966 terr = EINVAL;
2967 zfs_exit(zfsvfs, FTAG);
2968 return (terr);
2969 }
2970
2971 /*
2972 * If we are using project inheritance, means if the directory has
2973 * ZFS_PROJINHERIT set, then its descendant directories will inherit
2974 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2975 * such case, we only allow renames into our tree when the project
2976 * IDs are the same.
2977 */
2978 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2979 tdzp->z_projid != szp->z_projid) {
2980 error = SET_ERROR(EXDEV);
2981 goto out;
2982 }
2983
2984 /*
2985 * Must have write access at the source to remove the old entry
2986 * and write access at the target to create the new entry.
2987 * Note that if target and source are the same, this can be
2988 * done in a single check.
2989 */
2990 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2991 goto out;
2992
2993 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2994 /*
2995 * Check to make sure rename is valid.
2996 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2997 */
2998 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2999 goto out;
3000 }
3001
3002 /*
3003 * Does target exist?
3004 */
3005 if (tzp) {
3006 if (rflags & RENAME_NOREPLACE) {
3007 error = SET_ERROR(EEXIST);
3008 goto out;
3009 }
3010 /*
3011 * Source and target must be the same type (unless exchanging).
3012 */
3013 if (!(rflags & RENAME_EXCHANGE)) {
3014 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
3015 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
3016
3017 if (s_is_dir != t_is_dir) {
3018 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
3019 goto out;
3020 }
3021 }
3022 /*
3023 * POSIX dictates that when the source and target
3024 * entries refer to the same file object, rename
3025 * must do nothing and exit without error.
3026 */
3027 if (szp->z_id == tzp->z_id) {
3028 error = 0;
3029 goto out;
3030 }
3031 } else if (rflags & RENAME_EXCHANGE) {
3032 /* Target must exist for RENAME_EXCHANGE. */
3033 error = SET_ERROR(ENOENT);
3034 goto out;
3035 }
3036
3037 /* Set up inode creation for RENAME_WHITEOUT. */
3038 if (rflags & RENAME_WHITEOUT) {
3039 /*
3040 * Whiteout files are not regular files or directories, so to
3041 * match zfs_create() we do not inherit the project id.
3042 */
3043 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
3044
3045 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
3046 if (error)
3047 goto out;
3048
3049 if (!have_acl) {
3050 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
3051 &acl_ids, mnt_ns);
3052 if (error)
3053 goto out;
3054 have_acl = B_TRUE;
3055 }
3056
3057 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
3058 error = SET_ERROR(EDQUOT);
3059 goto out;
3060 }
3061 }
3062
3063 tx = dmu_tx_create(zfsvfs->z_os);
3064 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3065 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3066 dmu_tx_hold_zap(tx, sdzp->z_id,
3067 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
3068 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3069 if (sdzp != tdzp) {
3070 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3071 zfs_sa_upgrade_txholds(tx, tdzp);
3072 }
3073 if (tzp) {
3074 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3075 zfs_sa_upgrade_txholds(tx, tzp);
3076 }
3077 if (rflags & RENAME_WHITEOUT) {
3078 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3079 ZFS_SA_BASE_ATTR_SIZE);
3080
3081 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
3082 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3083 if (!zfsvfs->z_use_sa &&
3084 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3085 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3086 0, acl_ids.z_aclp->z_acl_bytes);
3087 }
3088 }
3089 fuid_dirtied = zfsvfs->z_fuid_dirty;
3090 if (fuid_dirtied)
3091 zfs_fuid_txhold(zfsvfs, tx);
3092 zfs_sa_upgrade_txholds(tx, szp);
3093 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3094 error = dmu_tx_assign(tx,
3095 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3096 if (error) {
3097 if (zl != NULL)
3098 zfs_rename_unlock(&zl);
3099 zfs_dirent_unlock(sdl);
3100 zfs_dirent_unlock(tdl);
3101
3102 if (sdzp == tdzp)
3103 rw_exit(&sdzp->z_name_lock);
3104
3105 if (error == ERESTART) {
3106 waited = B_TRUE;
3107 dmu_tx_wait(tx);
3108 dmu_tx_abort(tx);
3109 zrele(szp);
3110 if (tzp)
3111 zrele(tzp);
3112 goto top;
3113 }
3114 dmu_tx_abort(tx);
3115 zrele(szp);
3116 if (tzp)
3117 zrele(tzp);
3118 zfs_exit(zfsvfs, FTAG);
3119 return (error);
3120 }
3121
3122 /*
3123 * Unlink the source.
3124 */
3125 szp->z_pflags |= ZFS_AV_MODIFIED;
3126 if (tdzp->z_pflags & ZFS_PROJINHERIT)
3127 szp->z_pflags |= ZFS_PROJINHERIT;
3128
3129 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3130 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3131 VERIFY0(error);
3132
3133 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3134 if (error)
3135 goto commit;
3136
3137 /*
3138 * Unlink the target.
3139 */
3140 if (tzp) {
3141 int tzflg = zflg;
3142
3143 if (rflags & RENAME_EXCHANGE) {
3144 /* This inode will be re-linked soon. */
3145 tzflg |= ZRENAMING;
3146
3147 tzp->z_pflags |= ZFS_AV_MODIFIED;
3148 if (sdzp->z_pflags & ZFS_PROJINHERIT)
3149 tzp->z_pflags |= ZFS_PROJINHERIT;
3150
3151 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3152 (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3153 ASSERT0(error);
3154 }
3155 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3156 if (error)
3157 goto commit_link_szp;
3158 }
3159
3160 /*
3161 * Create the new target links:
3162 * * We always link the target.
3163 * * RENAME_EXCHANGE: Link the old target to the source.
3164 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3165 */
3166 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3167 if (error) {
3168 /*
3169 * If we have removed the existing target, a subsequent call to
3170 * zfs_link_create() to add back the same entry, but with a new
3171 * dnode (szp), should not fail.
3172 */
3173 ASSERT0P(tzp);
3174 goto commit_link_tzp;
3175 }
3176
3177 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3178 case RENAME_EXCHANGE:
3179 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3180 /*
3181 * The same argument as zfs_link_create() failing for
3182 * szp applies here, since the source directory must
3183 * have had an entry we are replacing.
3184 */
3185 ASSERT0(error);
3186 if (error)
3187 goto commit_unlink_td_szp;
3188 break;
3189 case RENAME_WHITEOUT:
3190 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3191 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3192 if (error) {
3193 zfs_znode_delete(wzp, tx);
3194 remove_inode_hash(ZTOI(wzp));
3195 goto commit_unlink_td_szp;
3196 }
3197 break;
3198 }
3199
3200 if (fuid_dirtied)
3201 zfs_fuid_sync(zfsvfs, tx);
3202
3203 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3204 case RENAME_EXCHANGE:
3205 zfs_log_rename_exchange(zilog, tx,
3206 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3207 tdzp, tdl->dl_name, szp);
3208 break;
3209 case RENAME_WHITEOUT:
3210 zfs_log_rename_whiteout(zilog, tx,
3211 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3212 tdzp, tdl->dl_name, szp, wzp);
3213 break;
3214 default:
3215 ASSERT0(rflags & ~RENAME_NOREPLACE);
3216 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3217 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3218 break;
3219 }
3220
3221 commit:
3222 dmu_tx_commit(tx);
3223 out:
3224 if (have_acl)
3225 zfs_acl_ids_free(&acl_ids);
3226
3227 zfs_znode_update_vfs(sdzp);
3228 if (sdzp == tdzp)
3229 rw_exit(&sdzp->z_name_lock);
3230
3231 if (sdzp != tdzp)
3232 zfs_znode_update_vfs(tdzp);
3233
3234 zfs_znode_update_vfs(szp);
3235 zrele(szp);
3236 if (wzp) {
3237 zfs_znode_update_vfs(wzp);
3238 zrele(wzp);
3239 }
3240 if (tzp) {
3241 zfs_znode_update_vfs(tzp);
3242 zrele(tzp);
3243 }
3244
3245 if (zl != NULL)
3246 zfs_rename_unlock(&zl);
3247
3248 zfs_dirent_unlock(sdl);
3249 zfs_dirent_unlock(tdl);
3250
3251 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3252 error = zil_commit(zilog, 0);
3253
3254 zfs_exit(zfsvfs, FTAG);
3255 return (error);
3256
3257 /*
3258 * Clean-up path for broken link state.
3259 *
3260 * At this point we are in a (very) bad state, so we need to do our
3261 * best to correct the state. In particular, all of the nlinks are
3262 * wrong because we were destroying and creating links with ZRENAMING.
3263 *
3264 * In some form, all of these operations have to resolve the state:
3265 *
3266 * * link_destroy() *must* succeed. Fortunately, this is very likely
3267 * since we only just created it.
3268 *
3269 * * link_create()s are allowed to fail (though they shouldn't because
3270 * we only just unlinked them and are putting the entries back
3271 * during clean-up). But if they fail, we can just forcefully drop
3272 * the nlink value to (at the very least) avoid broken nlink values
3273 * -- though in the case of non-empty directories we will have to
3274 * panic (otherwise we'd have a leaked directory with a broken ..).
3275 */
3276 commit_unlink_td_szp:
3277 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3278 commit_link_tzp:
3279 if (tzp) {
3280 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3281 VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3282 }
3283 commit_link_szp:
3284 if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3285 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3286 goto commit;
3287 }
3288
3289 /*
3290 * Insert the indicated symbolic reference entry into the directory.
3291 *
3292 * IN: dzp - Directory to contain new symbolic link.
3293 * name - Name of directory entry in dip.
3294 * vap - Attributes of new entry.
3295 * link - Name for new symlink entry.
3296 * cr - credentials of caller.
3297 * flags - case flags
3298 * mnt_ns - user namespace of the mount
3299 *
3300 * OUT: zpp - Znode for new symbolic link.
3301 *
3302 * RETURN: 0 on success, error code on failure.
3303 *
3304 * Timestamps:
3305 * dip - ctime|mtime updated
3306 */
3307 int
zfs_symlink(znode_t * dzp,char * name,vattr_t * vap,char * link,znode_t ** zpp,cred_t * cr,int flags,zidmap_t * mnt_ns)3308 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3309 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3310 {
3311 znode_t *zp;
3312 zfs_dirlock_t *dl;
3313 dmu_tx_t *tx;
3314 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
3315 zilog_t *zilog;
3316 uint64_t len = strlen(link);
3317 int error;
3318 int zflg = ZNEW;
3319 zfs_acl_ids_t acl_ids;
3320 boolean_t fuid_dirtied;
3321 uint64_t txtype = TX_SYMLINK;
3322 boolean_t waited = B_FALSE;
3323
3324 ASSERT(S_ISLNK(vap->va_mode));
3325
3326 if (name == NULL)
3327 return (SET_ERROR(EINVAL));
3328
3329 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3330 return (error);
3331 zilog = zfsvfs->z_log;
3332
3333 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3334 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3335 zfs_exit(zfsvfs, FTAG);
3336 return (SET_ERROR(EILSEQ));
3337 }
3338 if (flags & FIGNORECASE)
3339 zflg |= ZCILOOK;
3340
3341 if (len > MAXPATHLEN) {
3342 zfs_exit(zfsvfs, FTAG);
3343 return (SET_ERROR(ENAMETOOLONG));
3344 }
3345
3346 if ((error = zfs_acl_ids_create(dzp, 0,
3347 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3348 zfs_exit(zfsvfs, FTAG);
3349 return (error);
3350 }
3351 top:
3352 *zpp = NULL;
3353
3354 /*
3355 * Attempt to lock directory; fail if entry already exists.
3356 */
3357 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3358 if (error) {
3359 zfs_acl_ids_free(&acl_ids);
3360 zfs_exit(zfsvfs, FTAG);
3361 return (error);
3362 }
3363
3364 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3365 zfs_acl_ids_free(&acl_ids);
3366 zfs_dirent_unlock(dl);
3367 zfs_exit(zfsvfs, FTAG);
3368 return (error);
3369 }
3370
3371 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3372 zfs_acl_ids_free(&acl_ids);
3373 zfs_dirent_unlock(dl);
3374 zfs_exit(zfsvfs, FTAG);
3375 return (SET_ERROR(EDQUOT));
3376 }
3377 tx = dmu_tx_create(zfsvfs->z_os);
3378 fuid_dirtied = zfsvfs->z_fuid_dirty;
3379 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3380 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3381 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3382 ZFS_SA_BASE_ATTR_SIZE + len);
3383 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3384 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3385 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3386 acl_ids.z_aclp->z_acl_bytes);
3387 }
3388 if (fuid_dirtied)
3389 zfs_fuid_txhold(zfsvfs, tx);
3390 error = dmu_tx_assign(tx,
3391 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3392 if (error) {
3393 zfs_dirent_unlock(dl);
3394 if (error == ERESTART) {
3395 waited = B_TRUE;
3396 dmu_tx_wait(tx);
3397 dmu_tx_abort(tx);
3398 goto top;
3399 }
3400 zfs_acl_ids_free(&acl_ids);
3401 dmu_tx_abort(tx);
3402 zfs_exit(zfsvfs, FTAG);
3403 return (error);
3404 }
3405
3406 /*
3407 * Create a new object for the symlink.
3408 * for version 4 ZPL datasets the symlink will be an SA attribute
3409 */
3410 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3411
3412 if (fuid_dirtied)
3413 zfs_fuid_sync(zfsvfs, tx);
3414
3415 mutex_enter(&zp->z_lock);
3416 if (zp->z_is_sa)
3417 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3418 link, len, tx);
3419 else
3420 zfs_sa_symlink(zp, link, len, tx);
3421 mutex_exit(&zp->z_lock);
3422
3423 zp->z_size = len;
3424 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3425 &zp->z_size, sizeof (zp->z_size), tx);
3426 /*
3427 * Insert the new object into the directory.
3428 */
3429 error = zfs_link_create(dl, zp, tx, ZNEW);
3430 if (error != 0) {
3431 zfs_znode_delete(zp, tx);
3432 remove_inode_hash(ZTOI(zp));
3433 } else {
3434 if (flags & FIGNORECASE)
3435 txtype |= TX_CI;
3436 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3437
3438 zfs_znode_update_vfs(dzp);
3439 zfs_znode_update_vfs(zp);
3440 }
3441
3442 zfs_acl_ids_free(&acl_ids);
3443
3444 dmu_tx_commit(tx);
3445
3446 zfs_dirent_unlock(dl);
3447
3448 if (error == 0) {
3449 *zpp = zp;
3450
3451 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3452 error = zil_commit(zilog, 0);
3453 } else {
3454 zrele(zp);
3455 }
3456
3457 zfs_exit(zfsvfs, FTAG);
3458 return (error);
3459 }
3460
3461 /*
3462 * Return, in the buffer contained in the provided uio structure,
3463 * the symbolic path referred to by ip.
3464 *
3465 * IN: ip - inode of symbolic link
3466 * uio - structure to contain the link path.
3467 * cr - credentials of caller.
3468 *
3469 * RETURN: 0 if success
3470 * error code if failure
3471 *
3472 * Timestamps:
3473 * ip - atime updated
3474 */
3475 int
zfs_readlink(struct inode * ip,zfs_uio_t * uio,cred_t * cr)3476 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3477 {
3478 (void) cr;
3479 znode_t *zp = ITOZ(ip);
3480 zfsvfs_t *zfsvfs = ITOZSB(ip);
3481 int error;
3482
3483 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3484 return (error);
3485
3486 mutex_enter(&zp->z_lock);
3487 if (zp->z_is_sa)
3488 error = sa_lookup_uio(zp->z_sa_hdl,
3489 SA_ZPL_SYMLINK(zfsvfs), uio);
3490 else
3491 error = zfs_sa_readlink(zp, uio);
3492 mutex_exit(&zp->z_lock);
3493
3494 zfs_exit(zfsvfs, FTAG);
3495 return (error);
3496 }
3497
3498 /*
3499 * Insert a new entry into directory tdzp referencing szp.
3500 *
3501 * IN: tdzp - Directory to contain new entry.
3502 * szp - znode of new entry.
3503 * name - name of new entry.
3504 * cr - credentials of caller.
3505 * flags - case flags.
3506 *
3507 * RETURN: 0 if success
3508 * error code if failure
3509 *
3510 * Timestamps:
3511 * tdzp - ctime|mtime updated
3512 * szp - ctime updated
3513 */
3514 int
zfs_link(znode_t * tdzp,znode_t * szp,char * name,cred_t * cr,int flags)3515 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3516 int flags)
3517 {
3518 struct inode *sip = ZTOI(szp);
3519 znode_t *tzp;
3520 zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
3521 zilog_t *zilog;
3522 zfs_dirlock_t *dl;
3523 dmu_tx_t *tx;
3524 int error;
3525 int zf = ZNEW;
3526 uint64_t parent;
3527 uid_t owner;
3528 boolean_t waited = B_FALSE;
3529 boolean_t is_tmpfile = 0;
3530 uint64_t txg;
3531
3532 is_tmpfile = (sip->i_nlink == 0 &&
3533 (inode_state_read_once(sip) & I_LINKABLE));
3534
3535 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3536
3537 if (name == NULL)
3538 return (SET_ERROR(EINVAL));
3539
3540 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3541 return (error);
3542 zilog = zfsvfs->z_log;
3543
3544 /*
3545 * POSIX dictates that we return EPERM here.
3546 * Better choices include ENOTSUP or EISDIR.
3547 */
3548 if (S_ISDIR(sip->i_mode)) {
3549 zfs_exit(zfsvfs, FTAG);
3550 return (SET_ERROR(EPERM));
3551 }
3552
3553 if ((error = zfs_verify_zp(szp)) != 0) {
3554 zfs_exit(zfsvfs, FTAG);
3555 return (error);
3556 }
3557
3558 /*
3559 * If we are using project inheritance, means if the directory has
3560 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3561 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3562 * such case, we only allow hard link creation in our tree when the
3563 * project IDs are the same.
3564 */
3565 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3566 tdzp->z_projid != szp->z_projid) {
3567 zfs_exit(zfsvfs, FTAG);
3568 return (SET_ERROR(EXDEV));
3569 }
3570
3571 /*
3572 * We check i_sb because snapshots and the ctldir must have different
3573 * super blocks.
3574 */
3575 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3576 zfs_exit(zfsvfs, FTAG);
3577 return (SET_ERROR(EXDEV));
3578 }
3579
3580 /* Prevent links to .zfs/shares files */
3581
3582 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3583 &parent, sizeof (uint64_t))) != 0) {
3584 zfs_exit(zfsvfs, FTAG);
3585 return (error);
3586 }
3587 if (parent == zfsvfs->z_shares_dir) {
3588 zfs_exit(zfsvfs, FTAG);
3589 return (SET_ERROR(EPERM));
3590 }
3591
3592 if (zfsvfs->z_utf8 && u8_validate(name,
3593 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3594 zfs_exit(zfsvfs, FTAG);
3595 return (SET_ERROR(EILSEQ));
3596 }
3597 if (flags & FIGNORECASE)
3598 zf |= ZCILOOK;
3599
3600 /*
3601 * We do not support links between attributes and non-attributes
3602 * because of the potential security risk of creating links
3603 * into "normal" file space in order to circumvent restrictions
3604 * imposed in attribute space.
3605 */
3606 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3607 zfs_exit(zfsvfs, FTAG);
3608 return (SET_ERROR(EINVAL));
3609 }
3610
3611 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3612 cr, ZFS_OWNER);
3613 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3614 zfs_exit(zfsvfs, FTAG);
3615 return (SET_ERROR(EPERM));
3616 }
3617
3618 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3619 zfs_init_idmap))) {
3620 zfs_exit(zfsvfs, FTAG);
3621 return (error);
3622 }
3623
3624 top:
3625 /*
3626 * Attempt to lock directory; fail if entry already exists.
3627 */
3628 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3629 if (error) {
3630 zfs_exit(zfsvfs, FTAG);
3631 return (error);
3632 }
3633
3634 tx = dmu_tx_create(zfsvfs->z_os);
3635 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3636 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3637 if (is_tmpfile)
3638 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3639
3640 zfs_sa_upgrade_txholds(tx, szp);
3641 zfs_sa_upgrade_txholds(tx, tdzp);
3642 error = dmu_tx_assign(tx,
3643 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3644 if (error) {
3645 zfs_dirent_unlock(dl);
3646 if (error == ERESTART) {
3647 waited = B_TRUE;
3648 dmu_tx_wait(tx);
3649 dmu_tx_abort(tx);
3650 goto top;
3651 }
3652 dmu_tx_abort(tx);
3653 zfs_exit(zfsvfs, FTAG);
3654 return (error);
3655 }
3656 /* unmark z_unlinked so zfs_link_create will not reject */
3657 if (is_tmpfile)
3658 szp->z_unlinked = B_FALSE;
3659 error = zfs_link_create(dl, szp, tx, 0);
3660
3661 if (error == 0) {
3662 uint64_t txtype = TX_LINK;
3663 /*
3664 * tmpfile is created to be in z_unlinkedobj, so remove it.
3665 * Also, we don't log in ZIL, because all previous file
3666 * operation on the tmpfile are ignored by ZIL. Instead we
3667 * always wait for txg to sync to make sure all previous
3668 * operation are sync safe.
3669 */
3670 if (is_tmpfile) {
3671 VERIFY0(zap_remove_int(zfsvfs->z_os,
3672 zfsvfs->z_unlinkedobj, szp->z_id, tx));
3673 } else {
3674 if (flags & FIGNORECASE)
3675 txtype |= TX_CI;
3676 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3677 }
3678 } else if (is_tmpfile) {
3679 /* restore z_unlinked since when linking failed */
3680 szp->z_unlinked = B_TRUE;
3681 }
3682 txg = dmu_tx_get_txg(tx);
3683 dmu_tx_commit(tx);
3684
3685 zfs_dirent_unlock(dl);
3686
3687 if (error == 0) {
3688 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3689 error = zil_commit(zilog, 0);
3690
3691 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3692 txg_wait_flag_t wait_flags =
3693 spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
3694 ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
3695 error = txg_wait_synced_flags(
3696 dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
3697 if (error != 0) {
3698 ASSERT3U(error, ==, ESHUTDOWN);
3699 error = SET_ERROR(EIO);
3700 }
3701 }
3702 }
3703
3704 zfs_znode_update_vfs(tdzp);
3705 zfs_znode_update_vfs(szp);
3706 zfs_exit(zfsvfs, FTAG);
3707 return (error);
3708 }
3709
3710 /* Finish page writeback. */
3711 static inline void
zfs_page_writeback_done(struct page * pp,int err)3712 zfs_page_writeback_done(struct page *pp, int err)
3713 {
3714 if (err != 0) {
3715 /*
3716 * Writeback failed. Re-dirty the page. It was undirtied before
3717 * the IO was issued (in zfs_putpage() or write_cache_pages()).
3718 * The kernel only considers writeback for dirty pages; if we
3719 * don't do this, it is eligible for eviction without being
3720 * written out, which we definitely don't want.
3721 */
3722 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3723 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3724 #else
3725 __set_page_dirty_nobuffers(pp);
3726 #endif
3727 }
3728
3729 ClearPageError(pp);
3730 end_page_writeback(pp);
3731 }
3732
/*
 * ZIL callback for page writeback. zfs_putpage() passes it to
 * zfs_log_write() for syncing writes. It runs when the ZIL itx has been
 * written to the log or the whole txg syncs, or if the ZIL crashes or the
 * pool suspends. `arg' is the page under writeback and any failure arrives
 * as `err'.
 */
static void
zfs_putpage_commit_cb(void *arg, int err)
{
	struct page *pp = arg;

	zfs_page_writeback_done(pp, err);
}
3744
3745 /*
3746 * Push a page out to disk, once the page is on stable storage the
3747 * registered commit callback will be run as notification of completion.
3748 *
3749 * IN: ip - page mapped for inode.
3750 * pp - page to push (page is locked)
3751 * wbc - writeback control data
3752 * for_sync - does the caller intend to wait synchronously for the
3753 * page writeback to complete?
3754 *
3755 * RETURN: 0 if success
3756 * error code if failure
3757 *
3758 * Timestamps:
3759 * ip - ctime|mtime updated
3760 */
3761 int
zfs_putpage(struct inode * ip,struct page * pp,struct writeback_control * wbc,boolean_t for_sync)3762 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3763 boolean_t for_sync)
3764 {
3765 znode_t *zp = ITOZ(ip);
3766 zfsvfs_t *zfsvfs = ITOZSB(ip);
3767 loff_t offset;
3768 loff_t pgoff;
3769 unsigned int pglen;
3770 dmu_tx_t *tx;
3771 caddr_t va;
3772 int err = 0;
3773 uint64_t mtime[2], ctime[2];
3774 inode_timespec_t tmp_ts;
3775 sa_bulk_attr_t bulk[3];
3776 int cnt = 0;
3777 struct address_space *mapping;
3778
3779 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3780 return (err);
3781
3782 ASSERT(PageLocked(pp));
3783
3784 pgoff = page_offset(pp); /* Page byte-offset in file */
3785 offset = i_size_read(ip); /* File length in bytes */
3786 pglen = MIN(PAGE_SIZE, /* Page length in bytes */
3787 P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3788
3789 /* Page is beyond end of file */
3790 if (pgoff >= offset) {
3791 unlock_page(pp);
3792 zfs_exit(zfsvfs, FTAG);
3793 return (0);
3794 }
3795
3796 /* Truncate page length to end of file */
3797 if (pgoff + pglen > offset)
3798 pglen = offset - pgoff;
3799
3800 #if 0
3801 /*
3802 * FIXME: Allow mmap writes past its quota. The correct fix
3803 * is to register a page_mkwrite() handler to count the page
3804 * against its quota when it is about to be dirtied.
3805 */
3806 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3807 KUID_TO_SUID(ip->i_uid)) ||
3808 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3809 KGID_TO_SGID(ip->i_gid)) ||
3810 (zp->z_projid != ZFS_DEFAULT_PROJID &&
3811 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3812 zp->z_projid))) {
3813 err = EDQUOT;
3814 }
3815 #endif
3816
3817 /*
3818 * The ordering here is critical and must adhere to the following
3819 * rules in order to avoid deadlocking in either zfs_read() or
3820 * zfs_free_range() due to a lock inversion.
3821 *
3822 * 1) The page must be unlocked prior to acquiring the range lock.
3823 * This is critical because zfs_read() calls find_lock_page()
3824 * which may block on the page lock while holding the range lock.
3825 *
3826 * 2) Before setting or clearing write back on a page the range lock
3827 * must be held in order to prevent a lock inversion with the
3828 * zfs_free_range() function.
3829 *
3830 * This presents a problem because upon entering this function the
3831 * page lock is already held. To safely acquire the range lock the
3832 * page lock must be dropped. This creates a window where another
3833 * process could truncate, invalidate, dirty, or write out the page.
3834 *
3835 * Therefore, after successfully reacquiring the range and page locks
3836 * the current page state is checked. In the common case everything
3837 * will be as is expected and it can be written out. However, if
3838 * the page state has changed it must be handled accordingly.
3839 */
3840 mapping = pp->mapping;
3841 redirty_page_for_writepage(wbc, pp);
3842 unlock_page(pp);
3843
3844 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3845 pgoff, pglen, RL_WRITER);
3846 lock_page(pp);
3847
3848 /* Page mapping changed or it was no longer dirty, we're done */
3849 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3850 unlock_page(pp);
3851 zfs_rangelock_exit(lr);
3852 zfs_exit(zfsvfs, FTAG);
3853 return (0);
3854 }
3855
3856 /* Another process started write block if required */
3857 if (PageWriteback(pp)) {
3858 unlock_page(pp);
3859 zfs_rangelock_exit(lr);
3860
3861 if (wbc->sync_mode != WB_SYNC_NONE) {
3862 if (PageWriteback(pp))
3863 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3864 folio_wait_bit(page_folio(pp), PG_writeback);
3865 #else
3866 wait_on_page_bit(pp, PG_writeback);
3867 #endif
3868 }
3869
3870 zfs_exit(zfsvfs, FTAG);
3871 return (0);
3872 }
3873
3874 /* Clear the dirty flag the required locks are held */
3875 if (!clear_page_dirty_for_io(pp)) {
3876 unlock_page(pp);
3877 zfs_rangelock_exit(lr);
3878 zfs_exit(zfsvfs, FTAG);
3879 return (0);
3880 }
3881
3882 /*
3883 * Counterpart for redirty_page_for_writepage() above. This page
3884 * was in fact not skipped and should not be counted as if it were.
3885 */
3886 wbc->pages_skipped--;
3887 set_page_writeback(pp);
3888 unlock_page(pp);
3889
3890 tx = dmu_tx_create(zfsvfs->z_os);
3891 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3892 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3893 zfs_sa_upgrade_txholds(tx, zp);
3894
3895 err = dmu_tx_assign(tx, DMU_TX_WAIT);
3896 if (err != 0) {
3897 dmu_tx_abort(tx);
3898 zfs_page_writeback_done(pp, err);
3899 zfs_rangelock_exit(lr);
3900 zfs_exit(zfsvfs, FTAG);
3901
3902 /*
3903 * Don't return error for an async writeback; we've re-dirtied
3904 * the page so it will be tried again some other time.
3905 */
3906 return (for_sync ? err : 0);
3907 }
3908
3909 va = kmap(pp);
3910 ASSERT3U(pglen, <=, PAGE_SIZE);
3911 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx,
3912 DMU_READ_PREFETCH);
3913 kunmap(pp);
3914
3915 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3916 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3917 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3918 &zp->z_pflags, 8);
3919
3920 /* Preserve the mtime and ctime provided by the inode */
3921 tmp_ts = zpl_inode_get_mtime(ip);
3922 ZFS_TIME_ENCODE(&tmp_ts, mtime);
3923 tmp_ts = zpl_inode_get_ctime(ip);
3924 ZFS_TIME_ENCODE(&tmp_ts, ctime);
3925 zp->z_atime_dirty = B_FALSE;
3926 zp->z_seq++;
3927
3928 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3929
3930 /*
3931 * A note about for_sync vs wbc->sync_mode.
3932 *
3933 * for_sync indicates that this is a syncing writeback, that is, kernel
3934 * caller expects the data to be durably stored before being notified.
3935 * Often, but not always, the call was triggered by a userspace syncing
3936 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
3937 * means that that page should remain "locked" (in the writeback state)
3938 * until it is definitely on disk (ie zil_commit() or spa_sync()).
3939 * Otherwise, we can unlock and return as soon as it is on the
3940 * in-memory ZIL.
3941 *
3942 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
3943 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
3944 * indicates this a regular async writeback (eg a cache eviction) and
3945 * so does not need a durability guarantee, while WB_SYNC_ALL indicates
3946 * a syncing op that must be waited on (by convention, we test for
3947 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
3948 * performance should there ever be a new mode that we have not yet
3949 * added support for).
3950 *
3951 * So, why a separate for_sync field? This is because zpl_writepages()
3952 * calls zfs_putpage() multiple times for a single "logical" operation.
3953 * It wants all the individual pages to be for_sync==TRUE ie only
3954 * unlocked once durably stored, but it only wants one call to
3955 * zil_commit() at the very end, once all the pages are synced. So,
3956 * it repurposes sync_mode slightly to indicate who issue and wait for
3957 * the IO: for NONE, the caller to zfs_putpage() will do it, while for
3958 * ALL, zfs_putpage should do it.
3959 *
3960 * Summary:
3961 * for_sync: 0=unlock immediately; 1=unlock once on disk
3962 * sync_mode: NONE=caller will commit; ALL=we will commit
3963 */
3964 boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
3965
3966 /*
3967 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
3968 * because it is a policy flag that indicates "someone will call
3969 * zil_commit() soon". for_sync=TRUE means exactly that; the only
3970 * question is whether it will be us, or zpl_writepages().
3971 */
3972 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
3973 B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
3974
3975 if (!for_sync) {
3976 /*
3977 * Async writeback is logged and written to the DMU, so page
3978 * can now be unlocked.
3979 */
3980 zfs_page_writeback_done(pp, 0);
3981 }
3982
3983 dmu_tx_commit(tx);
3984
3985 zfs_rangelock_exit(lr);
3986
3987 if (need_commit) {
3988 err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
3989 if (err != 0) {
3990 zfs_exit(zfsvfs, FTAG);
3991 return (err);
3992 }
3993 }
3994
3995 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3996
3997 zfs_exit(zfsvfs, FTAG);
3998 return (err);
3999 }
4000
4001 /*
4002 * Update the system attributes when the inode has been dirtied. For the
4003 * moment we only update the mode, atime, mtime, and ctime.
4004 */
4005 int
zfs_dirty_inode(struct inode * ip,int flags)4006 zfs_dirty_inode(struct inode *ip, int flags)
4007 {
4008 znode_t *zp = ITOZ(ip);
4009 zfsvfs_t *zfsvfs = ITOZSB(ip);
4010 dmu_tx_t *tx;
4011 uint64_t mode, atime[2], mtime[2], ctime[2];
4012 inode_timespec_t tmp_ts;
4013 sa_bulk_attr_t bulk[4];
4014 int error = 0;
4015 int cnt = 0;
4016
4017 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4018 return (0);
4019
4020 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4021 return (error);
4022
4023 #ifdef I_DIRTY_TIME
4024 /*
4025 * This is the lazytime semantic introduced in Linux 4.0
4026 * This flag will only be called from update_time when lazytime is set.
4027 * (Note, I_DIRTY_SYNC will also set if not lazytime)
4028 * Fortunately mtime and ctime are managed within ZFS itself, so we
4029 * only need to dirty atime.
4030 */
4031 if (flags == I_DIRTY_TIME) {
4032 zp->z_atime_dirty = B_TRUE;
4033 goto out;
4034 }
4035 #endif
4036
4037 tx = dmu_tx_create(zfsvfs->z_os);
4038
4039 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4040 zfs_sa_upgrade_txholds(tx, zp);
4041
4042 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4043 if (error) {
4044 dmu_tx_abort(tx);
4045 goto out;
4046 }
4047
4048 mutex_enter(&zp->z_lock);
4049 zp->z_atime_dirty = B_FALSE;
4050
4051 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4052 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4053 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4054 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4055
4056 /* Preserve the mode, mtime and ctime provided by the inode */
4057 tmp_ts = zpl_inode_get_atime(ip);
4058 ZFS_TIME_ENCODE(&tmp_ts, atime);
4059 tmp_ts = zpl_inode_get_mtime(ip);
4060 ZFS_TIME_ENCODE(&tmp_ts, mtime);
4061 tmp_ts = zpl_inode_get_ctime(ip);
4062 ZFS_TIME_ENCODE(&tmp_ts, ctime);
4063 mode = ip->i_mode;
4064
4065 zp->z_mode = mode;
4066
4067 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4068 mutex_exit(&zp->z_lock);
4069
4070 dmu_tx_commit(tx);
4071 out:
4072 zfs_exit(zfsvfs, FTAG);
4073 return (error);
4074 }
4075
4076 void
zfs_inactive(struct inode * ip)4077 zfs_inactive(struct inode *ip)
4078 {
4079 znode_t *zp = ITOZ(ip);
4080 zfsvfs_t *zfsvfs = ITOZSB(ip);
4081 krwlock_t *zti_lock = &zfsvfs->z_teardown_inactive_lock;
4082 uint64_t atime[2];
4083 int error;
4084 int need_unlock = 0;
4085 boolean_t no_lockdep = B_FALSE;
4086
4087 /* Only read lock if we haven't already write locked, e.g. rollback */
4088 if (!RW_WRITE_HELD(zti_lock)) {
4089 need_unlock = 1;
4090 /*
4091 * kswapd reaches evict_inode() with fs_reclaim held. Suppress
4092 * lockdep only for this reclaim-thread acquire/release pair.
4093 */
4094 no_lockdep = current_is_reclaim_thread();
4095 if (no_lockdep)
4096 rw_enter_nolockdep(zti_lock, RW_READER);
4097 else
4098 rw_enter(zti_lock, RW_READER);
4099 }
4100 if (zp->z_sa_hdl == NULL) {
4101 if (need_unlock) {
4102 if (no_lockdep)
4103 rw_exit_nolockdep(zti_lock);
4104 else
4105 rw_exit(zti_lock);
4106 }
4107 return;
4108 }
4109
4110 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
4111 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4112
4113 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4114 zfs_sa_upgrade_txholds(tx, zp);
4115 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4116 if (error) {
4117 dmu_tx_abort(tx);
4118 } else {
4119 inode_timespec_t tmp_atime;
4120 tmp_atime = zpl_inode_get_atime(ip);
4121 ZFS_TIME_ENCODE(&tmp_atime, atime);
4122 mutex_enter(&zp->z_lock);
4123 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4124 (void *)&atime, sizeof (atime), tx);
4125 zp->z_atime_dirty = B_FALSE;
4126 mutex_exit(&zp->z_lock);
4127 dmu_tx_commit(tx);
4128 }
4129 }
4130
4131 zfs_zinactive(zp);
4132 if (need_unlock) {
4133 if (no_lockdep)
4134 rw_exit_nolockdep(zti_lock);
4135 else
4136 rw_exit(zti_lock);
4137 }
4138 }
4139
4140 /*
4141 * Fill pages with data from the disk.
4142 */
4143 static int
zfs_fillpage(struct inode * ip,struct page * pp)4144 zfs_fillpage(struct inode *ip, struct page *pp)
4145 {
4146 znode_t *zp = ITOZ(ip);
4147 zfsvfs_t *zfsvfs = ITOZSB(ip);
4148 loff_t i_size = i_size_read(ip);
4149 u_offset_t io_off = page_offset(pp);
4150 size_t io_len = PAGE_SIZE;
4151
4152 ASSERT3U(io_off, <, i_size);
4153
4154 if (io_off + io_len > i_size)
4155 io_len = i_size - io_off;
4156
4157 void *va = kmap(pp);
4158 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
4159 io_len, va, DMU_READ_PREFETCH);
4160 if (io_len != PAGE_SIZE)
4161 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
4162 kunmap(pp);
4163
4164 if (error) {
4165 /* convert checksum errors into IO errors */
4166 if (error == ECKSUM)
4167 error = SET_ERROR(EIO);
4168
4169 SetPageError(pp);
4170 ClearPageUptodate(pp);
4171 } else {
4172 ClearPageError(pp);
4173 SetPageUptodate(pp);
4174 }
4175
4176 return (error);
4177 }
4178
4179 /*
4180 * Uses zfs_fillpage to read data from the file and fill the page.
4181 *
4182 * IN: ip - inode of file to get data from.
4183 * pp - page to read
4184 *
4185 * RETURN: 0 on success, error code on failure.
4186 *
4187 * Timestamps:
4188 * vp - atime updated
4189 */
4190 int
zfs_getpage(struct inode * ip,struct page * pp)4191 zfs_getpage(struct inode *ip, struct page *pp)
4192 {
4193 zfsvfs_t *zfsvfs = ITOZSB(ip);
4194 znode_t *zp = ITOZ(ip);
4195 int error;
4196 loff_t i_size = i_size_read(ip);
4197 u_offset_t io_off = page_offset(pp);
4198 size_t io_len = PAGE_SIZE;
4199
4200 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4201 return (error);
4202
4203 ASSERT3U(io_off, <, i_size);
4204
4205 if (io_off + io_len > i_size)
4206 io_len = i_size - io_off;
4207
4208 /*
4209 * It is important to hold the rangelock here because it is possible
4210 * a Direct I/O write or block clone might be taking place at the same
4211 * time that a page is being faulted in through filemap_fault(). With
4212 * Direct I/O writes and block cloning db->db_data will be set to NULL
4213 * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the
4214 * rangelock is not held, then there is a race between faulting in a
4215 * page and writing out a Direct I/O write or block cloning. Without
4216 * the rangelock a NULL pointer dereference can occur in
4217 * dmu_read_impl() for db->db_data during the mempcy operation when
4218 * zfs_fillpage() calls dmu_read().
4219 */
4220 zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
4221 io_off, io_len, RL_READER);
4222 if (lr == NULL) {
4223 /*
4224 * It is important to drop the page lock before grabbing the
4225 * rangelock to avoid another deadlock between here and
4226 * zfs_write() -> update_pages(). update_pages() holds both the
4227 * rangelock and the page lock.
4228 */
4229 get_page(pp);
4230 unlock_page(pp);
4231 lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
4232 io_len, RL_READER);
4233 lock_page(pp);
4234 put_page(pp);
4235 }
4236 error = zfs_fillpage(ip, pp);
4237 zfs_rangelock_exit(lr);
4238
4239 if (error == 0)
4240 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
4241
4242 zfs_exit(zfsvfs, FTAG);
4243
4244 return (error);
4245 }
4246
4247 /*
4248 * Check ZFS specific permissions to memory map a section of a file.
4249 *
4250 * IN: ip - inode of the file to mmap
4251 * off - file offset
4252 * addrp - start address in memory region
4253 * len - length of memory region
4254 * vm_flags- address flags
4255 *
4256 * RETURN: 0 if success
4257 * error code if failure
4258 */
4259 int
zfs_map(struct inode * ip,offset_t off,caddr_t * addrp,size_t len,unsigned long vm_flags)4260 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4261 unsigned long vm_flags)
4262 {
4263 (void) addrp;
4264 znode_t *zp = ITOZ(ip);
4265 zfsvfs_t *zfsvfs = ITOZSB(ip);
4266 int error;
4267
4268 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4269 return (error);
4270
4271 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
4272 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4273 zfs_exit(zfsvfs, FTAG);
4274 return (SET_ERROR(EPERM));
4275 }
4276
4277 if ((vm_flags & (VM_READ | VM_EXEC)) &&
4278 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4279 zfs_exit(zfsvfs, FTAG);
4280 return (SET_ERROR(EACCES));
4281 }
4282
4283 if (off < 0 || len > MAXOFFSET_T - off) {
4284 zfs_exit(zfsvfs, FTAG);
4285 return (SET_ERROR(ENXIO));
4286 }
4287
4288 zfs_exit(zfsvfs, FTAG);
4289 return (0);
4290 }
4291
4292 /*
4293 * Free or allocate space in a file. Currently, this function only
4294 * supports the `F_FREESP' command. However, this command is somewhat
4295 * misnamed, as its functionality includes the ability to allocate as
4296 * well as free space.
4297 *
4298 * IN: zp - znode of file to free data in.
4299 * cmd - action to take (only F_FREESP supported).
4300 * bfp - section of file to free/alloc.
4301 * flag - current file open mode flags.
4302 * offset - current file offset.
4303 * cr - credentials of caller.
4304 *
4305 * RETURN: 0 on success, error code on failure.
4306 *
4307 * Timestamps:
4308 * zp - ctime|mtime updated
4309 */
4310 int
zfs_space(znode_t * zp,int cmd,flock64_t * bfp,int flag,offset_t offset,cred_t * cr)4311 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4312 offset_t offset, cred_t *cr)
4313 {
4314 (void) offset;
4315 zfsvfs_t *zfsvfs = ZTOZSB(zp);
4316 uint64_t off, len;
4317 int error;
4318
4319 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4320 return (error);
4321
4322 if (cmd != F_FREESP) {
4323 zfs_exit(zfsvfs, FTAG);
4324 return (SET_ERROR(EINVAL));
4325 }
4326
4327 /*
4328 * Callers might not be able to detect properly that we are read-only,
4329 * so check it explicitly here.
4330 */
4331 if (zfs_is_readonly(zfsvfs)) {
4332 zfs_exit(zfsvfs, FTAG);
4333 return (SET_ERROR(EROFS));
4334 }
4335
4336 if (bfp->l_len < 0) {
4337 zfs_exit(zfsvfs, FTAG);
4338 return (SET_ERROR(EINVAL));
4339 }
4340
4341 /*
4342 * Permissions aren't checked on Solaris because on this OS
4343 * zfs_space() can only be called with an opened file handle.
4344 * On Linux we can get here through truncate_range() which
4345 * operates directly on inodes, so we need to check access rights.
4346 */
4347 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4348 zfs_init_idmap))) {
4349 zfs_exit(zfsvfs, FTAG);
4350 return (error);
4351 }
4352
4353 off = bfp->l_start;
4354 len = bfp->l_len; /* 0 means from off to end of file */
4355
4356 error = zfs_freesp(zp, off, len, flag, TRUE);
4357
4358 zfs_exit(zfsvfs, FTAG);
4359 return (error);
4360 }
4361
4362 int
zfs_fid(struct inode * ip,fid_t * fidp)4363 zfs_fid(struct inode *ip, fid_t *fidp)
4364 {
4365 znode_t *zp = ITOZ(ip);
4366 zfsvfs_t *zfsvfs = ITOZSB(ip);
4367 uint32_t gen;
4368 uint64_t gen64;
4369 uint64_t object = zp->z_id;
4370 zfid_short_t *zfid;
4371 int size, i, error;
4372
4373 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4374 return (error);
4375
4376 if (fidp->fid_len < SHORT_FID_LEN) {
4377 fidp->fid_len = SHORT_FID_LEN;
4378 zfs_exit(zfsvfs, FTAG);
4379 return (SET_ERROR(ENOSPC));
4380 }
4381
4382 if ((error = zfs_verify_zp(zp)) != 0) {
4383 zfs_exit(zfsvfs, FTAG);
4384 return (error);
4385 }
4386
4387 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4388 &gen64, sizeof (uint64_t))) != 0) {
4389 zfs_exit(zfsvfs, FTAG);
4390 return (error);
4391 }
4392
4393 gen = (uint32_t)gen64;
4394
4395 size = SHORT_FID_LEN;
4396
4397 zfid = (zfid_short_t *)fidp;
4398
4399 zfid->zf_len = size;
4400
4401 for (i = 0; i < sizeof (zfid->zf_object); i++)
4402 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4403
4404 /* Must have a non-zero generation number to distinguish from .zfs */
4405 if (gen == 0)
4406 gen = 1;
4407 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4408 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4409
4410 zfs_exit(zfsvfs, FTAG);
4411 return (0);
4412 }
4413
#if defined(_KERNEL)
/* Exported vnode-operation entry points, in alphabetical order. */
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_map);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_tmpfile);

/* Module parameter for zfs_delete_blocks (mode 0644). */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif
4441