// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2025, Klara, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */


#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_rlock.h>
#include <sys/cred.h>
#include <sys/zpl.h>
#include <sys/zil.h>
#include <sys/sa_impl.h>
#include <linux/mm_compat.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work. To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory. The example below illustrates the following Big Rules:
 *
 * (1) A check must be made in each zfs thread for a mounted file system.
 *     This is done, avoiding races, using zfs_enter(zfsvfs).
 *     A zfs_exit(zfsvfs) is needed before all returns. Any znodes
 *     must be checked with zfs_verify_zp(zp). Both of these macros
 *     can cause the calling function to return EIO.
 *
 * (2) zrele() should always be the last thing except for zil_commit() (if
 *     necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
 *     last reference, the vnode/znode can be freed, so the zp may point to
 *     freed memory. Second, the last reference will call zfs_zinactive(),
 *     which may induce a lot of work -- pushing cached pages (which acquires
 *     range locks) and syncing out cached atime changes. Third,
 *     zfs_zinactive() may require a new tx, which could deadlock the system
 *     if you were already holding one. This deadlock occurs because the tx
 *     currently being operated on prevents a txg from syncing, which
 *     prevents the new tx from progressing, resulting in a deadlock. If you
 *     must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 *     is a synonym for zrele().
 *
 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
 *     as they can span dmu_tx_assign() calls.
 *
 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
 *     dmu_tx_assign(). This is critical because we don't want to block
 *     while holding locks.
 *
 *     If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
 *     This reduces lock contention and CPU usage when we must wait (note
 *     that if throughput is constrained by the storage, nearly every
 *     transaction must wait).
 *
 *     Note, in particular, that if a lock is sometimes acquired before
 *     the tx assigns, and sometimes after (e.g. z_lock), then failing
 *     to use a non-blocking assign can deadlock the system. The scenario:
 *
 *     Thread A has grabbed a lock before calling dmu_tx_assign().
 *     Thread B is in an already-assigned tx, and blocks for this lock.
 *     Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
 *     txg_wait_open() forever, because the previous txg can't quiesce
 *     until B's tx commits.
 *
 *     If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
 *     DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
 *     again. On subsequent calls to dmu_tx_assign(), pass
 *     DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
 *     this operation has already called dmu_tx_wait(). This will ensure
 *     that we don't retry forever, waiting a short bit each time.
 *
 * (5) If the operation succeeded, generate the intent log entry for it
 *     before dropping locks. This ensures that the ordering of events
 *     in the intent log matches the order in which they actually occurred.
 *     During ZIL replay the zfs_log_* functions will update the sequence
 *     number to indicate the zil transaction has replayed.
 *
 * (6) At the end of each vnode op, the DMU tx must always commit,
 *     regardless of whether there were any errors.
 *
 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
 *     to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	zfs_enter(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx,
 *	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		zrele(...);		// release held znodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		zfs_exit(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	zrele(...);			// release held znodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	zfs_exit(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */
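
/*
 * A minimal illustrative sketch (not copied verbatim from any one vnode op)
 * of the simpler ordering permitted by Big Rule (4) when no ZPL locks are
 * held beyond zfs_enter(): a blocking assign with DMU_TX_WAIT needs no
 * retry loop, since nothing is held that could stall the txg train.
 *
 *	tx = dmu_tx_create(...);
 *	dmu_tx_hold_*();			// hold objects to modify
 *	error = dmu_tx_assign(tx, DMU_TX_WAIT);	// may sleep in txg wait
 *	if (error) {
 *		dmu_tx_abort(tx);
 *		zfs_exit(zfsvfs, FTAG);
 *		return (error);
 *	}
 *	error = do_real_work();
 *	dmu_tx_commit(tx);			// commit, error or not
 *
 * zfs_setattr_dir() below uses this blocking-assign pattern.
 */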
int
zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
{
	(void) cr;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Honor ZFS_APPENDONLY file attribute */
	if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & O_APPEND) == 0)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Keep a count of the synchronous opens in the znode. On the first
	 * synchronous open we must convert all previous async transactions
	 * into sync to keep correct ordering.
	 */
	if (flag & O_SYNC) {
		if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
			zil_async_to_sync(zfsvfs->z_log, zp->z_id);
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

int
zfs_close(struct inode *ip, int flag, cred_t *cr)
{
	(void) cr;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Decrement the synchronous opens in the znode */
	if (flag & O_SYNC)
		atomic_dec_32(&zp->z_sync_cnt);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

#if defined(_KERNEL)

static int zfs_fillpage(struct inode *ip, struct page *pp);

/*
 * When a file is memory mapped, we must keep the I/O data synchronized
 * between the DMU cache and the memory mapped pages. Update all mapped
 * pages with the contents of the corresponding dmu buffer.
 */
void
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	int64_t off = start & (PAGE_SIZE - 1);

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t nbytes = MIN(PAGE_SIZE - off, len);

		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {
			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			void *pb = kmap(pp);
			int error = dmu_read(os, zp->z_id, start + off,
			    nbytes, pb + off, DMU_READ_PREFETCH);
			kunmap(pp);

			if (error) {
				SetPageError(pp);
				ClearPageUptodate(pp);
			} else {
				ClearPageError(pp);
				SetPageUptodate(pp);

				if (mapping_writably_mapped(mp))
					flush_dcache_page(pp);

				mark_page_accessed(pp);
			}

			unlock_page(pp);
			put_page(pp);
		}

		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the I/O data synchronized
 * between the DMU cache and the memory mapped pages. Preferentially read
 * from memory mapped pages, otherwise fall back to reading through the dmu.
 */
int
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
{
	struct inode *ip = ZTOI(zp);
	struct address_space *mp = ip->i_mapping;
	int64_t start = uio->uio_loffset;
	int64_t off = start & (PAGE_SIZE - 1);
	int len = nbytes;
	int error = 0;

	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
		uint64_t bytes = MIN(PAGE_SIZE - off, len);

		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
		if (pp) {

			/*
			 * If filemap_fault() retries there exists a window
			 * where the page will be unlocked and not up to date.
			 * In this case we must try and fill the page.
			 */
			if (unlikely(!PageUptodate(pp))) {
				error = zfs_fillpage(ip, pp);
				if (error) {
					unlock_page(pp);
					put_page(pp);
					return (error);
				}
			}

			ASSERT(PageUptodate(pp) || PageDirty(pp));

			unlock_page(pp);

			void *pb = kmap(pp);
			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
			kunmap(pp);

			if (mapping_writably_mapped(mp))
				flush_dcache_page(pp);

			mark_page_accessed(pp);
			put_page(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes, DMU_READ_PREFETCH);
		}

		len -= bytes;
		off = 0;

		if (error)
			break;
	}

	return (error);
}
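
/*
 * Descriptive note: mappedread() only pays off when the znode may have
 * cached pages. The common read path (zfs_read() in the platform-independent
 * code) is expected to check zn_has_cached_data() and otherwise read
 * directly through the dmu. A sketch, with illustrative argument names:
 *
 *	if (zn_has_cached_data(zp, off, off + nbytes - 1))
 *		error = mappedread(zp, nbytes, uio);
 *	else
 *		error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 *		    uio, nbytes, DMU_READ_PREFETCH);
 */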
#endif /* _KERNEL */

static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;

/*
 * Write the bytes to a file.
 *
 * IN:	zp	- znode of file to be written to
 *	data	- bytes to write
 *	len	- number of bytes to write
 *	pos	- offset to start writing at
 *
 * OUT:	residp	- remaining (unwritten) bytes, if provided
 *
 * RETURN:	0 if success
 *		positive error code if failure. EIO is returned
 *		for a short write when residp isn't provided.
 *
 * Timestamps:
 *	zp - ctime|mtime updated if byte count > 0
 */
int
zfs_write_simple(znode_t *zp, const void *data, size_t len,
    loff_t pos, size_t *residp)
{
	fstrans_cookie_t cookie;
	int error;

	struct iovec iov;
	iov.iov_base = (void *)data;
	iov.iov_len = len;

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);

	cookie = spl_fstrans_mark();
	error = zfs_write(zp, &uio, 0, kcred);
	spl_fstrans_unmark(cookie);

	if (error == 0) {
		if (residp != NULL)
			*residp = zfs_uio_resid(&uio);
		else if (zfs_uio_resid(&uio) != 0)
			error = SET_ERROR(EIO);
	}

	return (error);
}
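
/*
 * Illustrative (hypothetical) caller, writing a small in-kernel buffer at
 * offset 0 of an already-held znode:
 *
 *	size_t resid;
 *	error = zfs_write_simple(zp, buf, buflen, 0, &resid);
 *	if (error == 0 && resid != 0)
 *		// short write: only (buflen - resid) bytes were written
 *
 * Passing residp == NULL instead converts any short write into EIO, which
 * suits callers that need all-or-nothing semantics.
 */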

static void
zfs_rele_async_task(void *arg)
{
	iput(arg);
}

void
zfs_zrele_async(znode_t *zp)
{
	struct inode *ip = ZTOI(zp);
	objset_t *os = ITOZSB(ip)->z_os;

	ASSERT(atomic_read(&ip->i_count) > 0);
	ASSERT(os != NULL);

	/*
	 * If decrementing the count would put us at 0, we can't do it inline
	 * here, because that would be synchronous. Instead, dispatch an iput
	 * to run later.
	 *
	 * For more information on the dangers of a synchronous iput, see the
	 * header comment of this file.
	 */
	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
		    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
	}
}
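
/*
 * Usage sketch (see Big Rule (2) in the header comment): a vnode op whose
 * znode hold might be the last one defers the release, so the final iput's
 * zfs_zinactive() work (which may need a new tx) never runs from the
 * current context. zfs_remove() below does exactly this:
 *
 *	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 *	dmu_tx_commit(tx);
 *	...
 *	zfs_zrele_async(zp);	// last hold may trigger zfs_zinactive()
 *
 * If the reference being dropped is known not to be the last one, a plain
 * zrele() is fine.
 */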

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held inode reference for it.
 *
 * IN:	zdp	- znode of directory to search.
 *	nm	- name of entry to lookup.
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	cr	- credentials of caller.
 *	direntflags - directory lookup flags
 *	realpnp	- returned pathname.
 *
 * OUT:	zpp	- znode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
int
zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
    int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
	int error = 0;

	/*
	 * Fast path lookup. However, we must skip the DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed-in name. This means
	 * creating 'a' and removing 'A' on a case-insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through the fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*zpp = zdp;
				zhold(*zpp);
				return (0);
			}
			return (error);
		}
	}

	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
		return (error);

	*zpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * We don't allow recursive attributes...
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}

		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
		    B_TRUE, cr, zfs_init_idmap))) {
			zrele(*zpp);
			*zpp = NULL;
		}

		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */
	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
	if ((error == 0) && (*zpp))
		zfs_znode_update_vfs(*zpp);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
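
/*
 * Illustrative call shapes (hypothetical callers; error handling omitted):
 *
 *	// Plain component lookup under directory znode zdp:
 *	error = zfs_lookup(zdp, "file.txt", &zp, 0, cr, NULL, NULL);
 *
 *	// Open zdp's hidden extended attribute directory. The
 *	// LOOKUP_XATTR path above never dereferences nm, so no name
 *	// is required:
 *	error = zfs_lookup(zdp, NULL, &xzp, LOOKUP_XATTR, cr, NULL, NULL);
 */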

/*
 * Perform a linear search in a directory for the name of a specific inode.
 * Note we don't pass in the buffer size of name because it's hardcoded to
 * NAME_MAX+1 (256) in Linux.
 *
 * IN:	dzp	- znode of directory to search.
 *	zp	- znode of the target
 *
 * OUT:	name	- dentry name of the target
 *
 * RETURN:	0 on success, error code on failure.
 */
int
zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	int error = 0;

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);

	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/* ctldir entries should have gotten their names in zfs_vget */
	if (dzp->z_is_ctldir || zp->z_is_ctldir) {
		zfs_exit(zfsvfs, FTAG);
		return (ENOENT);
	}

	/* buffer len is hardcoded to 256 in the Linux kernel */
	error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
	    ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Attempt to create a new entry in a directory. If the entry
 * already exists, truncate the file if permissible, else return
 * an error. Return the ip of the created or trunc'd file.
 *
 * IN:	dzp	- znode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- file flag.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated if new entry created
 *	 zp - ctime|mtime always, atime if new
 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	objset_t *os;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	boolean_t have_acl = B_FALSE;
	boolean_t waited = B_FALSE;
	boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure the file system is at the proper version.
	 */
	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		error = dmu_tx_assign(tx,
		    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	boolean_t fuid_dirtied;
	boolean_t have_acl = B_FALSE;
	boolean_t waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure the file system is at the proper version.
	 */
	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

top:
	*ipp = NULL;

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dzp	- znode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime
 *	 ip - ctime (if nlink > 0)
 */

static uint64_t null_xattr = 0;

int
zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
{
	znode_t *zp;
	znode_t *xzp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	uint64_t acl_obj, xattr_obj;
	uint64_t xattr_obj_unlinked = 0;
	uint64_t obj = 0;
	uint64_t links;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	boolean_t may_delete_now, delete_now = FALSE;
	boolean_t unlinked, toobig = FALSE;
	uint64_t txtype;
	pathname_t *realnmp = NULL;
	pathname_t realnm;
	int error;
	int zflg = ZEXISTS;
	boolean_t waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp))) {
		if (realnmp)
			pn_free(realnmp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	mutex_enter(&zp->z_lock);
	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
	    !zn_has_cached_data(zp, 0, LLONG_MAX);
	mutex_exit(&zp->z_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the inode. So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			if (xzp)
				zrele(xzp);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		zrele(zp);
		if (xzp)
			zrele(xzp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed. Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
		    xattr_obj == xattr_obj_unlinked &&
		    zfs_external_acl(zp) == acl_obj;
		VERIFY_IMPLY(xattr_obj_unlinked, xzp);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = B_TRUE;
			clear_nlink(ZTOI(xzp));
			links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &links, sizeof (links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		/*
		 * Add to the unlinked set because a new reference could be
		 * taken concurrently resulting in a deferred destruction.
		 */
		zfs_unlinked_add(zp, tx);
		mutex_exit(&zp->z_lock);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);
	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);

	if (delete_now)
		zrele(zp);
	else
		zfs_zrele_async(zp);

	if (xzp) {
		zfs_znode_update_vfs(xzp);
		zfs_zrele_async(xzp);
	}

	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Create a new directory and insert it into dzp using the name
 * provided. Return a pointer to the inserted directory.
 *
 * IN:	dzp	- znode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	flags	- case flags.
 *	vsecp	- ACL to be set
 *	mnt_ns	- user namespace of the mount
 *
 * OUT:	zpp	- znode of created directory.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 *	zpp - ctime|mtime|atime updated
 */
int
zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
    cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	zfs_dirlock_t *dl;
	uint64_t txtype;
	dmu_tx_t *tx;
	int error;
	int zf = ZNEW;
	uid_t uid;
	gid_t gid = crgetgid(cr);
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	boolean_t waited = B_FALSE;

	ASSERT(S_ISDIR(vap->va_mode));

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure the file system is at the proper version.
	 */
	uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (dirname == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (dzp->z_pflags & ZFS_XATTR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids, mnt_ns)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*zpp = NULL;

	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
	    mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	/*
	 * Now put new name in parent dir.
	 */
	error = zfs_link_create(dl, zp, tx, ZNEW);
	if (error != 0) {
		zfs_znode_delete(zp, tx);
		remove_inode_hash(ZTOI(zp));
		goto out;
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	*zpp = zp;

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

out:
	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error != 0) {
		zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);

		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			error = zil_commit(zilog, 0);
	}
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Remove a directory subdir entry. If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dzp	- znode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- inode of current working directory.
 *	cr	- credentials of caller.
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
int
zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
    int flags)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	int zflg = ZEXISTS;
	boolean_t waited = B_FALSE;

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
		goto out;
	}

	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	if (zp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			zrele(zp);
			goto top;
		}
		dmu_tx_abort(tx);
		zrele(zp);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
		    B_FALSE);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	zfs_znode_update_vfs(dzp);
	zfs_znode_update_vfs(zp);
	zrele(zp);

	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Read directory entries from the given directory cursor position and emit
 * name and position for each entry.
 *
 * IN:	ip	- inode of directory to read.
 *	ctx	- directory entry context.
 *	cr	- credentials of caller.
 *
 * RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
int
zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
{
	(void) cr;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os;
	zap_cursor_t zc;
	zap_attribute_t *zap;
	int error;
	uint8_t prefetch;
	uint8_t type;
	int done = 0;
	uint64_t parent;
	uint64_t offset; /* must be unsigned; checks for < 1 */

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0)
		goto out;

	/*
	 * Quit if directory has been removed (posix)
	 */
	if (zp->z_unlinked)
		goto out;

	error = 0;
	os = zfsvfs->z_os;
	offset = ctx->pos;
	prefetch = zp->z_zn_prefetch;
	zap = zap_attribute_long_alloc();

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Transform to file-system independent format
	 */
	while (!done) {
		uint64_t objnum;
		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap->za_name, ".");
			zap->za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap->za_name, "..");
			zap->za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
			zap->za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if ((error = zap_cursor_retrieve(&zc, zap))) {
				if (error == ENOENT)
					break;
				else
					goto update;
			}

			/*
			 * Allow multiple entries provided the first entry is
			 * the object id. Non-zpl consumers may safely make
			 * use of the additional space.
			 *
			 * XXX: This should be a feature flag for compatibility
			 */
			if (zap->za_integer_length != 8 ||
			    zap->za_num_integers == 0) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld, "
				    "length = %d, num = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset,
				    zap->za_integer_length,
				    (u_longlong_t)zap->za_num_integers);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
			type = ZFS_DIRENT_TYPE(zap->za_first_integer);
		}

		done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
		    objnum, type);
		if (done)
			break;

		if (prefetch)
			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		ctx->pos = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

update:
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (error == ENOENT)
		error = 0;
out:
	zfs_exit(zfsvfs, FTAG);

	return (error);
}
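
/*
 * Descriptive sketch of the cookie space used by zfs_readdir() above.
 * Because the low 4 bits of a real ZAP cookie are always zero, the low
 * offsets can be reserved for synthetic entries:
 *
 *	ctx->pos == 0	emit "."   (the directory itself)
 *	ctx->pos == 1	emit ".."  (the parent)
 *	ctx->pos == 2	emit ".zfs" when this is the root and it is visible
 *	ctx->pos <= 3	zap_cursor_init() starts the ZAP walk from the start
 *	ctx->pos >  3	zap_cursor_init_serialized() resumes from the cookie
 *
 * Every cookie handed back via ctx->pos is either one of the specials or a
 * value from zap_cursor_serialize(), so a later readdir() call restarts
 * exactly where the previous one stopped.
 */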

/*
 * Get the basic file attributes and place them in the provided kstat
 * structure. The inode is assumed to be the authoritative source
 * for most of the attributes. However, the znode currently has the
 * authoritative atime, blksize, and block count.
 *
 * IN:	ip	- inode of file.
 *
 * OUT:	sp	- kstat values.
 *
 * RETURN:	0 (always succeeds)
 */
int
#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
    struct kstat *sp)
#else
zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
#endif
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint32_t blksize;
	u_longlong_t nblocks;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	mutex_enter(&zp->z_lock);

#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
	zpl_generic_fillattr(user_ns, request_mask, ip, sp);
#else
	zpl_generic_fillattr(user_ns, ip, sp);
#endif
	/*
	 * +1 link count for root inode with visible '.zfs' directory.
	 */
	if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
		if (sp->nlink < ZFS_LINK_MAX)
			sp->nlink++;

	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	sp->blksize = blksize;
	sp->blocks = nblocks;

	if (unlikely(zp->z_blksz == 0)) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		sp->blksize = zfsvfs->z_max_blksz;
	}

	mutex_exit(&zp->z_lock);

	/*
	 * Required to prevent NFS client from detecting different inode
	 * numbers of snapshot root dentry before and after snapshot mount.
	 */
	if (zfsvfs->z_issnap) {
		if (ip->i_sb->s_root->d_inode == ip)
			sp->ino = ZFSCTL_INO_SNAPDIRS -
			    dmu_objset_id(zfsvfs->z_os);
	}

	zfs_exit(zfsvfs, FTAG);

	return (0);
}

/*
 * For the operation of changing a file's user/group/project, we need to
 * handle not only the main object that is assigned to the file directly,
 * but also the ones that are used by the file via the hidden xattr
 * directory.
 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them in the same transaction that changes the
 * main object's user/group/project attributes. So we change them one by one
 * via separate, independent transactions. This may not be a good solution,
 * but we have no better idea yet.
 */
static int
zfs_setattr_dir(znode_t *dzp)
{
	struct inode *dxip = ZTOI(dzp);
	struct inode *xip = NULL;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	objset_t *os = zfsvfs->z_os;
	zap_cursor_t zc;
	zap_attribute_t *zap;
	zfs_dirlock_t *dl;
	znode_t *zp = NULL;
	dmu_tx_t *tx = NULL;
	uint64_t uid, gid;
	sa_bulk_attr_t bulk[4];
	int count;
	int err;

	zap = zap_attribute_alloc();
	zap_cursor_init(&zc, os, dzp->z_id);
	while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
		count = 0;
		if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
			err = ENXIO;
			break;
		}

		err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
		    ZEXISTS, NULL, NULL);
		if (err == ENOENT)
			goto next;
		if (err)
			break;

		xip = ZTOI(zp);
		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
		    zp->z_projid == dzp->z_projid)
			goto next;

		tx = dmu_tx_create(os);
		if (!(zp->z_pflags & ZFS_PROJID))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);

		err = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (err)
			break;

		mutex_enter(&dzp->z_lock);

		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
			xip->i_uid = dxip->i_uid;
			uid = zfs_uid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &uid, sizeof (uid));
		}

		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
			xip->i_gid = dxip->i_gid;
			gid = zfs_gid_read(dxip);
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
			    &gid, sizeof (gid));
		}

		uint64_t projid = dzp->z_projid;
		if (zp->z_projid != projid) {
			if (!(zp->z_pflags & ZFS_PROJID)) {
				err = sa_add_projid(zp->z_sa_hdl, tx, projid);
				if (unlikely(err == EEXIST)) {
					err = 0;
				} else if (err != 0) {
					goto sa_add_projid_err;
				} else {
					projid = ZFS_INVALID_PROJID;
				}
			}

			if (projid != ZFS_INVALID_PROJID) {
				zp->z_projid = projid;
				SA_ADD_BULK_ATTR(bulk, count,
				    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
				    sizeof (zp->z_projid));
			}
		}

sa_add_projid_err:
		mutex_exit(&dzp->z_lock);

		if (likely(count > 0)) {
			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
			dmu_tx_commit(tx);
		} else if (projid == ZFS_INVALID_PROJID) {
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
		tx = NULL;
		if (err != 0 && err != ENOENT)
			break;

next:
		if (zp) {
			zrele(zp);
			zp = NULL;
			zfs_dirent_unlock(dl);
		}
		zap_cursor_advance(&zc);
	}

	if (tx)
		dmu_tx_abort(tx);
	if (zp) {
		zrele(zp);
		zfs_dirent_unlock(dl);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);

	return (err == ENOENT ? 0 : err);
}
1902
1903 /*
1904 * Set the file attributes to the values contained in the
1905 * vattr structure.
1906 *
1907 * IN: zp - znode of file to be modified.
1908 * vap - new attribute values.
1909 * If ATTR_XVATTR set, then optional attrs are being set
1910 * flags - ATTR_UTIME set if non-default time values provided.
1911 * - ATTR_NOACLCHECK (CIFS context only).
1912 * cr - credentials of caller.
1913 * mnt_ns - user namespace of the mount
1914 *
1915 * RETURN: 0 if success
1916 * error code if failure
1917 *
1918 * Timestamps:
1919 * ip - ctime updated, mtime updated if size changed.
1920 */
1921 int
zfs_setattr(znode_t * zp,vattr_t * vap,int flags,cred_t * cr,zidmap_t * mnt_ns)1922 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1923 {
1924 struct inode *ip;
1925 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1926 objset_t *os;
1927 zilog_t *zilog;
1928 dmu_tx_t *tx;
1929 vattr_t oldva;
1930 xvattr_t *tmpxvattr;
1931 uint_t mask = vap->va_mask;
1932 uint_t saved_mask = 0;
1933 int trim_mask = 0;
1934 uint64_t new_mode;
1935 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1936 uint64_t xattr_obj;
1937 uint64_t mtime[2], ctime[2], atime[2];
1938 uint64_t projid = ZFS_INVALID_PROJID;
1939 znode_t *attrzp;
1940 int need_policy = FALSE;
1941 int err, err2 = 0;
1942 zfs_fuid_info_t *fuidp = NULL;
1943 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
1944 xoptattr_t *xoap;
1945 zfs_acl_t *aclp;
1946 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1947 boolean_t fuid_dirtied = B_FALSE;
1948 boolean_t handle_eadir = B_FALSE;
1949 sa_bulk_attr_t *bulk, *xattr_bulk;
1950 int count = 0, xattr_count = 0, bulks = 8;
1951
1952 if (mask == 0)
1953 return (0);
1954
1955 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1956 return (err);
1957 ip = ZTOI(zp);
1958 os = zfsvfs->z_os;
1959
1960 /*
1961 * If this is a xvattr_t, then get a pointer to the structure of
1962 * optional attributes. If this is NULL, then we have a vattr_t.
1963 */
1964 xoap = xva_getxoptattr(xvap);
1965 if (xoap != NULL && (mask & ATTR_XVATTR)) {
1966 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1967 if (!dmu_objset_projectquota_enabled(os) ||
1968 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1969 zfs_exit(zfsvfs, FTAG);
1970 return (SET_ERROR(ENOTSUP));
1971 }
1972
1973 projid = xoap->xoa_projid;
1974 if (unlikely(projid == ZFS_INVALID_PROJID)) {
1975 zfs_exit(zfsvfs, FTAG);
1976 return (SET_ERROR(EINVAL));
1977 }
1978
1979 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1980 projid = ZFS_INVALID_PROJID;
1981 else
1982 need_policy = TRUE;
1983 }
1984
1985 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1986 (xoap->xoa_projinherit !=
1987 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1988 (!dmu_objset_projectquota_enabled(os) ||
1989 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1990 zfs_exit(zfsvfs, FTAG);
1991 return (SET_ERROR(ENOTSUP));
1992 }
1993 }
1994
1995 zilog = zfsvfs->z_log;
1996
1997 /*
1998 * Make sure that if we have ephemeral uid/gid or xvattr specified
1999 * that file system is at proper version level
2000 */
2001
2002 if (zfsvfs->z_use_fuids == B_FALSE &&
2003 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2004 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2005 (mask & ATTR_XVATTR))) {
2006 zfs_exit(zfsvfs, FTAG);
2007 return (SET_ERROR(EINVAL));
2008 }
2009
2010 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2011 zfs_exit(zfsvfs, FTAG);
2012 return (SET_ERROR(EISDIR));
2013 }
2014
2015 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2016 zfs_exit(zfsvfs, FTAG);
2017 return (SET_ERROR(EINVAL));
2018 }
2019
2020 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2021 xva_init(tmpxvattr);
2022
2023 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2024 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2025
2026 /*
2027 * For immutable files, only the immutable bit and atime may be altered
2028 */
2029 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2030 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2031 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2032 err = SET_ERROR(EPERM);
2033 goto out3;
2034 }
2035
2036 /* ZFS_READONLY will be handled in zfs_zaccess() */
2037
2038 /*
2039 * Verify timestamps don't overflow 32 bits.
2040 * ZFS can handle large timestamps, but 32bit syscalls can't
2041 * handle times greater than 2039. This check should be removed
2042 * once large timestamps are fully supported.
2043 */
2044 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2045 if (((mask & ATTR_ATIME) &&
2046 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2047 ((mask & ATTR_MTIME) &&
2048 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2049 err = SET_ERROR(EOVERFLOW);
2050 goto out3;
2051 }
2052 }
2053
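	/*
	 * Restart point for when dmu_tx_assign() below returns ERESTART;
	 * see the error path following the transaction assignment.
	 */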
2054 top:
2055 attrzp = NULL;
2056 aclp = NULL;
2057
2058 /* Can this be moved to before the top label? */
2059 if (zfs_is_readonly(zfsvfs)) {
2060 err = SET_ERROR(EROFS);
2061 goto out3;
2062 }
2063
2064 /*
2065 * First validate permissions
2066 */
2067
2068 if (mask & ATTR_SIZE) {
2069 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2070 mnt_ns);
2071 if (err)
2072 goto out3;
2073
2074 /*
2075 * XXX - Note, we are not providing any open
2076 * mode flags here (like FNDELAY), so we may
2077 * block if there are locks present... this
2078 * should be addressed in openat().
2079 */
2080 /* XXX - would it be OK to generate a log record here? */
2081 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2082 if (err)
2083 goto out3;
2084 }
2085
2086 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2087 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2088 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2089 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2090 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2091 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2092 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2093 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2094 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2095 skipaclchk, cr, mnt_ns);
2096 }
2097
2098 if (mask & (ATTR_UID|ATTR_GID)) {
2099 int idmask = (mask & (ATTR_UID|ATTR_GID));
2100 int take_owner;
2101 int take_group;
2102 uid_t uid;
2103 gid_t gid;
2104
2105 /*
2106 * NOTE: even if a new mode is being set,
2107 * we may clear S_ISUID/S_ISGID bits.
2108 */
2109
2110 if (!(mask & ATTR_MODE))
2111 vap->va_mode = zp->z_mode;
2112
2113 /*
2114 * Take ownership or chgrp to group we are a member of
2115 */
2116
2117 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2118 vap->va_uid);
2119 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2120 vap->va_gid);
2121 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2122 take_group = (mask & ATTR_GID) &&
2123 zfs_groupmember(zfsvfs, gid, cr);
2124
2125 /*
2126 * If both ATTR_UID and ATTR_GID are set then take_owner and
2127 * take_group must both be set in order to allow taking
2128 * ownership.
2129 *
2130 * Otherwise, send the check through secpolicy_vnode_setattr()
2131 *
2132 */
2133
2134 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2135 take_owner && take_group) ||
2136 ((idmask == ATTR_UID) && take_owner) ||
2137 ((idmask == ATTR_GID) && take_group)) {
2138 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2139 skipaclchk, cr, mnt_ns) == 0) {
2140 /*
2141 * Remove setuid/setgid for non-privileged users
2142 */
2143 (void) secpolicy_setid_clear(vap, cr);
2144 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2145 } else {
2146 need_policy = TRUE;
2147 }
2148 } else {
2149 need_policy = TRUE;
2150 }
2151 }
2152
2153 mutex_enter(&zp->z_lock);
2154 oldva.va_mode = zp->z_mode;
2155 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2156 if (mask & ATTR_XVATTR) {
2157 /*
2158 * Update xvattr mask to include only those attributes
2159 * that are actually changing.
2160 *
2161 * The bits will be restored prior to actually setting
2162 * the attributes, so the caller thinks they were set.
2163 */
2164 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2165 if (xoap->xoa_appendonly !=
2166 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2167 need_policy = TRUE;
2168 } else {
2169 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2170 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2171 }
2172 }
2173
2174 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2175 if (xoap->xoa_projinherit !=
2176 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2177 need_policy = TRUE;
2178 } else {
2179 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2180 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2181 }
2182 }
2183
2184 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2185 if (xoap->xoa_nounlink !=
2186 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2187 need_policy = TRUE;
2188 } else {
2189 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2190 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2191 }
2192 }
2193
2194 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2195 if (xoap->xoa_immutable !=
2196 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2197 need_policy = TRUE;
2198 } else {
2199 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2200 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2201 }
2202 }
2203
2204 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2205 if (xoap->xoa_nodump !=
2206 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2207 need_policy = TRUE;
2208 } else {
2209 XVA_CLR_REQ(xvap, XAT_NODUMP);
2210 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2211 }
2212 }
2213
2214 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2215 if (xoap->xoa_av_modified !=
2216 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2217 need_policy = TRUE;
2218 } else {
2219 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2220 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2221 }
2222 }
2223
2224 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2225 if ((!S_ISREG(ip->i_mode) &&
2226 xoap->xoa_av_quarantined) ||
2227 xoap->xoa_av_quarantined !=
2228 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2229 need_policy = TRUE;
2230 } else {
2231 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2232 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2233 }
2234 }
2235
2236 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2237 mutex_exit(&zp->z_lock);
2238 err = SET_ERROR(EPERM);
2239 goto out3;
2240 }
2241
2242 if (need_policy == FALSE &&
2243 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2244 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2245 need_policy = TRUE;
2246 }
2247 }
2248
2249 mutex_exit(&zp->z_lock);
2250
2251 if (mask & ATTR_MODE) {
2252 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2253 mnt_ns) == 0) {
2254 err = secpolicy_setid_setsticky_clear(ip, vap,
2255 &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2256 if (err)
2257 goto out3;
2258 trim_mask |= ATTR_MODE;
2259 } else {
2260 need_policy = TRUE;
2261 }
2262 }
2263
2264 if (need_policy) {
2265 /*
2266 * If trim_mask is set then take ownership
2267 * has been granted or write_acl is present and the user
2268 * has the ability to modify the mode. In that case remove
2269 * UID|GID and/or MODE from the mask so that
2270 * secpolicy_vnode_setattr() doesn't revoke it.
2271 */
2272
2273 if (trim_mask) {
2274 saved_mask = vap->va_mask;
2275 vap->va_mask &= ~trim_mask;
2276 }
2277 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2278 zfs_zaccess_unix, zp);
2279 if (err)
2280 goto out3;
2281
2282 if (trim_mask)
2283 vap->va_mask |= saved_mask;
2284 }
2285
2286 /*
2287 * secpolicy_vnode_setattr(), or the take-ownership path, may have
2288 * changed va_mask
2289 */
2290 mask = vap->va_mask;
2291
2292 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2293 handle_eadir = B_TRUE;
2294 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2295 &xattr_obj, sizeof (xattr_obj));
2296
2297 if (err == 0 && xattr_obj) {
2298 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2299 if (err)
2300 goto out2;
2301 }
2302 if (mask & ATTR_UID) {
2303 new_kuid = zfs_fuid_create(zfsvfs,
2304 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2305 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2306 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2307 new_kuid)) {
2308 if (attrzp)
2309 zrele(attrzp);
2310 err = SET_ERROR(EDQUOT);
2311 goto out2;
2312 }
2313 }
2314
2315 if (mask & ATTR_GID) {
2316 new_kgid = zfs_fuid_create(zfsvfs,
2317 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2318 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2319 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2320 new_kgid)) {
2321 if (attrzp)
2322 zrele(attrzp);
2323 err = SET_ERROR(EDQUOT);
2324 goto out2;
2325 }
2326 }
2327
2328 if (projid != ZFS_INVALID_PROJID &&
2329 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2330 if (attrzp)
2331 zrele(attrzp);
2332 err = SET_ERROR(EDQUOT);
2333 goto out2;
2334 }
2335 }
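	/*
	 * All permission and quota checks have passed; build a transaction
	 * holding every object this setattr may modify.
	 */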
2336 tx = dmu_tx_create(os);
2337
2338 if (mask & ATTR_MODE) {
2339 uint64_t pmode = zp->z_mode;
2340 uint64_t acl_obj;
2341 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2342
2343 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2344 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2345 err = SET_ERROR(EPERM);
2346 goto out;
2347 }
2348
2349 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2350 goto out;
2351
2352 mutex_enter(&zp->z_lock);
2353 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2354 /*
2355 * Are we upgrading ACL from old V0 format
2356 * to V1 format?
2357 */
2358 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2359 zfs_znode_acl_version(zp) ==
2360 ZFS_ACL_VERSION_INITIAL) {
2361 dmu_tx_hold_free(tx, acl_obj, 0,
2362 DMU_OBJECT_END);
2363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2364 0, aclp->z_acl_bytes);
2365 } else {
2366 dmu_tx_hold_write(tx, acl_obj, 0,
2367 aclp->z_acl_bytes);
2368 }
2369 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2371 0, aclp->z_acl_bytes);
2372 }
2373 mutex_exit(&zp->z_lock);
2374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2375 } else {
2376 if (((mask & ATTR_XVATTR) &&
2377 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2378 (projid != ZFS_INVALID_PROJID &&
2379 !(zp->z_pflags & ZFS_PROJID)))
2380 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2381 else
2382 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2383 }
2384
2385 if (attrzp) {
2386 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2387 }
2388
2389 fuid_dirtied = zfsvfs->z_fuid_dirty;
2390 if (fuid_dirtied)
2391 zfs_fuid_txhold(zfsvfs, tx);
2392
2393 zfs_sa_upgrade_txholds(tx, zp);
2394
2395 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2396 if (err)
2397 goto out;
2398
2399 count = 0;
2400 /*
2401 * Set each attribute requested.
2402 * We group settings according to the locks they need to acquire.
2403 *
2404 * Note: you cannot set ctime directly, although it will be
2405 * updated as a side-effect of calling this function.
2406 */
2407
2408 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2409 /*
2410 * For an existing object upgraded from an old system, the
2411 * on-disk layout has no slot for the project ID attribute. But
2412 * the quota accounting logic needs to access related slots by
2413 * offset directly, so we must adjust such old objects' layout
2414 * to move the project ID to a unified, fixed offset.
2415 */
2416 if (attrzp)
2417 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2418 if (err == 0)
2419 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2420
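		/*
		 * EEXIST means the layout already carries a project ID
		 * slot, so we can simply set it via the bulk update below.
		 * On success, sa_add_projid() has already written the new
		 * project ID, so drop it from further processing.
		 */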
2421 if (unlikely(err == EEXIST))
2422 err = 0;
2423 else if (err != 0)
2424 goto out;
2425 else
2426 projid = ZFS_INVALID_PROJID;
2427 }
2428
2429 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2430 mutex_enter(&zp->z_acl_lock);
2431 mutex_enter(&zp->z_lock);
2432
2433 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2434 &zp->z_pflags, sizeof (zp->z_pflags));
2435
2436 if (attrzp) {
2437 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2438 mutex_enter(&attrzp->z_acl_lock);
2439 mutex_enter(&attrzp->z_lock);
2440 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2441 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2442 sizeof (attrzp->z_pflags));
2443 if (projid != ZFS_INVALID_PROJID) {
2444 attrzp->z_projid = projid;
2445 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2446 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2447 sizeof (attrzp->z_projid));
2448 }
2449 }
2450
2451 if (mask & (ATTR_UID|ATTR_GID)) {
2452
2453 if (mask & ATTR_UID) {
2454 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2455 new_uid = zfs_uid_read(ZTOI(zp));
2456 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2457 &new_uid, sizeof (new_uid));
2458 if (attrzp) {
2459 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2460 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2461 sizeof (new_uid));
2462 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2463 }
2464 }
2465
2466 if (mask & ATTR_GID) {
2467 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2468 new_gid = zfs_gid_read(ZTOI(zp));
2469 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2470 NULL, &new_gid, sizeof (new_gid));
2471 if (attrzp) {
2472 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2473 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2474 sizeof (new_gid));
2475 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2476 }
2477 }
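		/*
		 * The mode is rewritten even when it is not being changed.
		 * SA_ADD_BULK_ATTR() records only the address of new_mode;
		 * the value assigned below is what sa_bulk_update() will
		 * eventually write.
		 */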
2478 if (!(mask & ATTR_MODE)) {
2479 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2480 NULL, &new_mode, sizeof (new_mode));
2481 new_mode = zp->z_mode;
2482 }
2483 err = zfs_acl_chown_setattr(zp);
2484 ASSERT0(err);
2485 if (attrzp) {
2486 err = zfs_acl_chown_setattr(attrzp);
2487 ASSERT0(err);
2488 }
2489 }
2490
2491 if (mask & ATTR_MODE) {
2492 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2493 &new_mode, sizeof (new_mode));
2494 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2495 ASSERT3P(aclp, !=, NULL);
2496 err = zfs_aclset_common(zp, aclp, cr, tx);
2497 ASSERT0(err);
2498 if (zp->z_acl_cached)
2499 zfs_acl_free(zp->z_acl_cached);
2500 zp->z_acl_cached = aclp;
2501 aclp = NULL;
2502 }
2503
2504 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2505 zp->z_atime_dirty = B_FALSE;
2506 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
2507 ZFS_TIME_ENCODE(&tmp_atime, atime);
2508 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2509 &atime, sizeof (atime));
2510 }
2511
2512 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2513 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2514 zpl_inode_set_mtime_to_ts(ZTOI(zp),
2515 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
2516
2517 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2518 mtime, sizeof (mtime));
2519 }
2520
2521 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2522 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2523 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2524 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2525 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2526 ctime, sizeof (ctime));
2527 }
2528
2529 if (projid != ZFS_INVALID_PROJID) {
2530 zp->z_projid = projid;
2531 SA_ADD_BULK_ATTR(bulk, count,
2532 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2533 sizeof (zp->z_projid));
2534 }
2535
2536 if (attrzp && mask) {
2537 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2538 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2539 sizeof (ctime));
2540 }
2541
2542 /*
2543 * Do this after setting timestamps to prevent the timestamp
2544 * update from toggling the bit
2545 */
2546
2547 if (xoap && (mask & ATTR_XVATTR)) {
2548
2549 /*
2550 * Restore the trimmed-off masks
2551 * so that return masks can be set for the caller.
2552 */
2553
2554 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2555 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2556 }
2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2558 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2559 }
2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2561 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2562 }
2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2564 XVA_SET_REQ(xvap, XAT_NODUMP);
2565 }
2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2567 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2568 }
2569 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2570 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2571 }
2572 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2573 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2574 }
2575
2576 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2577 ASSERT(S_ISREG(ip->i_mode));
2578
2579 zfs_xvattr_set(zp, xvap, tx);
2580 }
2581
2582 if (fuid_dirtied)
2583 zfs_fuid_sync(zfsvfs, tx);
2584
2585 if (mask != 0)
2586 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2587
2588 mutex_exit(&zp->z_lock);
2589 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2590 mutex_exit(&zp->z_acl_lock);
2591
2592 if (attrzp) {
2593 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2594 mutex_exit(&attrzp->z_acl_lock);
2595 mutex_exit(&attrzp->z_lock);
2596 }
2597 out:
2598 if (err == 0 && xattr_count > 0) {
2599 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2600 xattr_count, tx);
2601 ASSERT0(err2);
2602 }
2603
2604 if (aclp)
2605 zfs_acl_free(aclp);
2606
2607 if (fuidp) {
2608 zfs_fuid_info_free(fuidp);
2609 fuidp = NULL;
2610 }
2611
2612 if (err) {
2613 dmu_tx_abort(tx);
2614 if (attrzp)
2615 zrele(attrzp);
2616 if (err == ERESTART)
2617 goto top;
2618 } else {
2619 if (count > 0)
2620 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2621 dmu_tx_commit(tx);
2622 if (attrzp) {
2623 if (err2 == 0 && handle_eadir)
2624 err = zfs_setattr_dir(attrzp);
2625 zrele(attrzp);
2626 }
2627 zfs_znode_update_vfs(zp);
2628 }
2629
2630 out2:
2631 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
2632 err = zil_commit(zilog, 0);
2633
2634 out3:
2635 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2636 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2637 kmem_free(tmpxvattr, sizeof (xvattr_t));
2638 zfs_exit(zfsvfs, FTAG);
2639 return (err);
2640 }
2641
2642 typedef struct zfs_zlock {
2643 krwlock_t *zl_rwlock; /* lock we acquired */
2644 znode_t *zl_znode; /* znode we held */
2645 struct zfs_zlock *zl_next; /* next in list */
2646 } zfs_zlock_t;
2647
2648 /*
2649 * Drop locks and release vnodes that were held by zfs_rename_lock().
2650 */
2651 static void
2652 zfs_rename_unlock(zfs_zlock_t **zlpp)
2653 {
2654 zfs_zlock_t *zl;
2655
2656 while ((zl = *zlpp) != NULL) {
2657 if (zl->zl_znode != NULL)
2658 zfs_zrele_async(zl->zl_znode);
2659 rw_exit(zl->zl_rwlock);
2660 *zlpp = zl->zl_next;
2661 kmem_free(zl, sizeof (*zl));
2662 }
2663 }
2664
2665 /*
2666 * Search back through the directory tree, using the ".." entries.
2667 * Lock each directory in the chain to prevent concurrent renames.
2668 * Fail any attempt to move a directory into one of its own descendants.
2669 * XXX - z_parent_lock can overlap with map or grow locks
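 *
 * For example, an attempt to rename /usr/a/b into /usr/a/b/c/d walks up
 * from c via the ".." entries and encounters szp (b) before reaching the
 * source directory, which is how the EINVAL case below fires.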
2670 */
2671 static int
2672 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2673 {
2674 zfs_zlock_t *zl;
2675 znode_t *zp = tdzp;
2676 uint64_t rootid = ZTOZSB(zp)->z_root;
2677 uint64_t oidp = zp->z_id;
2678 krwlock_t *rwlp = &szp->z_parent_lock;
2679 krw_t rw = RW_WRITER;
2680
2681 /*
2682 * First pass write-locks szp and compares to zp->z_id.
2683 * Later passes read-lock zp and compare to zp->z_parent.
2684 */
2685 do {
2686 if (!rw_tryenter(rwlp, rw)) {
2687 /*
2688 * Another thread is renaming in this path.
2689 * Note that if we are a WRITER, we don't have any
2690 * parent_locks held yet.
2691 */
2692 if (rw == RW_READER && zp->z_id > szp->z_id) {
2693 /*
2694 * Drop our locks and restart
2695 */
2696 zfs_rename_unlock(&zl);
2697 *zlpp = NULL;
2698 zp = tdzp;
2699 oidp = zp->z_id;
2700 rwlp = &szp->z_parent_lock;
2701 rw = RW_WRITER;
2702 continue;
2703 } else {
2704 /*
2705 * Wait for other thread to drop its locks
2706 */
2707 rw_enter(rwlp, rw);
2708 }
2709 }
2710
2711 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2712 zl->zl_rwlock = rwlp;
2713 zl->zl_znode = NULL;
2714 zl->zl_next = *zlpp;
2715 *zlpp = zl;
2716
2717 if (oidp == szp->z_id) /* We're a descendant of szp */
2718 return (SET_ERROR(EINVAL));
2719
2720 if (oidp == rootid) /* We've hit the top */
2721 return (0);
2722
2723 if (rw == RW_READER) { /* i.e. not the first pass */
2724 int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2725 if (error)
2726 return (error);
2727 zl->zl_znode = zp;
2728 }
2729 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2730 &oidp, sizeof (oidp));
2731 rwlp = &zp->z_parent_lock;
2732 rw = RW_READER;
2733
2734 } while (zp->z_id != sdzp->z_id);
2735
2736 return (0);
2737 }
2738
2739 /*
2740 * Move an entry from the provided source directory to the target
2741 * directory. Change the entry name as indicated.
2742 *
2743 * IN: sdzp - Source directory containing the "old entry".
2744 * snm - Old entry name.
2745 * tdzp - Target directory to contain the "new entry".
2746 * tnm - New entry name.
2747 * cr - credentials of caller.
2748 * flags - case flags
2749 * rflags - RENAME_* flags
2750 * wo_vap - attributes for RENAME_WHITEOUT (must be a char device 0:0).
2751 * mnt_ns - user namespace of the mount
2752 *
2753 * RETURN: 0 on success, error code on failure.
2754 *
2755 * Timestamps:
2756 * sdzp,tdzp - ctime|mtime updated
2757 */
2758 int
2759 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2760 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2761 {
2762 znode_t *szp, *tzp;
2763 zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
2764 zilog_t *zilog;
2765 zfs_dirlock_t *sdl, *tdl;
2766 dmu_tx_t *tx;
2767 zfs_zlock_t *zl;
2768 int cmp, serr, terr;
2769 int error = 0;
2770 int zflg = 0;
2771 boolean_t waited = B_FALSE;
2772 /* Needed for whiteout inode creation. */
2773 boolean_t fuid_dirtied;
2774 zfs_acl_ids_t acl_ids;
2775 boolean_t have_acl = B_FALSE;
2776 znode_t *wzp = NULL;
2777
2778
2779 if (snm == NULL || tnm == NULL)
2780 return (SET_ERROR(EINVAL));
2781
2782 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2783 return (SET_ERROR(EINVAL));
2784
2785 /* Already checked by Linux VFS, but just to make sure. */
2786 if (rflags & RENAME_EXCHANGE &&
2787 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2788 return (SET_ERROR(EINVAL));
2789
2790 /*
2791 * Make sure we get wo_vap iff RENAME_WHITEOUT is set, and that it's the
2792 * right kind of vattr_t for the whiteout file. These are set
2793 * internally by ZFS so should never be incorrect.
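 * (A whiteout is represented as a char device 0:0, matching the Linux
 * VFS whiteout convention.)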
2794 */
2795 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2796 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2797 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2798
2799 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2800 return (error);
2801 zilog = zfsvfs->z_log;
2802
2803 if ((error = zfs_verify_zp(tdzp)) != 0) {
2804 zfs_exit(zfsvfs, FTAG);
2805 return (error);
2806 }
2807
2808 /*
2809 * We check i_sb because snapshots and the ctldir must have different
2810 * super blocks.
2811 */
2812 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2813 zfsctl_is_node(ZTOI(tdzp))) {
2814 zfs_exit(zfsvfs, FTAG);
2815 return (SET_ERROR(EXDEV));
2816 }
2817
2818 if (zfsvfs->z_utf8 && u8_validate(tnm,
2819 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2820 zfs_exit(zfsvfs, FTAG);
2821 return (SET_ERROR(EILSEQ));
2822 }
2823
2824 if (flags & FIGNORECASE)
2825 zflg |= ZCILOOK;
2826
2827 top:
2828 szp = NULL;
2829 tzp = NULL;
2830 zl = NULL;
2831
2832 /*
2833 * This is to prevent the creation of links into attribute space
2834 * by renaming a linked file into/out of an attribute directory.
2835 * See the comment in zfs_link() for why this is considered bad.
2836 */
2837 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2838 zfs_exit(zfsvfs, FTAG);
2839 return (SET_ERROR(EINVAL));
2840 }
2841
2842 /*
2843 * Lock source and target directory entries. To prevent deadlock,
2844 * a lock ordering must be defined. We lock the directory with
2845 * the smallest object id first, or if it's a tie, the one with
2846 * the lexically first name.
2847 */
2848 if (sdzp->z_id < tdzp->z_id) {
2849 cmp = -1;
2850 } else if (sdzp->z_id > tdzp->z_id) {
2851 cmp = 1;
2852 } else {
2853 /*
2854 * First compare the two name arguments without
2855 * considering any case folding.
2856 */
2857 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2858
2859 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2860 ASSERT(error == 0 || !zfsvfs->z_utf8);
2861 if (cmp == 0) {
2862 /*
2863 * POSIX: "If the old argument and the new argument
2864 * both refer to links to the same existing file,
2865 * the rename() function shall return successfully
2866 * and perform no other action."
2867 */
2868 zfs_exit(zfsvfs, FTAG);
2869 return (0);
2870 }
2871 /*
2872 * If the file system is case-folding, then we may
2873 * have some more checking to do. A case-folding file
2874 * system is either supporting mixed case sensitivity
2875 * access or is completely case-insensitive. Note
2876 * that the file system is always case preserving.
2877 *
2878 * In mixed sensitivity mode case sensitive behavior
2879 * is the default. FIGNORECASE must be used to
2880 * explicitly request case insensitive behavior.
2881 *
2882 * If the source and target names provided differ only
2883 * by case (e.g., a request to rename 'tim' to 'Tim'),
2884 * we will treat this as a special case in the
2885 * case-insensitive mode: as long as the source name
2886 * is an exact match, we will allow this to proceed as
2887 * a name-change request.
2888 */
2889 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2890 (zfsvfs->z_case == ZFS_CASE_MIXED &&
2891 flags & FIGNORECASE)) &&
2892 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2893 &error) == 0) {
2894 /*
2895 * case preserving rename request, require exact
2896 * name matches
2897 */
2898 zflg |= ZCIEXACT;
2899 zflg &= ~ZCILOOK;
2900 }
2901 }
2902
2903 /*
2904 * If the source and destination directories are the same, we should
2905 * grab the z_name_lock of that directory only once.
2906 */
2907 if (sdzp == tdzp) {
2908 zflg |= ZHAVELOCK;
2909 rw_enter(&sdzp->z_name_lock, RW_READER);
2910 }
2911
2912 if (cmp < 0) {
2913 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2914 ZEXISTS | zflg, NULL, NULL);
2915 terr = zfs_dirent_lock(&tdl,
2916 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2917 } else {
2918 terr = zfs_dirent_lock(&tdl,
2919 tdzp, tnm, &tzp, zflg, NULL, NULL);
2920 serr = zfs_dirent_lock(&sdl,
2921 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2922 NULL, NULL);
2923 }
2924
2925 if (serr) {
2926 /*
2927 * Source entry invalid or not there.
2928 */
2929 if (!terr) {
2930 zfs_dirent_unlock(tdl);
2931 if (tzp)
2932 zrele(tzp);
2933 }
2934
2935 if (sdzp == tdzp)
2936 rw_exit(&sdzp->z_name_lock);
2937
2938 if (strcmp(snm, "..") == 0)
2939 serr = EINVAL;
2940 zfs_exit(zfsvfs, FTAG);
2941 return (serr);
2942 }
2943 if (terr) {
2944 zfs_dirent_unlock(sdl);
2945 zrele(szp);
2946
2947 if (sdzp == tdzp)
2948 rw_exit(&sdzp->z_name_lock);
2949
2950 if (strcmp(tnm, "..") == 0)
2951 terr = EINVAL;
2952 zfs_exit(zfsvfs, FTAG);
2953 return (terr);
2954 }
2955
2956 /*
2957 * If we are using project inheritance, meaning the directory has
2958 * ZFS_PROJINHERIT set, then its descendant directories will inherit
2959 * not only the project ID, but also the ZFS_PROJINHERIT flag. In
2960 * that case, we only allow renames into our tree when the project
2961 * IDs are the same.
2962 */
2963 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2964 tdzp->z_projid != szp->z_projid) {
2965 error = SET_ERROR(EXDEV);
2966 goto out;
2967 }
2968
2969 /*
2970 * Must have write access at the source to remove the old entry
2971 * and write access at the target to create the new entry.
2972 * Note that if target and source are the same, this can be
2973 * done in a single check.
2974 */
2975 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2976 goto out;
2977
2978 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2979 /*
2980 * Check to make sure rename is valid.
2981 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2982 */
2983 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2984 goto out;
2985 }
2986
2987 /*
2988 * Does target exist?
2989 */
2990 if (tzp) {
2991 if (rflags & RENAME_NOREPLACE) {
2992 error = SET_ERROR(EEXIST);
2993 goto out;
2994 }
2995 /*
2996 * Source and target must be the same type (unless exchanging).
2997 */
2998 if (!(rflags & RENAME_EXCHANGE)) {
2999 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
3000 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
3001
3002 if (s_is_dir != t_is_dir) {
3003 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
3004 goto out;
3005 }
3006 }
3007 /*
3008 * POSIX dictates that when the source and target
3009 * entries refer to the same file object, rename
3010 * must do nothing and exit without error.
3011 */
3012 if (szp->z_id == tzp->z_id) {
3013 error = 0;
3014 goto out;
3015 }
3016 } else if (rflags & RENAME_EXCHANGE) {
3017 /* Target must exist for RENAME_EXCHANGE. */
3018 error = SET_ERROR(ENOENT);
3019 goto out;
3020 }
3021
3022 /* Set up inode creation for RENAME_WHITEOUT. */
3023 if (rflags & RENAME_WHITEOUT) {
3024 /*
3025 * Whiteout files are not regular files or directories, so to
3026 * match zfs_create() we do not inherit the project id.
3027 */
3028 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
3029
3030 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
3031 if (error)
3032 goto out;
3033
3034 if (!have_acl) {
3035 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
3036 &acl_ids, mnt_ns);
3037 if (error)
3038 goto out;
3039 have_acl = B_TRUE;
3040 }
3041
3042 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
3043 error = SET_ERROR(EDQUOT);
3044 goto out;
3045 }
3046 }
3047
3048 tx = dmu_tx_create(zfsvfs->z_os);
3049 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3050 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3051 dmu_tx_hold_zap(tx, sdzp->z_id,
3052 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
3053 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3054 if (sdzp != tdzp) {
3055 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3056 zfs_sa_upgrade_txholds(tx, tdzp);
3057 }
3058 if (tzp) {
3059 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3060 zfs_sa_upgrade_txholds(tx, tzp);
3061 }
3062 if (rflags & RENAME_WHITEOUT) {
3063 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3064 ZFS_SA_BASE_ATTR_SIZE);
3065
3066 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
3067 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3068 if (!zfsvfs->z_use_sa &&
3069 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3070 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3071 0, acl_ids.z_aclp->z_acl_bytes);
3072 }
3073 }
3074 fuid_dirtied = zfsvfs->z_fuid_dirty;
3075 if (fuid_dirtied)
3076 zfs_fuid_txhold(zfsvfs, tx);
3077 zfs_sa_upgrade_txholds(tx, szp);
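	/*
	 * A replaced target that loses its last link is added to the
	 * unlinked set, so hold that ZAP as well.
	 */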
3078 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3079 error = dmu_tx_assign(tx,
3080 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3081 if (error) {
3082 if (zl != NULL)
3083 zfs_rename_unlock(&zl);
3084 zfs_dirent_unlock(sdl);
3085 zfs_dirent_unlock(tdl);
3086
3087 if (sdzp == tdzp)
3088 rw_exit(&sdzp->z_name_lock);
3089
3090 if (error == ERESTART) {
3091 waited = B_TRUE;
3092 dmu_tx_wait(tx);
3093 dmu_tx_abort(tx);
3094 zrele(szp);
3095 if (tzp)
3096 zrele(tzp);
3097 goto top;
3098 }
3099 dmu_tx_abort(tx);
3100 zrele(szp);
3101 if (tzp)
3102 zrele(tzp);
3103 zfs_exit(zfsvfs, FTAG);
3104 return (error);
3105 }
3106
3107 /*
3108 * Unlink the source.
3109 */
3110 szp->z_pflags |= ZFS_AV_MODIFIED;
3111 if (tdzp->z_pflags & ZFS_PROJINHERIT)
3112 szp->z_pflags |= ZFS_PROJINHERIT;
3113
3114 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3115 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3116 VERIFY0(error);
3117
3118 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3119 if (error)
3120 goto commit;
3121
3122 /*
3123 * Unlink the target.
3124 */
3125 if (tzp) {
3126 int tzflg = zflg;
3127
3128 if (rflags & RENAME_EXCHANGE) {
3129 /* This inode will be re-linked soon. */
3130 tzflg |= ZRENAMING;
3131
3132 tzp->z_pflags |= ZFS_AV_MODIFIED;
3133 if (sdzp->z_pflags & ZFS_PROJINHERIT)
3134 tzp->z_pflags |= ZFS_PROJINHERIT;
3135
3136 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3137 (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3138 ASSERT0(error);
3139 }
3140 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3141 if (error)
3142 goto commit_link_szp;
3143 }
3144
3145 /*
3146 * Create the new target links:
3147 * * We always link the target.
3148 * * RENAME_EXCHANGE: Link the old target to the source.
3149 * * RENAME_WHITEOUT: Create a whiteout inode in place of the source.
3150 */
3151 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3152 if (error) {
3153 /*
3154 * If we have removed the existing target, a subsequent call to
3155 * zfs_link_create() to add back the same entry, but with a new
3156 * dnode (szp), should not fail.
3157 */
3158 ASSERT0P(tzp);
3159 goto commit_link_tzp;
3160 }
3161
3162 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3163 case RENAME_EXCHANGE:
3164 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3165 /*
3166 * The same argument as zfs_link_create() failing for
3167 * szp applies here, since the source directory must
3168 * have had an entry we are replacing.
3169 */
3170 ASSERT0(error);
3171 if (error)
3172 goto commit_unlink_td_szp;
3173 break;
3174 case RENAME_WHITEOUT:
3175 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3176 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3177 if (error) {
3178 zfs_znode_delete(wzp, tx);
3179 remove_inode_hash(ZTOI(wzp));
3180 goto commit_unlink_td_szp;
3181 }
3182 break;
3183 }
3184
3185 if (fuid_dirtied)
3186 zfs_fuid_sync(zfsvfs, tx);
3187
3188 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3189 case RENAME_EXCHANGE:
3190 zfs_log_rename_exchange(zilog, tx,
3191 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3192 tdzp, tdl->dl_name, szp);
3193 break;
3194 case RENAME_WHITEOUT:
3195 zfs_log_rename_whiteout(zilog, tx,
3196 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3197 tdzp, tdl->dl_name, szp, wzp);
3198 break;
3199 default:
3200 ASSERT0(rflags & ~RENAME_NOREPLACE);
3201 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3202 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3203 break;
3204 }
3205
3206 commit:
3207 dmu_tx_commit(tx);
3208 out:
3209 if (have_acl)
3210 zfs_acl_ids_free(&acl_ids);
3211
3212 zfs_znode_update_vfs(sdzp);
3213 if (sdzp == tdzp)
3214 rw_exit(&sdzp->z_name_lock);
3215
3216 if (sdzp != tdzp)
3217 zfs_znode_update_vfs(tdzp);
3218
3219 zfs_znode_update_vfs(szp);
3220 zrele(szp);
3221 if (wzp) {
3222 zfs_znode_update_vfs(wzp);
3223 zrele(wzp);
3224 }
3225 if (tzp) {
3226 zfs_znode_update_vfs(tzp);
3227 zrele(tzp);
3228 }
3229
3230 if (zl != NULL)
3231 zfs_rename_unlock(&zl);
3232
3233 zfs_dirent_unlock(sdl);
3234 zfs_dirent_unlock(tdl);
3235
3236 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3237 error = zil_commit(zilog, 0);
3238
3239 zfs_exit(zfsvfs, FTAG);
3240 return (error);
3241
3242 /*
3243 * Clean-up path for broken link state.
3244 *
3245 * At this point we are in a (very) bad state, so we need to do our
3246 * best to correct the state. In particular, all of the nlinks are
3247 * wrong because we were destroying and creating links with ZRENAMING.
3248 *
3249 * In some form, all of these operations have to resolve the state:
3250 *
3251 * * link_destroy() *must* succeed. Fortunately, this is very likely
3252 * since we only just created it.
3253 *
3254 * * link_create()s are allowed to fail (though they shouldn't because
3255 * we only just unlinked them and are putting the entries back
3256 * during clean-up). But if they fail, we can just forcefully drop
3257 * the nlink value to (at the very least) avoid broken nlink values
3258 * -- though in the case of non-empty directories we will have to
3259 * panic (otherwise we'd have a leaked directory with a broken ..).
3260 */
3261 commit_unlink_td_szp:
3262 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3263 commit_link_tzp:
3264 if (tzp) {
3265 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3266 VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3267 }
3268 commit_link_szp:
3269 if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3270 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3271 goto commit;
3272 }
3273
3274 /*
3275 * Insert the indicated symbolic reference entry into the directory.
3276 *
3277 * IN: dzp - Directory to contain new symbolic link.
3278 * name - Name of directory entry in dip.
3279 * vap - Attributes of new entry.
3280 * link - Name for new symlink entry.
3281 * cr - credentials of caller.
3282 * flags - case flags
3283 * mnt_ns - user namespace of the mount
3284 *
3285 * OUT: zpp - Znode for new symbolic link.
3286 *
3287 * RETURN: 0 on success, error code on failure.
3288 *
3289 * Timestamps:
3290 * dip - ctime|mtime updated
3291 */
3292 int
3293 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3294 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3295 {
3296 znode_t *zp;
3297 zfs_dirlock_t *dl;
3298 dmu_tx_t *tx;
3299 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
3300 zilog_t *zilog;
3301 uint64_t len = strlen(link);
3302 int error;
3303 int zflg = ZNEW;
3304 zfs_acl_ids_t acl_ids;
3305 boolean_t fuid_dirtied;
3306 uint64_t txtype = TX_SYMLINK;
3307 boolean_t waited = B_FALSE;
3308
3309 ASSERT(S_ISLNK(vap->va_mode));
3310
3311 if (name == NULL)
3312 return (SET_ERROR(EINVAL));
3313
3314 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3315 return (error);
3316 zilog = zfsvfs->z_log;
3317
3318 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3319 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3320 zfs_exit(zfsvfs, FTAG);
3321 return (SET_ERROR(EILSEQ));
3322 }
3323 if (flags & FIGNORECASE)
3324 zflg |= ZCILOOK;
3325
3326 if (len > MAXPATHLEN) {
3327 zfs_exit(zfsvfs, FTAG);
3328 return (SET_ERROR(ENAMETOOLONG));
3329 }
3330
3331 if ((error = zfs_acl_ids_create(dzp, 0,
3332 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3333 zfs_exit(zfsvfs, FTAG);
3334 return (error);
3335 }
3336 top:
3337 *zpp = NULL;
3338
3339 /*
3340 * Attempt to lock directory; fail if entry already exists.
3341 */
3342 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3343 if (error) {
3344 zfs_acl_ids_free(&acl_ids);
3345 zfs_exit(zfsvfs, FTAG);
3346 return (error);
3347 }
3348
3349 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3350 zfs_acl_ids_free(&acl_ids);
3351 zfs_dirent_unlock(dl);
3352 zfs_exit(zfsvfs, FTAG);
3353 return (error);
3354 }
3355
3356 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3357 zfs_acl_ids_free(&acl_ids);
3358 zfs_dirent_unlock(dl);
3359 zfs_exit(zfsvfs, FTAG);
3360 return (SET_ERROR(EDQUOT));
3361 }
3362 tx = dmu_tx_create(zfsvfs->z_os);
3363 fuid_dirtied = zfsvfs->z_fuid_dirty;
3364 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3365 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3366 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3367 ZFS_SA_BASE_ATTR_SIZE + len);
3368 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3369 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3371 acl_ids.z_aclp->z_acl_bytes);
3372 }
3373 if (fuid_dirtied)
3374 zfs_fuid_txhold(zfsvfs, tx);
3375 error = dmu_tx_assign(tx,
3376 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3377 if (error) {
3378 zfs_dirent_unlock(dl);
3379 if (error == ERESTART) {
3380 waited = B_TRUE;
3381 dmu_tx_wait(tx);
3382 dmu_tx_abort(tx);
3383 goto top;
3384 }
3385 zfs_acl_ids_free(&acl_ids);
3386 dmu_tx_abort(tx);
3387 zfs_exit(zfsvfs, FTAG);
3388 return (error);
3389 }
3390
3391 /*
3392 * Create a new object for the symlink.
3393 * For version 4 ZPL datasets the symlink will be an SA attribute.
3394 */
3395 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3396
3397 if (fuid_dirtied)
3398 zfs_fuid_sync(zfsvfs, tx);
3399
3400 mutex_enter(&zp->z_lock);
3401 if (zp->z_is_sa)
3402 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3403 link, len, tx);
3404 else
3405 zfs_sa_symlink(zp, link, len, tx);
3406 mutex_exit(&zp->z_lock);
3407
3408 zp->z_size = len;
3409 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3410 &zp->z_size, sizeof (zp->z_size), tx);
3411 /*
3412 * Insert the new object into the directory.
3413 */
3414 error = zfs_link_create(dl, zp, tx, ZNEW);
3415 if (error != 0) {
3416 zfs_znode_delete(zp, tx);
3417 remove_inode_hash(ZTOI(zp));
3418 } else {
3419 if (flags & FIGNORECASE)
3420 txtype |= TX_CI;
3421 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3422
3423 zfs_znode_update_vfs(dzp);
3424 zfs_znode_update_vfs(zp);
3425 }
3426
3427 zfs_acl_ids_free(&acl_ids);
3428
3429 dmu_tx_commit(tx);
3430
3431 zfs_dirent_unlock(dl);
3432
3433 if (error == 0) {
3434 *zpp = zp;
3435
3436 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3437 error = zil_commit(zilog, 0);
3438 } else {
3439 zrele(zp);
3440 }
3441
3442 zfs_exit(zfsvfs, FTAG);
3443 return (error);
3444 }
3445
3446 /*
3447 * Return, in the buffer contained in the provided uio structure,
3448 * the symbolic path referred to by ip.
3449 *
3450 * IN: ip - inode of symbolic link
3451 * uio - structure to contain the link path.
3452 * cr - credentials of caller.
3453 *
3454 * RETURN: 0 if success
3455 * error code if failure
3456 *
3457 * Timestamps:
3458 * ip - atime updated
3459 */
3460 int
3461 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3462 {
3463 (void) cr;
3464 znode_t *zp = ITOZ(ip);
3465 zfsvfs_t *zfsvfs = ITOZSB(ip);
3466 int error;
3467
3468 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3469 return (error);
3470
3471 mutex_enter(&zp->z_lock);
3472 if (zp->z_is_sa)
3473 error = sa_lookup_uio(zp->z_sa_hdl,
3474 SA_ZPL_SYMLINK(zfsvfs), uio);
3475 else
3476 error = zfs_sa_readlink(zp, uio);
3477 mutex_exit(&zp->z_lock);
3478
3479 zfs_exit(zfsvfs, FTAG);
3480 return (error);
3481 }
3482
3483 /*
3484 * Insert a new entry into directory tdzp referencing szp.
3485 *
3486 * IN: tdzp - Directory to contain new entry.
3487 * szp - znode of new entry.
3488 * name - name of new entry.
3489 * cr - credentials of caller.
3490 * flags - case flags.
3491 *
3492 * RETURN: 0 if success
3493 * error code if failure
3494 *
3495 * Timestamps:
3496 * tdzp - ctime|mtime updated
3497 * szp - ctime updated
3498 */
3499 int
3500 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3501 int flags)
3502 {
3503 struct inode *sip = ZTOI(szp);
3504 znode_t *tzp;
3505 zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
3506 zilog_t *zilog;
3507 zfs_dirlock_t *dl;
3508 dmu_tx_t *tx;
3509 int error;
3510 int zf = ZNEW;
3511 uint64_t parent;
3512 uid_t owner;
3513 boolean_t waited = B_FALSE;
3514 boolean_t is_tmpfile = 0;
3515 uint64_t txg;
3516
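	/*
	 * An O_TMPFILE inode has no name and sits on the unlinked set;
	 * the kernel marks it I_LINKABLE so that linkat(AT_EMPTY_PATH)
	 * may give it its first link.
	 */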
3517 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3518
3519 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3520
3521 if (name == NULL)
3522 return (SET_ERROR(EINVAL));
3523
3524 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3525 return (error);
3526 zilog = zfsvfs->z_log;
3527
3528 /*
3529 * POSIX dictates that we return EPERM here.
3530 * Better choices include ENOTSUP or EISDIR.
3531 */
3532 if (S_ISDIR(sip->i_mode)) {
3533 zfs_exit(zfsvfs, FTAG);
3534 return (SET_ERROR(EPERM));
3535 }
3536
3537 if ((error = zfs_verify_zp(szp)) != 0) {
3538 zfs_exit(zfsvfs, FTAG);
3539 return (error);
3540 }
3541
3542 /*
3543 * If we are using project inheritance, meaning the directory has
3544 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3545 * not only the project ID, but also the ZFS_PROJINHERIT flag. In
3546 * that case, we only allow hard link creation in our tree when the
3547 * project IDs are the same.
3548 */
3549 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3550 tdzp->z_projid != szp->z_projid) {
3551 zfs_exit(zfsvfs, FTAG);
3552 return (SET_ERROR(EXDEV));
3553 }
3554
3555 /*
3556 * We check i_sb because snapshots and the ctldir must have different
3557 * super blocks.
3558 */
3559 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3560 zfs_exit(zfsvfs, FTAG);
3561 return (SET_ERROR(EXDEV));
3562 }
3563
3564 /* Prevent links to .zfs/shares files */
3565
3566 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3567 &parent, sizeof (uint64_t))) != 0) {
3568 zfs_exit(zfsvfs, FTAG);
3569 return (error);
3570 }
3571 if (parent == zfsvfs->z_shares_dir) {
3572 zfs_exit(zfsvfs, FTAG);
3573 return (SET_ERROR(EPERM));
3574 }
3575
3576 if (zfsvfs->z_utf8 && u8_validate(name,
3577 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3578 zfs_exit(zfsvfs, FTAG);
3579 return (SET_ERROR(EILSEQ));
3580 }
3581 if (flags & FIGNORECASE)
3582 zf |= ZCILOOK;
3583
3584 /*
3585 * We do not support links between attributes and non-attributes
3586 * because of the potential security risk of creating links
3587 * into "normal" file space in order to circumvent restrictions
3588 * imposed in attribute space.
3589 */
3590 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3591 zfs_exit(zfsvfs, FTAG);
3592 return (SET_ERROR(EINVAL));
3593 }
3594
3595 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3596 cr, ZFS_OWNER);
3597 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3598 zfs_exit(zfsvfs, FTAG);
3599 return (SET_ERROR(EPERM));
3600 }
3601
3602 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3603 zfs_init_idmap))) {
3604 zfs_exit(zfsvfs, FTAG);
3605 return (error);
3606 }
3607
3608 top:
3609 /*
3610 * Attempt to lock directory; fail if entry already exists.
3611 */
3612 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3613 if (error) {
3614 zfs_exit(zfsvfs, FTAG);
3615 return (error);
3616 }
3617
3618 tx = dmu_tx_create(zfsvfs->z_os);
3619 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3620 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
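	/* Linking a tmpfile removes it from the unlinked set. */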
3621 if (is_tmpfile)
3622 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3623
3624 zfs_sa_upgrade_txholds(tx, szp);
3625 zfs_sa_upgrade_txholds(tx, tdzp);
3626 error = dmu_tx_assign(tx,
3627 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3628 if (error) {
3629 zfs_dirent_unlock(dl);
3630 if (error == ERESTART) {
3631 waited = B_TRUE;
3632 dmu_tx_wait(tx);
3633 dmu_tx_abort(tx);
3634 goto top;
3635 }
3636 dmu_tx_abort(tx);
3637 zfs_exit(zfsvfs, FTAG);
3638 return (error);
3639 }
3640 /* unmark z_unlinked so zfs_link_create will not reject */
3641 if (is_tmpfile)
3642 szp->z_unlinked = B_FALSE;
3643 error = zfs_link_create(dl, szp, tx, 0);
3644
3645 if (error == 0) {
3646 uint64_t txtype = TX_LINK;
3647 /*
3648 * A tmpfile is created in z_unlinkedobj, so remove it from there.
3649 * Also, we don't log to the ZIL, because all previous file
3650 * operations on the tmpfile were ignored by the ZIL. Instead we
3651 * always wait for the txg to sync to make sure all previous
3652 * operations are sync safe.
3653 */
3654 if (is_tmpfile) {
3655 VERIFY0(zap_remove_int(zfsvfs->z_os,
3656 zfsvfs->z_unlinkedobj, szp->z_id, tx));
3657 } else {
3658 if (flags & FIGNORECASE)
3659 txtype |= TX_CI;
3660 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3661 }
3662 } else if (is_tmpfile) {
3663 /* restore z_unlinked since linking failed */
3664 szp->z_unlinked = B_TRUE;
3665 }
3666 txg = dmu_tx_get_txg(tx);
3667 dmu_tx_commit(tx);
3668
3669 zfs_dirent_unlock(dl);
3670
3671 if (error == 0) {
3672 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3673 error = zil_commit(zilog, 0);
3674
3675 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3676 txg_wait_flag_t wait_flags =
3677 spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
3678 ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
3679 error = txg_wait_synced_flags(
3680 dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
3681 if (error != 0) {
3682 ASSERT3U(error, ==, ESHUTDOWN);
3683 error = SET_ERROR(EIO);
3684 }
3685 }
3686 }
3687
3688 zfs_znode_update_vfs(tdzp);
3689 zfs_znode_update_vfs(szp);
3690 zfs_exit(zfsvfs, FTAG);
3691 return (error);
3692 }
3693
3694 /* Finish page writeback. */
3695 static inline void
3696 zfs_page_writeback_done(struct page *pp, int err)
3697 {
3698 if (err != 0) {
3699 /*
3700 * Writeback failed. Re-dirty the page. It was undirtied before
3701 * the IO was issued (in zfs_putpage() or write_cache_pages()).
3702 * The kernel only considers writeback for dirty pages; if we
3703 * don't do this, it is eligible for eviction without being
3704 * written out, which we definitely don't want.
3705 */
3706 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3707 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3708 #else
3709 __set_page_dirty_nobuffers(pp);
3710 #endif
3711 }
3712
3713 ClearPageError(pp);
3714 end_page_writeback(pp);
3715 }
3716
3717 /*
3718 * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage()
3719 * for syncing writes. Called when the ZIL itx has been written to the log or
3720 * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
3721 * is passed as `err`.
3722 */
3723 static void
3724 zfs_putpage_commit_cb(void *arg, int err)
3725 {
3726 zfs_page_writeback_done(arg, err);
3727 }
3728
3729 /*
3730 * Push a page out to disk, once the page is on stable storage the
3731 * registered commit callback will be run as notification of completion.
3732 *
3733 * IN: ip - page mapped for inode.
3734 * pp - page to push (page is locked)
3735 * wbc - writeback control data
3736 * for_sync - does the caller intend to wait synchronously for the
3737 * page writeback to complete?
3738 *
3739 * RETURN: 0 if success
3740 * error code if failure
3741 *
3742 * Timestamps:
3743 * ip - ctime|mtime updated
3744 */
3745 int
3746 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3747 boolean_t for_sync)
3748 {
3749 znode_t *zp = ITOZ(ip);
3750 zfsvfs_t *zfsvfs = ITOZSB(ip);
3751 loff_t offset;
3752 loff_t pgoff;
3753 unsigned int pglen;
3754 dmu_tx_t *tx;
3755 caddr_t va;
3756 int err = 0;
3757 uint64_t mtime[2], ctime[2];
3758 inode_timespec_t tmp_ts;
3759 sa_bulk_attr_t bulk[3];
3760 int cnt = 0;
3761 struct address_space *mapping;
3762
3763 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3764 return (err);
3765
3766 ASSERT(PageLocked(pp));
3767
3768 pgoff = page_offset(pp); /* Page byte-offset in file */
3769 offset = i_size_read(ip); /* File length in bytes */
3770 pglen = MIN(PAGE_SIZE, /* Page length in bytes */
3771 P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3772
3773 /* Page is beyond end of file */
3774 if (pgoff >= offset) {
3775 unlock_page(pp);
3776 zfs_exit(zfsvfs, FTAG);
3777 return (0);
3778 }
3779
3780 /* Truncate page length to end of file */
3781 if (pgoff + pglen > offset)
3782 pglen = offset - pgoff;
3783
3784 #if 0
3785 /*
3786 * FIXME: Allow mmap writes past its quota. The correct fix
3787 * is to register a page_mkwrite() handler to count the page
3788 * against its quota when it is about to be dirtied.
3789 */
3790 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3791 KUID_TO_SUID(ip->i_uid)) ||
3792 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3793 KGID_TO_SGID(ip->i_gid)) ||
3794 (zp->z_projid != ZFS_DEFAULT_PROJID &&
3795 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3796 zp->z_projid))) {
3797 err = EDQUOT;
3798 }
3799 #endif
3800
3801 /*
3802 * The ordering here is critical and must adhere to the following
3803 * rules in order to avoid deadlocking in either zfs_read() or
3804 * zfs_free_range() due to a lock inversion.
3805 *
3806 * 1) The page must be unlocked prior to acquiring the range lock.
3807 * This is critical because zfs_read() calls find_lock_page()
3808 * which may block on the page lock while holding the range lock.
3809 *
3810 * 2) Before setting or clearing write back on a page the range lock
3811 * must be held in order to prevent a lock inversion with the
3812 * zfs_free_range() function.
3813 *
3814 * This presents a problem because upon entering this function the
3815 * page lock is already held. To safely acquire the range lock the
3816 * page lock must be dropped. This creates a window where another
3817 * process could truncate, invalidate, dirty, or write out the page.
3818 *
3819 * Therefore, after successfully reacquiring the range and page locks
3820 * the current page state is checked. In the common case everything
3821 * will be as is expected and it can be written out. However, if
3822 * the page state has changed it must be handled accordingly.
3823 */
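	/*
	 * Keep the page dirty (and account it as skipped for now) while
	 * it is unlocked, so the pending update cannot be lost if the
	 * page state changes before the locks are reacquired.
	 */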
3824 mapping = pp->mapping;
3825 redirty_page_for_writepage(wbc, pp);
3826 unlock_page(pp);
3827
3828 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3829 pgoff, pglen, RL_WRITER);
3830 lock_page(pp);
3831
3832 /* Page mapping changed or it was no longer dirty, we're done */
3833 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3834 unlock_page(pp);
3835 zfs_rangelock_exit(lr);
3836 zfs_exit(zfsvfs, FTAG);
3837 return (0);
3838 }
3839
3840 /* Another process started writeback; block if required */
3841 if (PageWriteback(pp)) {
3842 unlock_page(pp);
3843 zfs_rangelock_exit(lr);
3844
3845 if (wbc->sync_mode != WB_SYNC_NONE) {
3846 if (PageWriteback(pp))
3847 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3848 folio_wait_bit(page_folio(pp), PG_writeback);
3849 #else
3850 wait_on_page_bit(pp, PG_writeback);
3851 #endif
3852 }
3853
3854 zfs_exit(zfsvfs, FTAG);
3855 return (0);
3856 }
3857
3858 /* Clear the dirty flag while the required locks are held */
3859 if (!clear_page_dirty_for_io(pp)) {
3860 unlock_page(pp);
3861 zfs_rangelock_exit(lr);
3862 zfs_exit(zfsvfs, FTAG);
3863 return (0);
3864 }
3865
3866 /*
3867 * Counterpart for redirty_page_for_writepage() above. This page
3868 * was in fact not skipped and should not be counted as if it were.
3869 */
3870 wbc->pages_skipped--;
3871 set_page_writeback(pp);
3872 unlock_page(pp);
3873
3874 tx = dmu_tx_create(zfsvfs->z_os);
3875 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3876 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3877 zfs_sa_upgrade_txholds(tx, zp);
3878
3879 err = dmu_tx_assign(tx, DMU_TX_WAIT);
3880 if (err != 0) {
3881 dmu_tx_abort(tx);
3882 zfs_page_writeback_done(pp, err);
3883 zfs_rangelock_exit(lr);
3884 zfs_exit(zfsvfs, FTAG);
3885
3886 /*
3887 * Don't return error for an async writeback; we've re-dirtied
3888 * the page so it will be tried again some other time.
3889 */
3890 return (for_sync ? err : 0);
3891 }
3892
3893 va = kmap(pp);
3894 ASSERT3U(pglen, <=, PAGE_SIZE);
3895 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3896 kunmap(pp);
3897
3898 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3899 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3900 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3901 &zp->z_pflags, 8);
3902
3903 /* Preserve the mtime and ctime provided by the inode */
3904 tmp_ts = zpl_inode_get_mtime(ip);
3905 ZFS_TIME_ENCODE(&tmp_ts, mtime);
3906 tmp_ts = zpl_inode_get_ctime(ip);
3907 ZFS_TIME_ENCODE(&tmp_ts, ctime);
3908 zp->z_atime_dirty = B_FALSE;
3909 zp->z_seq++;
3910
3911 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3912
3913 /*
3914 * A note about for_sync vs wbc->sync_mode.
3915 *
3916 * for_sync indicates that this is a syncing writeback, that is, the kernel
3917 * caller expects the data to be durably stored before being notified.
3918 * Often, but not always, the call was triggered by a userspace syncing
3919 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
3920 * means that the page should remain "locked" (in the writeback state)
3921 * until it is definitely on disk (ie zil_commit() or spa_sync()).
3922 * Otherwise, we can unlock and return as soon as it is on the
3923 * in-memory ZIL.
3924 *
3925 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
3926 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
3927 * indicates this a regular async writeback (eg a cache eviction) and
3928 * so does not need a durability guarantee, while WB_SYNC_ALL indicates
3929 * a syncing op that must be waited on (by convention, we test for
3930 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
3931 * performance should there ever be a new mode that we have not yet
3932 * added support for).
3933 *
3934 * So, why a separate for_sync field? This is because zpl_writepages()
3935 * calls zfs_putpage() multiple times for a single "logical" operation.
3936 * It wants all the individual pages to be for_sync==TRUE ie only
3937 * unlocked once durably stored, but it only wants one call to
3938 * zil_commit() at the very end, once all the pages are synced. So,
3939 * it repurposes sync_mode slightly to indicate who issue and wait for
3940 * the IO: for NONE, the caller to zfs_putpage() will do it, while for
3941 * ALL, zfs_putpage should do it.
3942 *
3943 * Summary:
3944 * for_sync: 0=unlock immediately; 1=unlock once on disk
3945 * sync_mode: NONE=caller will commit; ALL=we will commit
3946 */
3947 boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
3948
3949 /*
3950 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
3951 * because it is a policy flag that indicates "someone will call
3952 * zil_commit() soon". for_sync=TRUE means exactly that; the only
3953 * question is whether it will be us, or zpl_writepages().
3954 */
3955 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
3956 B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
3957
3958 if (!for_sync) {
3959 /*
3960 * Async writeback is logged and written to the DMU, so page
3961 * can now be unlocked.
3962 */
3963 zfs_page_writeback_done(pp, 0);
3964 }
3965
3966 dmu_tx_commit(tx);
3967
3968 zfs_rangelock_exit(lr);
3969
3970 if (need_commit) {
3971 err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
3972 if (err != 0) {
3973 zfs_exit(zfsvfs, FTAG);
3974 return (err);
3975 }
3976 }
3977
3978 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3979
3980 zfs_exit(zfsvfs, FTAG);
3981 return (err);
3982 }
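
/*
 * Illustrative sketch (not the actual zpl_writepages() implementation) of
 * the calling pattern described in the for_sync comment above: a syncing
 * caller drives zfs_putpage() once per dirty page with for_sync=B_TRUE and
 * sync_mode forced to WB_SYNC_NONE, so each call skips zil_commit(), then
 * issues a single zil_commit() once every page has been submitted:
 *
 *	for_each_dirty_page(pp) {
 *		wbc->sync_mode = WB_SYNC_NONE;	// caller will commit below
 *		(void) zfs_putpage(ip, pp, wbc, B_TRUE);
 *	}
 *	(void) zil_commit(zfsvfs->z_log, zp->z_id);
 *
 * for_each_dirty_page() is a hypothetical placeholder for the kernel's
 * writeback iteration; the real driver is zpl_writepages().
 */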

/*
 * Update the system attributes when the inode has been dirtied. For the
 * moment we only update the mode, atime, mtime, and ctime.
 */
int
zfs_dirty_inode(struct inode *ip, int flags)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	dmu_tx_t *tx;
	uint64_t mode, atime[2], mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t bulk[4];
	int error = 0;
	int cnt = 0;

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return (0);

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

#ifdef I_DIRTY_TIME
	/*
	 * This is the lazytime semantic introduced in Linux 4.0. This flag
	 * is only set by update_time() when lazytime is enabled. (Note,
	 * I_DIRTY_SYNC will also be set if not lazytime.) Fortunately mtime
	 * and ctime are managed within ZFS itself, so we only need to dirty
	 * atime.
	 */
	if (flags == I_DIRTY_TIME) {
		zp->z_atime_dirty = B_TRUE;
		goto out;
	}
#endif

	tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	mutex_enter(&zp->z_lock);
	zp->z_atime_dirty = B_FALSE;

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	/* Preserve the mode, atime, mtime, and ctime provided by the inode */
	tmp_ts = zpl_inode_get_atime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, atime);
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	mode = ip->i_mode;

	zp->z_mode = mode;

	error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
	mutex_exit(&zp->z_lock);

	dmu_tx_commit(tx);
out:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

void
zfs_inactive(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint64_t atime[2];
	int error;
	int need_unlock = 0;

	/* Only read lock if we haven't already write locked, e.g. rollback */
	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
		need_unlock = 1;
		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	}
	if (zp->z_sa_hdl == NULL) {
		if (need_unlock)
			rw_exit(&zfsvfs->z_teardown_inactive_lock);
		return;
	}

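	/*
	 * If the in-core atime was dirtied (eg via the lazytime path in
	 * zfs_dirty_inode()) and the file wasn't unlinked, write it back
	 * before the znode is torn down.
	 */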
	if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			inode_timespec_t tmp_atime;
			tmp_atime = zpl_inode_get_atime(ip);
			ZFS_TIME_ENCODE(&tmp_atime, atime);
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&atime, sizeof (atime), tx);
			zp->z_atime_dirty = B_FALSE;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	if (need_unlock)
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/*
 * Fill a page with data from the disk, zero-filling any tail that lies
 * beyond the end of the file.
 */
static int
zfs_fillpage(struct inode *ip, struct page *pp)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	loff_t i_size = i_size_read(ip);
	u_offset_t io_off = page_offset(pp);
	size_t io_len = PAGE_SIZE;

	ASSERT3U(io_off, <, i_size);

	if (io_off + io_len > i_size)
		io_len = i_size - io_off;

	void *va = kmap(pp);
	int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
	    io_len, va, DMU_READ_PREFETCH);
	if (io_len != PAGE_SIZE)
		memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
	kunmap(pp);

	if (error) {
		/* convert checksum errors into IO errors */
		if (error == ECKSUM)
			error = SET_ERROR(EIO);

		SetPageError(pp);
		ClearPageUptodate(pp);
	} else {
		ClearPageError(pp);
		SetPageUptodate(pp);
	}

	return (error);
}
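
/*
 * Worked example (hypothetical values): with PAGE_SIZE = 4096, a file of
 * i_size = 5000 bytes, and a read of the second page (io_off = 4096),
 * io_len is clamped to 5000 - 4096 = 904 bytes. dmu_read() fills the
 * first 904 bytes of the mapped page and memset() zeroes the remaining
 * 4096 - 904 = 3192 bytes, so no stale kernel memory is exposed past EOF.
 */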

/*
 * Uses zfs_fillpage() to read data from the file and fill the page.
 *
 *	IN:	ip	- inode of file to get data from.
 *		pp	- page to read
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
int
zfs_getpage(struct inode *ip, struct page *pp)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	znode_t *zp = ITOZ(ip);
	int error;
	loff_t i_size = i_size_read(ip);
	u_offset_t io_off = page_offset(pp);
	size_t io_len = PAGE_SIZE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT3U(io_off, <, i_size);

	if (io_off + io_len > i_size)
		io_len = i_size - io_off;

	/*
	 * It is important to hold the rangelock here because it is possible
	 * a Direct I/O write or block clone might be taking place at the same
	 * time that a page is being faulted in through filemap_fault(). With
	 * Direct I/O writes and block cloning db->db_data will be set to NULL
	 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
	 * rangelock is not held, then there is a race between faulting in a
	 * page and writing out a Direct I/O write or block clone. Without
	 * the rangelock a NULL pointer dereference can occur in
	 * dmu_read_impl() for db->db_data during the memcpy operation when
	 * zfs_fillpage() calls dmu_read().
	 */
	zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
	    io_off, io_len, RL_READER);
	if (lr == NULL) {
		/*
		 * It is important to drop the page lock before grabbing the
		 * rangelock to avoid another deadlock between here and
		 * zfs_write() -> update_pages(). update_pages() holds both the
		 * rangelock and the page lock.
		 */
		get_page(pp);
		unlock_page(pp);
		lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
		    io_len, RL_READER);
		lock_page(pp);
		put_page(pp);
	}
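	/*
	 * The rangelock is held either way at this point, so the page can
	 * be filled without racing a Direct I/O write or block clone.
	 */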
	error = zfs_fillpage(ip, pp);
	zfs_rangelock_exit(lr);

	if (error == 0)
		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

/*
 * Check ZFS specific permissions to memory map a section of a file.
 *
 *	IN:	ip	- inode of the file to mmap
 *		off	- file offset
 *		addrp	- start address in memory region
 *		len	- length of memory region
 *		vm_flags - address flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
    unsigned long vm_flags)
{
	(void) addrp;
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
	    (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((vm_flags & (VM_READ | VM_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	if (off < 0 || len > MAXOFFSET_T - off) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENXIO));
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * Free or allocate space in a file. Currently, this function only
 * supports the `F_FREESP' command. However, this command is somewhat
 * misnamed, as its functionality includes the ability to allocate as
 * well as free space.
 *
 *	IN:	zp	- znode of file to free data in.
 *		cmd	- action to take (only F_FREESP supported).
 *		bfp	- section of file to free/alloc.
 *		flag	- current file open mode flags.
 *		offset	- current file offset.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	zp - ctime|mtime updated
 */
int
zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
    offset_t offset, cred_t *cr)
{
	(void) offset;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t off, len;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (cmd != F_FREESP) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	if (bfp->l_len < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Permissions aren't checked on Solaris because on this OS
	 * zfs_space() can only be called with an opened file handle.
	 * On Linux we can get here through truncate_range() which
	 * operates directly on inodes, so we need to check access rights.
	 */
	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	off = bfp->l_start;
	len = bfp->l_len; /* 0 means from off to end of file */

	error = zfs_freesp(zp, off, len, flag, TRUE);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
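
/*
 * Illustrative caller sketch (hypothetical, not taken from this file): to
 * free bytes [4096, 8192) of a file, a caller would fill a flock64_t and
 * issue F_FREESP; the flag value shown assumes a file opened for write:
 *
 *	flock64_t bf = { 0 };
 *	bf.l_start = 4096;	// offset of first byte to free
 *	bf.l_len = 4096;	// length; 0 would mean "to end of file"
 *	error = zfs_space(zp, F_FREESP, &bf, FWRITE, 0, cr);
 *
 * zp and cr are assumed to be a held znode and the caller's credentials.
 */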

int
zfs_fid(struct inode *ip, fid_t *fidp)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	uint32_t gen;
	uint64_t gen64;
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int size, i, error;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	gen = (uint32_t)gen64;

	size = SHORT_FID_LEN;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
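
/*
 * Worked example (hypothetical values): for object = 0x0102 the loop above
 * packs the object id little-endian, byte by byte: zf_object[0] = 0x02,
 * zf_object[1] = 0x01, remaining bytes 0. The generation number is packed
 * the same way, with gen 0 bumped to 1 so a valid fid can never collide
 * with the zero generation reserved for .zfs control directory entries.
 */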

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif