1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2025, Klara, Inc.
29 */
30
31 /* Portions Copyright 2007 Jeremy Teo */
32 /* Portions Copyright 2010 Robert Milkowski */
33
34
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/time.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/taskq.h>
44 #include <sys/uio.h>
45 #include <sys/vmsystm.h>
46 #include <sys/atomic.h>
47 #include <sys/pathname.h>
48 #include <sys/cmn_err.h>
49 #include <sys/errno.h>
50 #include <sys/zfs_dir.h>
51 #include <sys/zfs_acl.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/fs/zfs.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_objset.h>
56 #include <sys/spa.h>
57 #include <sys/txg.h>
58 #include <sys/dbuf.h>
59 #include <sys/zap.h>
60 #include <sys/sa.h>
61 #include <sys/policy.h>
62 #include <sys/sunddi.h>
63 #include <sys/sid.h>
64 #include <sys/zfs_ctldir.h>
65 #include <sys/zfs_fuid.h>
66 #include <sys/zfs_quota.h>
67 #include <sys/zfs_sa.h>
68 #include <sys/zfs_vnops.h>
69 #include <sys/zfs_rlock.h>
70 #include <sys/cred.h>
71 #include <sys/zpl.h>
72 #include <sys/zil.h>
73 #include <sys/sa_impl.h>
74 #include <linux/mm_compat.h>
75
76 /*
77 * Programming rules.
78 *
79 * Each vnode op performs some logical unit of work. To do this, the ZPL must
80 * properly lock its in-core state, create a DMU transaction, do the work,
81 * record this work in the intent log (ZIL), commit the DMU transaction,
82 * and wait for the intent log to commit if it is a synchronous operation.
83 * Moreover, the vnode ops must work in both normal and log replay context.
84 * The ordering of events is important to avoid deadlocks and references
85 * to freed memory. The example below illustrates the following Big Rules:
86 *
87 * (1) A check must be made in each zfs thread for a mounted file system.
88 * This is done avoiding races using zfs_enter(zfsvfs).
89 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes
90 * must be checked with zfs_verify_zp(zp). Both of these macros
91 * can return EIO from the calling function.
92 *
93 * (2) zrele() should always be the last thing except for zil_commit() (if
94 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
95 * last reference, the vnode/znode can be freed, so the zp may point to
96 * freed memory. Second, the last reference will call zfs_zinactive(),
97 * which may induce a lot of work -- pushing cached pages (which acquires
98 * range locks) and syncing out cached atime changes. Third,
99 * zfs_zinactive() may require a new tx, which could deadlock the system
100 * if you were already holding one. This deadlock occurs because the tx
101 * currently being operated on prevents a txg from syncing, which
102 * prevents the new tx from progressing, resulting in a deadlock. If you
103 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
104 * is a synonym for zrele().
105 *
106 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
107 * as they can span dmu_tx_assign() calls.
108 *
109 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
110 * dmu_tx_assign(). This is critical because we don't want to block
111 * while holding locks.
112 *
113 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
114 * This reduces lock contention and CPU usage when we must wait (note
115 * that if throughput is constrained by the storage, nearly every
116 * transaction must wait).
117 *
118 * Note, in particular, that if a lock is sometimes acquired before
119 * the tx assigns, and sometimes after (e.g. z_lock), then failing
120 * to use a non-blocking assign can deadlock the system. The scenario:
121 *
122 * Thread A has grabbed a lock before calling dmu_tx_assign().
123 * Thread B is in an already-assigned tx, and blocks for this lock.
124 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
125 * txg_wait_open() forever, because the previous txg can't quiesce
126 * until B's tx commits.
127 *
128 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
129 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
130 * again. On subsequent calls to dmu_tx_assign(), pass
131 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
132 * this operation has already called dmu_tx_wait(). This will ensure
133 * that we don't retry forever, waiting a short bit each time.
134 *
135 * (5) If the operation succeeded, generate the intent log entry for it
136 * before dropping locks. This ensures that the ordering of events
137 * in the intent log matches the order in which they actually occurred.
138 * During ZIL replay the zfs_log_* functions will update the sequence
139 * number to indicate the zil transaction has replayed.
140 *
141 * (6) At the end of each vnode op, the DMU tx must always commit,
142 * regardless of whether there were any errors.
143 *
144 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
145 * to ensure that synchronous semantics are provided when necessary.
146 *
147 * In general, this is how things should be ordered in each vnode op:
148 *
149 * zfs_enter(zfsvfs); // exit if unmounted
150 * top:
151 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
152 * rw_enter(...); // grab any other locks you need
153 * tx = dmu_tx_create(...); // get DMU tx
154 * dmu_tx_hold_*(); // hold each object you might modify
155 * error = dmu_tx_assign(tx,
156 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
157 * if (error) {
158 * rw_exit(...); // drop locks
159 * zfs_dirent_unlock(dl); // unlock directory entry
160 * zrele(...); // release held znodes
161 * if (error == ERESTART) {
162 * waited = B_TRUE;
163 * dmu_tx_wait(tx);
164 * dmu_tx_abort(tx);
165 * goto top;
166 * }
167 * dmu_tx_abort(tx); // abort DMU tx
168 * zfs_exit(zfsvfs); // finished in zfs
169 * return (error); // really out of space
170 * }
171 * error = do_real_work(); // do whatever this VOP does
172 * if (error == 0)
173 * zfs_log_*(...); // on success, make ZIL entry
174 * dmu_tx_commit(tx); // commit DMU tx -- error or not
175 * rw_exit(...); // drop locks
176 * zfs_dirent_unlock(dl); // unlock directory entry
177 * zrele(...); // release held znodes
178 * zil_commit(zilog, foid); // synchronous when necessary
179 * zfs_exit(zfsvfs); // finished in zfs
180 * return (error); // done, report error
181 */
182 int
zfs_open(struct inode * ip,int mode,int flag,cred_t * cr)183 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
184 {
185 (void) cr;
186 znode_t *zp = ITOZ(ip);
187 zfsvfs_t *zfsvfs = ITOZSB(ip);
188 int error;
189
190 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
191 return (error);
192
193 /* Honor ZFS_APPENDONLY file attribute */
194 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
195 ((flag & O_APPEND) == 0)) {
196 zfs_exit(zfsvfs, FTAG);
197 return (SET_ERROR(EPERM));
198 }
199
200 /*
201 * Keep a count of the synchronous opens in the znode. On first
202 * synchronous open we must convert all previous async transactions
203 * into sync to keep correct ordering.
204 */
205 if (flag & O_SYNC) {
206 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
207 zil_async_to_sync(zfsvfs->z_log, zp->z_id);
208 }
209
210 zfs_exit(zfsvfs, FTAG);
211 return (0);
212 }
213
214 int
zfs_close(struct inode * ip,int flag,cred_t * cr)215 zfs_close(struct inode *ip, int flag, cred_t *cr)
216 {
217 (void) cr;
218 znode_t *zp = ITOZ(ip);
219 zfsvfs_t *zfsvfs = ITOZSB(ip);
220 int error;
221
222 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
223 return (error);
224
225 /* Decrement the synchronous opens in the znode */
226 if (flag & O_SYNC)
227 atomic_dec_32(&zp->z_sync_cnt);
228
229 zfs_exit(zfsvfs, FTAG);
230 return (0);
231 }
232
233 #if defined(_KERNEL)
234
/*
 * Forward declaration: mappedread() below calls this to fill a cached page
 * that it finds not up to date.
 */
static int zfs_fillpage(struct inode *ip, struct page *pp);
236
237 /*
238 * When a file is memory mapped, we must keep the IO data synchronized
239 * between the DMU cache and the memory mapped pages. Update all mapped
 * pages with the contents of the corresponding dmu buffer.
241 */
242 void
update_pages(znode_t * zp,int64_t start,int len,objset_t * os)243 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
244 {
245 struct address_space *mp = ZTOI(zp)->i_mapping;
246 int64_t off = start & (PAGE_SIZE - 1);
247
248 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
249 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
250
251 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
252 if (pp) {
253 if (mapping_writably_mapped(mp))
254 flush_dcache_page(pp);
255
256 void *pb = kmap(pp);
257 int error = dmu_read(os, zp->z_id, start + off,
258 nbytes, pb + off, DMU_READ_PREFETCH);
259 kunmap(pp);
260
261 if (error) {
262 SetPageError(pp);
263 ClearPageUptodate(pp);
264 } else {
265 ClearPageError(pp);
266 SetPageUptodate(pp);
267
268 if (mapping_writably_mapped(mp))
269 flush_dcache_page(pp);
270
271 mark_page_accessed(pp);
272 }
273
274 unlock_page(pp);
275 put_page(pp);
276 }
277
278 len -= nbytes;
279 off = 0;
280 }
281 }
282
283 /*
284 * When a file is memory mapped, we must keep the I/O data synchronized
285 * between the DMU cache and the memory mapped pages. Preferentially read
286 * from memory mapped pages, otherwise fallback to reading through the dmu.
287 */
288 int
mappedread(znode_t * zp,int nbytes,zfs_uio_t * uio)289 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
290 {
291 struct inode *ip = ZTOI(zp);
292 struct address_space *mp = ip->i_mapping;
293 int64_t start = uio->uio_loffset;
294 int64_t off = start & (PAGE_SIZE - 1);
295 int len = nbytes;
296 int error = 0;
297
298 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
299 uint64_t bytes = MIN(PAGE_SIZE - off, len);
300
301 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
302 if (pp) {
303
304 /*
305 * If filemap_fault() retries there exists a window
306 * where the page will be unlocked and not up to date.
307 * In this case we must try and fill the page.
308 */
309 if (unlikely(!PageUptodate(pp))) {
310 error = zfs_fillpage(ip, pp);
311 if (error) {
312 unlock_page(pp);
313 put_page(pp);
314 return (error);
315 }
316 }
317
318 ASSERT(PageUptodate(pp) || PageDirty(pp));
319
320 unlock_page(pp);
321
322 void *pb = kmap(pp);
323 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
324 kunmap(pp);
325
326 if (mapping_writably_mapped(mp))
327 flush_dcache_page(pp);
328
329 mark_page_accessed(pp);
330 put_page(pp);
331 } else {
332 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
333 uio, bytes, DMU_READ_PREFETCH);
334 }
335
336 len -= bytes;
337 off = 0;
338
339 if (error)
340 break;
341 }
342
343 return (error);
344 }
345 #endif /* _KERNEL */
346
/*
 * Threshold, in blocks, used by zfs_remove(): a file whose z_size exceeds
 * z_blksz * zfs_delete_blocks is flagged as too big, in which case only a
 * token amount is held for freeing in the tx and the delete-now path is not
 * taken.
 */
static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
348
349 /*
350 * Write the bytes to a file.
351 *
352 * IN: zp - znode of file to be written to
353 * data - bytes to write
354 * len - number of bytes to write
355 * pos - offset to start writing at
356 *
 *	OUT:	residp	- remaining bytes to write (may be NULL)
358 *
359 * RETURN: 0 if success
360 * positive error code if failure. EIO is returned
361 * for a short write when residp isn't provided.
362 *
363 * Timestamps:
364 * zp - ctime|mtime updated if byte count > 0
365 */
366 int
zfs_write_simple(znode_t * zp,const void * data,size_t len,loff_t pos,size_t * residp)367 zfs_write_simple(znode_t *zp, const void *data, size_t len,
368 loff_t pos, size_t *residp)
369 {
370 fstrans_cookie_t cookie;
371 int error;
372
373 struct iovec iov;
374 iov.iov_base = (void *)data;
375 iov.iov_len = len;
376
377 zfs_uio_t uio;
378 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
379
380 cookie = spl_fstrans_mark();
381 error = zfs_write(zp, &uio, 0, kcred);
382 spl_fstrans_unmark(cookie);
383
384 if (error == 0) {
385 if (residp != NULL)
386 *residp = zfs_uio_resid(&uio);
387 else if (zfs_uio_resid(&uio) != 0)
388 error = SET_ERROR(EIO);
389 }
390
391 return (error);
392 }
393
/*
 * Taskq callback dispatched by zfs_zrele_async(); arg is the inode whose
 * reference is dropped with iput().
 */
static void
zfs_rele_async_task(void *arg)
{
	struct inode *ip = arg;

	iput(ip);
}
399
400 void
zfs_zrele_async(znode_t * zp)401 zfs_zrele_async(znode_t *zp)
402 {
403 struct inode *ip = ZTOI(zp);
404 objset_t *os = ITOZSB(ip)->z_os;
405
406 ASSERT(atomic_read(&ip->i_count) > 0);
407 ASSERT(os != NULL);
408
409 /*
410 * If decrementing the count would put us at 0, we can't do it inline
411 * here, because that would be synchronous. Instead, dispatch an iput
412 * to run later.
413 *
414 * For more information on the dangers of a synchronous iput, see the
415 * header comment of this file.
416 */
417 if (!atomic_add_unless(&ip->i_count, -1, 1)) {
418 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
419 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
420 }
421 }
422
423
424 /*
425 * Lookup an entry in a directory, or an extended attribute directory.
426 * If it exists, return a held inode reference for it.
427 *
428 * IN: zdp - znode of directory to search.
429 * nm - name of entry to lookup.
430 * flags - LOOKUP_XATTR set if looking for an attribute.
431 * cr - credentials of caller.
432 * direntflags - directory lookup flags
433 * realpnp - returned pathname.
434 *
435 * OUT: zpp - znode of located entry, NULL if not found.
436 *
437 * RETURN: 0 on success, error code on failure.
438 *
439 * Timestamps:
440 * NA
441 */
442 int
zfs_lookup(znode_t * zdp,char * nm,znode_t ** zpp,int flags,cred_t * cr,int * direntflags,pathname_t * realpnp)443 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
444 int *direntflags, pathname_t *realpnp)
445 {
446 zfsvfs_t *zfsvfs = ZTOZSB(zdp);
447 int error = 0;
448
449 /*
450 * Fast path lookup, however we must skip DNLC lookup
451 * for case folding or normalizing lookups because the
452 * DNLC code only stores the passed in name. This means
453 * creating 'a' and removing 'A' on a case insensitive
454 * file system would work, but DNLC still thinks 'a'
455 * exists and won't let you create it again on the next
456 * pass through fast path.
457 */
458 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
459
460 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
461 return (SET_ERROR(ENOTDIR));
462 } else if (zdp->z_sa_hdl == NULL) {
463 return (SET_ERROR(EIO));
464 }
465
466 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
467 error = zfs_fastaccesschk_execute(zdp, cr);
468 if (!error) {
469 *zpp = zdp;
470 zhold(*zpp);
471 return (0);
472 }
473 return (error);
474 }
475 }
476
477 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
478 return (error);
479
480 *zpp = NULL;
481
482 if (flags & LOOKUP_XATTR) {
483 /*
484 * We don't allow recursive attributes..
485 * Maybe someday we will.
486 */
487 if (zdp->z_pflags & ZFS_XATTR) {
488 zfs_exit(zfsvfs, FTAG);
489 return (SET_ERROR(EINVAL));
490 }
491
492 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
493 zfs_exit(zfsvfs, FTAG);
494 return (error);
495 }
496
497 /*
498 * Do we have permission to get into attribute directory?
499 */
500
501 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
502 B_TRUE, cr, zfs_init_idmap))) {
503 zrele(*zpp);
504 *zpp = NULL;
505 }
506
507 zfs_exit(zfsvfs, FTAG);
508 return (error);
509 }
510
511 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
512 zfs_exit(zfsvfs, FTAG);
513 return (SET_ERROR(ENOTDIR));
514 }
515
516 /*
517 * Check accessibility of directory.
518 */
519
520 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
521 zfs_init_idmap))) {
522 zfs_exit(zfsvfs, FTAG);
523 return (error);
524 }
525
526 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
527 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
528 zfs_exit(zfsvfs, FTAG);
529 return (SET_ERROR(EILSEQ));
530 }
531
532 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
533 if ((error == 0) && (*zpp))
534 zfs_znode_update_vfs(*zpp);
535
536 zfs_exit(zfsvfs, FTAG);
537 return (error);
538 }
539
540 /*
541 * Perform a linear search in directory for the name of specific inode.
542 * Note we don't pass in the buffer size of name because it's hardcoded to
543 * NAME_MAX+1(256) in Linux.
544 *
545 * IN: dzp - znode of directory to search.
546 * zp - znode of the target
547 *
548 * OUT: name - dentry name of the target
549 *
550 * RETURN: 0 on success, error code on failure.
551 */
552 int
zfs_get_name(znode_t * dzp,char * name,znode_t * zp)553 zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
554 {
555 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
556 int error = 0;
557
558 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
559 return (error);
560
561 if ((error = zfs_verify_zp(zp)) != 0) {
562 zfs_exit(zfsvfs, FTAG);
563 return (error);
564 }
565
566 /* ctldir should have got their name in zfs_vget */
567 if (dzp->z_is_ctldir || zp->z_is_ctldir) {
568 zfs_exit(zfsvfs, FTAG);
569 return (ENOENT);
570 }
571
572 /* buffer len is hardcoded to 256 in Linux kernel */
573 error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
574 ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);
575
576 zfs_exit(zfsvfs, FTAG);
577 return (error);
578 }
579
580 /*
581 * Attempt to create a new entry in a directory. If the entry
582 * already exists, truncate the file if permissible, else return
 * an error.  Return the znode of the created or trunc'd file.
584 *
585 * IN: dzp - znode of directory to put new file entry in.
586 * name - name of new file entry.
587 * vap - attributes of new file.
588 * excl - flag indicating exclusive or non-exclusive mode.
589 * mode - mode to open file with.
590 * cr - credentials of caller.
591 * flag - file flag.
592 * vsecp - ACL to be set
593 * mnt_ns - user namespace of the mount
594 *
595 * OUT: zpp - znode of created or trunc'd entry.
596 *
597 * RETURN: 0 on success, error code on failure.
598 *
599 * Timestamps:
600 * dzp - ctime|mtime updated if new entry created
601 * zp - ctime|mtime always, atime if new
602 */
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
    int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	znode_t *zp;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zilog_t *zilog;
	objset_t *os;
	zfs_dirlock_t *dl;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	/*
	 * Set once zfs_acl_ids_create() has succeeded.  acl_ids is kept
	 * across ERESTART retries (goto top) and must be freed on every
	 * path that leaves after that point.
	 */
	boolean_t have_acl = B_FALSE;
	/* Set after dmu_tx_wait(); later assigns add DMU_TX_NOTHROTTLE. */
	boolean_t waited = B_FALSE;
	boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* On a UTF-8-only file system the name must be valid UTF-8. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Retry point: reached again after dmu_tx_assign() returns ERESTART.
	 * The dirent lock has been dropped by then, so the name is looked up
	 * afresh and may now exist.
	 */
top:
	*zpp = NULL;
	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		zhold(dzp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible igrab(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			/* A failed lock on ".." is reported as EISDIR. */
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;
		uint64_t projid = ZFS_DEFAULT_PROJID;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
		    mnt_ns))) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* Build the ACL ids only once; retries reuse them. */
		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
			goto out;
		have_acl = B_TRUE;

		/* Regular files and directories inherit the project id. */
		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
			projid = zfs_inherit_projid(dzp);
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		/* Hold everything the create may modify. */
		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}

		/*
		 * The dirent lock is held, so the assign must not block
		 * (rule 4 of the programming rules at the top of this file).
		 */
		error = dmu_tx_assign(tx,
		    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				/* Wait with no locks held, then start over. */
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		error = zfs_link_create(dl, zp, tx, ZNEW);
		if (error != 0) {
			/*
			 * Since, we failed to add the directory entry for it,
			 * delete the newly created dnode.
			 */
			zfs_znode_delete(zp, tx);
			remove_inode_hash(ZTOI(zp));
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_commit(tx);
			goto out;
		}

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		/* Log the create before the tx commits and locks drop. */
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & O_APPEND) ? V_APPEND : 0;

		/* ACL ids built on an earlier pass are no longer needed. */
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if (S_ISDIR(ZTOI(zp)->i_mode)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
		    mnt_ns))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if (S_ISREG(ZTOI(zp)->i_mode) &&
		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			if (dl) {
				zfs_dirent_unlock(dl);
				dl = NULL;
			}
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	/* On failure drop the hold on zp; on success hand it to the caller. */
	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*zpp = zp;
	}

	/*
	 * NOTE(review): if zil_commit() fails here, *zpp has already been
	 * set and its hold is not released in this function -- confirm that
	 * callers cope with a non-NULL *zpp alongside an error return.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
850
/*
 * Create a new, nameless file object under dzp's file system.  No directory
 * entry is added; the new znode is marked unlinked and placed on the
 * unlinked set in the same tx that creates it.
 *
 *	IN:	dip	- inode of the directory the file is created against.
 *		vap	- attributes of new file.
 *		excl	- unused.
 *		mode	- unused.
 *		cr	- credentials of caller.
 *		flag	- unused.
 *		vsecp	- ACL to be set.
 *		mnt_ns	- user namespace of the mount.
 *
 *	OUT:	ipp	- inode of the created file, NULL on failure.
 *
 *	RETURN:	0 on success, error code on failure.
 */
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
    int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
    zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp = NULL, *dzp = ITOZ(dip);
	zfsvfs_t *zfsvfs = ITOZSB(dip);
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid;
	gid_t gid;
	zfs_acl_ids_t acl_ids;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	boolean_t fuid_dirtied;
	/*
	 * Set once zfs_acl_ids_create() has succeeded; acl_ids is then kept
	 * across ERESTART retries (goto top).
	 */
	boolean_t have_acl = B_FALSE;
	/* Set after dmu_tx_wait(); later assigns add DMU_TX_NOTHROTTLE. */
	boolean_t waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	gid = crgetgid(cr);
	uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;

	if (vap->va_mask & ATTR_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_mode)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/* Retry point after dmu_tx_assign() returns ERESTART. */
top:
	*ipp = NULL;

	/*
	 * The caller must be allowed to add a file to the directory, even
	 * though the new object is never linked into it (see the
	 * unlinked-set insertion below).
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		goto out;
	}

	/* Build the ACL ids only once; retries reuse them. */
	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
		goto out;
	have_acl = B_TRUE;

	/* Regular files and directories inherit the project id. */
	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	tx = dmu_tx_create(os);

	/* Hold the new object's SA and the unlinked-set ZAP. */
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			/* Wait for the tx to become assignable, then retry. */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Add to unlinked set */
	zp->z_unlinked = B_TRUE;
	zfs_unlinked_add(zp, tx);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);
out:

	/* On failure drop any hold on zp; on success return its inode. */
	if (error) {
		if (zp)
			zrele(zp);
	} else {
		zfs_znode_update_vfs(dzp);
		zfs_znode_update_vfs(zp);
		*ipp = ZTOI(zp);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
972
973 /*
974 * Remove an entry from a directory.
975 *
976 * IN: dzp - znode of directory to remove entry from.
977 * name - name of entry to remove.
978 * cr - credentials of caller.
979 * flags - case flags.
980 *
981 * RETURN: 0 if success
982 * error code if failure
983 *
984 * Timestamps:
985 * dzp - ctime|mtime
 *	zp - ctime (if nlink > 0)
987 */
988
/*
 * Zero value that zfs_remove() writes into SA_ZPL_XATTR of a non-SA znode
 * on its delete-now path; SA znodes have the attribute removed instead.
 */
static uint64_t null_xattr = 0;
990
991 int
zfs_remove(znode_t * dzp,char * name,cred_t * cr,int flags)992 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
993 {
994 znode_t *zp;
995 znode_t *xzp;
996 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
997 zilog_t *zilog;
998 uint64_t acl_obj, xattr_obj;
999 uint64_t xattr_obj_unlinked = 0;
1000 uint64_t obj = 0;
1001 uint64_t links;
1002 zfs_dirlock_t *dl;
1003 dmu_tx_t *tx;
1004 boolean_t may_delete_now, delete_now = FALSE;
1005 boolean_t unlinked, toobig = FALSE;
1006 uint64_t txtype;
1007 pathname_t *realnmp = NULL;
1008 pathname_t realnm;
1009 int error;
1010 int zflg = ZEXISTS;
1011 boolean_t waited = B_FALSE;
1012
1013 if (name == NULL)
1014 return (SET_ERROR(EINVAL));
1015
1016 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1017 return (error);
1018 zilog = zfsvfs->z_log;
1019
1020 if (flags & FIGNORECASE) {
1021 zflg |= ZCILOOK;
1022 pn_alloc(&realnm);
1023 realnmp = &realnm;
1024 }
1025
1026 top:
1027 xattr_obj = 0;
1028 xzp = NULL;
1029 /*
1030 * Attempt to lock directory; fail if entry doesn't exist.
1031 */
1032 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1033 NULL, realnmp))) {
1034 if (realnmp)
1035 pn_free(realnmp);
1036 zfs_exit(zfsvfs, FTAG);
1037 return (error);
1038 }
1039
1040 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1041 goto out;
1042 }
1043
1044 /*
1045 * Need to use rmdir for removing directories.
1046 */
1047 if (S_ISDIR(ZTOI(zp)->i_mode)) {
1048 error = SET_ERROR(EPERM);
1049 goto out;
1050 }
1051
1052 mutex_enter(&zp->z_lock);
1053 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1054 !zn_has_cached_data(zp, 0, LLONG_MAX);
1055 mutex_exit(&zp->z_lock);
1056
1057 /*
1058 * We may delete the znode now, or we may put it in the unlinked set;
1059 * it depends on whether we're the last link, and on whether there are
1060 * other holds on the inode. So we dmu_tx_hold() the right things to
1061 * allow for either case.
1062 */
1063 obj = zp->z_id;
1064 tx = dmu_tx_create(zfsvfs->z_os);
1065 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1066 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1067 zfs_sa_upgrade_txholds(tx, zp);
1068 zfs_sa_upgrade_txholds(tx, dzp);
1069 if (may_delete_now) {
1070 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1071 /* if the file is too big, only hold_free a token amount */
1072 dmu_tx_hold_free(tx, zp->z_id, 0,
1073 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1074 }
1075
1076 /* are there any extended attributes? */
1077 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1078 &xattr_obj, sizeof (xattr_obj));
1079 if (error == 0 && xattr_obj) {
1080 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1081 ASSERT0(error);
1082 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1083 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1084 }
1085
1086 mutex_enter(&zp->z_lock);
1087 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1088 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1089 mutex_exit(&zp->z_lock);
1090
1091 /* charge as an update -- would be nice not to charge at all */
1092 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1093
1094 /*
1095 * Mark this transaction as typically resulting in a net free of space
1096 */
1097 dmu_tx_mark_netfree(tx);
1098
1099 error = dmu_tx_assign(tx,
1100 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1101 if (error) {
1102 zfs_dirent_unlock(dl);
1103 if (error == ERESTART) {
1104 waited = B_TRUE;
1105 dmu_tx_wait(tx);
1106 dmu_tx_abort(tx);
1107 zrele(zp);
1108 if (xzp)
1109 zrele(xzp);
1110 goto top;
1111 }
1112 if (realnmp)
1113 pn_free(realnmp);
1114 dmu_tx_abort(tx);
1115 zrele(zp);
1116 if (xzp)
1117 zrele(xzp);
1118 zfs_exit(zfsvfs, FTAG);
1119 return (error);
1120 }
1121
1122 /*
1123 * Remove the directory entry.
1124 */
1125 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1126
1127 if (error) {
1128 dmu_tx_commit(tx);
1129 goto out;
1130 }
1131
1132 if (unlinked) {
1133 /*
1134 * Hold z_lock so that we can make sure that the ACL obj
1135 * hasn't changed. Could have been deleted due to
1136 * zfs_sa_upgrade().
1137 */
1138 mutex_enter(&zp->z_lock);
1139 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1140 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1141 delete_now = may_delete_now && !toobig &&
1142 atomic_read(&ZTOI(zp)->i_count) == 1 &&
1143 !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1144 xattr_obj == xattr_obj_unlinked &&
1145 zfs_external_acl(zp) == acl_obj;
1146 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1147 }
1148
1149 if (delete_now) {
1150 if (xattr_obj_unlinked) {
1151 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1152 mutex_enter(&xzp->z_lock);
1153 xzp->z_unlinked = B_TRUE;
1154 clear_nlink(ZTOI(xzp));
1155 links = 0;
1156 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1157 &links, sizeof (links), tx);
1158 ASSERT3U(error, ==, 0);
1159 mutex_exit(&xzp->z_lock);
1160 zfs_unlinked_add(xzp, tx);
1161
1162 if (zp->z_is_sa)
1163 error = sa_remove(zp->z_sa_hdl,
1164 SA_ZPL_XATTR(zfsvfs), tx);
1165 else
1166 error = sa_update(zp->z_sa_hdl,
1167 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1168 sizeof (uint64_t), tx);
1169 ASSERT0(error);
1170 }
1171 /*
1172 * Add to the unlinked set because a new reference could be
1173 * taken concurrently resulting in a deferred destruction.
1174 */
1175 zfs_unlinked_add(zp, tx);
1176 mutex_exit(&zp->z_lock);
1177 } else if (unlinked) {
1178 mutex_exit(&zp->z_lock);
1179 zfs_unlinked_add(zp, tx);
1180 }
1181
1182 txtype = TX_REMOVE;
1183 if (flags & FIGNORECASE)
1184 txtype |= TX_CI;
1185 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1186
1187 dmu_tx_commit(tx);
1188 out:
1189 if (realnmp)
1190 pn_free(realnmp);
1191
1192 zfs_dirent_unlock(dl);
1193 zfs_znode_update_vfs(dzp);
1194 zfs_znode_update_vfs(zp);
1195
1196 if (delete_now)
1197 zrele(zp);
1198 else
1199 zfs_zrele_async(zp);
1200
1201 if (xzp) {
1202 zfs_znode_update_vfs(xzp);
1203 zfs_zrele_async(xzp);
1204 }
1205
1206 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1207 error = zil_commit(zilog, 0);
1208
1209 zfs_exit(zfsvfs, FTAG);
1210 return (error);
1211 }
1212
1213 /*
1214 * Create a new directory and insert it into dzp using the name
1215 * provided. Return a pointer to the inserted directory.
1216 *
1217 * IN: dzp - znode of directory to add subdir to.
1218 * dirname - name of new directory.
1219 * vap - attributes of new directory.
1220 * cr - credentials of caller.
1221 * flags - case flags.
1222 * vsecp - ACL to be set
1223 * mnt_ns - user namespace of the mount
1224 *
1225 * OUT: zpp - znode of created directory.
1226 *
1227 * RETURN: 0 if success
1228 * error code if failure
1229 *
1230 * Timestamps:
1231 * dzp - ctime|mtime updated
1232 * zpp - ctime|mtime|atime updated
1233 */
1234 int
zfs_mkdir(znode_t * dzp,char * dirname,vattr_t * vap,znode_t ** zpp,cred_t * cr,int flags,vsecattr_t * vsecp,zidmap_t * mnt_ns)1235 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1236 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1237 {
1238 znode_t *zp;
1239 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1240 zilog_t *zilog;
1241 zfs_dirlock_t *dl;
1242 uint64_t txtype;
1243 dmu_tx_t *tx;
1244 int error;
1245 int zf = ZNEW;
1246 uid_t uid;
1247 gid_t gid = crgetgid(cr);
1248 zfs_acl_ids_t acl_ids;
1249 boolean_t fuid_dirtied;
1250 boolean_t waited = B_FALSE;
1251
1252 ASSERT(S_ISDIR(vap->va_mode));
1253
1254 /*
1255 * If we have an ephemeral id, ACL, or XVATTR then
1256 * make sure file system is at proper version
1257 */
1258
1259 uid = crgetuid(cr);
1260 if (zfsvfs->z_use_fuids == B_FALSE &&
1261 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1262 return (SET_ERROR(EINVAL));
1263
1264 if (dirname == NULL)
1265 return (SET_ERROR(EINVAL));
1266
1267 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1268 return (error);
1269 zilog = zfsvfs->z_log;
1270
1271 if (dzp->z_pflags & ZFS_XATTR) {
1272 zfs_exit(zfsvfs, FTAG);
1273 return (SET_ERROR(EINVAL));
1274 }
1275
1276 if (zfsvfs->z_utf8 && u8_validate(dirname,
1277 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1278 zfs_exit(zfsvfs, FTAG);
1279 return (SET_ERROR(EILSEQ));
1280 }
1281 if (flags & FIGNORECASE)
1282 zf |= ZCILOOK;
1283
1284 if (vap->va_mask & ATTR_XVATTR) {
1285 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1286 crgetuid(cr), cr, vap->va_mode)) != 0) {
1287 zfs_exit(zfsvfs, FTAG);
1288 return (error);
1289 }
1290 }
1291
1292 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1293 vsecp, &acl_ids, mnt_ns)) != 0) {
1294 zfs_exit(zfsvfs, FTAG);
1295 return (error);
1296 }
1297 /*
1298 * First make sure the new directory doesn't exist.
1299 *
1300 * Existence is checked first to make sure we don't return
1301 * EACCES instead of EEXIST which can cause some applications
1302 * to fail.
1303 */
1304 top:
1305 *zpp = NULL;
1306
1307 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1308 NULL, NULL))) {
1309 zfs_acl_ids_free(&acl_ids);
1310 zfs_exit(zfsvfs, FTAG);
1311 return (error);
1312 }
1313
1314 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1315 mnt_ns))) {
1316 zfs_acl_ids_free(&acl_ids);
1317 zfs_dirent_unlock(dl);
1318 zfs_exit(zfsvfs, FTAG);
1319 return (error);
1320 }
1321
1322 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1323 zfs_acl_ids_free(&acl_ids);
1324 zfs_dirent_unlock(dl);
1325 zfs_exit(zfsvfs, FTAG);
1326 return (SET_ERROR(EDQUOT));
1327 }
1328
1329 /*
1330 * Add a new entry to the directory.
1331 */
1332 tx = dmu_tx_create(zfsvfs->z_os);
1333 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1334 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1335 fuid_dirtied = zfsvfs->z_fuid_dirty;
1336 if (fuid_dirtied)
1337 zfs_fuid_txhold(zfsvfs, tx);
1338 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1339 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1340 acl_ids.z_aclp->z_acl_bytes);
1341 }
1342
1343 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1344 ZFS_SA_BASE_ATTR_SIZE);
1345
1346 error = dmu_tx_assign(tx,
1347 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1348 if (error) {
1349 zfs_dirent_unlock(dl);
1350 if (error == ERESTART) {
1351 waited = B_TRUE;
1352 dmu_tx_wait(tx);
1353 dmu_tx_abort(tx);
1354 goto top;
1355 }
1356 zfs_acl_ids_free(&acl_ids);
1357 dmu_tx_abort(tx);
1358 zfs_exit(zfsvfs, FTAG);
1359 return (error);
1360 }
1361
1362 /*
1363 * Create new node.
1364 */
1365 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1366
1367 /*
1368 * Now put new name in parent dir.
1369 */
1370 error = zfs_link_create(dl, zp, tx, ZNEW);
1371 if (error != 0) {
1372 zfs_znode_delete(zp, tx);
1373 remove_inode_hash(ZTOI(zp));
1374 goto out;
1375 }
1376
1377 if (fuid_dirtied)
1378 zfs_fuid_sync(zfsvfs, tx);
1379
1380 *zpp = zp;
1381
1382 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1383 if (flags & FIGNORECASE)
1384 txtype |= TX_CI;
1385 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1386 acl_ids.z_fuidp, vap);
1387
1388 out:
1389 zfs_acl_ids_free(&acl_ids);
1390
1391 dmu_tx_commit(tx);
1392
1393 zfs_dirent_unlock(dl);
1394
1395 if (error != 0) {
1396 zrele(zp);
1397 } else {
1398 zfs_znode_update_vfs(dzp);
1399 zfs_znode_update_vfs(zp);
1400
1401 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1402 error = zil_commit(zilog, 0);
1403
1404 }
1405 zfs_exit(zfsvfs, FTAG);
1406 return (error);
1407 }
1408
1409 /*
1410 * Remove a directory subdir entry. If the current working
1411 * directory is the same as the subdir to be removed, the
1412 * remove will fail.
1413 *
1414 * IN: dzp - znode of directory to remove from.
1415 * name - name of directory to be removed.
1416 * cwd - inode of current working directory.
1417 * cr - credentials of caller.
1418 * flags - case flags
1419 *
1420 * RETURN: 0 on success, error code on failure.
1421 *
1422 * Timestamps:
1423 * dzp - ctime|mtime updated
1424 */
1425 int
zfs_rmdir(znode_t * dzp,char * name,znode_t * cwd,cred_t * cr,int flags)1426 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1427 int flags)
1428 {
1429 znode_t *zp;
1430 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1431 zilog_t *zilog;
1432 zfs_dirlock_t *dl;
1433 dmu_tx_t *tx;
1434 int error;
1435 int zflg = ZEXISTS;
1436 boolean_t waited = B_FALSE;
1437
1438 if (name == NULL)
1439 return (SET_ERROR(EINVAL));
1440
1441 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1442 return (error);
1443 zilog = zfsvfs->z_log;
1444
1445 if (flags & FIGNORECASE)
1446 zflg |= ZCILOOK;
1447 top:
1448 zp = NULL;
1449
1450 /*
1451 * Attempt to lock directory; fail if entry doesn't exist.
1452 */
1453 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1454 NULL, NULL))) {
1455 zfs_exit(zfsvfs, FTAG);
1456 return (error);
1457 }
1458
1459 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1460 goto out;
1461 }
1462
1463 if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1464 error = SET_ERROR(ENOTDIR);
1465 goto out;
1466 }
1467
1468 if (zp == cwd) {
1469 error = SET_ERROR(EINVAL);
1470 goto out;
1471 }
1472
1473 /*
1474 * Grab a lock on the directory to make sure that no one is
1475 * trying to add (or lookup) entries while we are removing it.
1476 */
1477 rw_enter(&zp->z_name_lock, RW_WRITER);
1478
1479 /*
1480 * Grab a lock on the parent pointer to make sure we play well
1481 * with the treewalk and directory rename code.
1482 */
1483 rw_enter(&zp->z_parent_lock, RW_WRITER);
1484
1485 tx = dmu_tx_create(zfsvfs->z_os);
1486 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1487 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1488 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1489 zfs_sa_upgrade_txholds(tx, zp);
1490 zfs_sa_upgrade_txholds(tx, dzp);
1491 dmu_tx_mark_netfree(tx);
1492 error = dmu_tx_assign(tx,
1493 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
1494 if (error) {
1495 rw_exit(&zp->z_parent_lock);
1496 rw_exit(&zp->z_name_lock);
1497 zfs_dirent_unlock(dl);
1498 if (error == ERESTART) {
1499 waited = B_TRUE;
1500 dmu_tx_wait(tx);
1501 dmu_tx_abort(tx);
1502 zrele(zp);
1503 goto top;
1504 }
1505 dmu_tx_abort(tx);
1506 zrele(zp);
1507 zfs_exit(zfsvfs, FTAG);
1508 return (error);
1509 }
1510
1511 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1512
1513 if (error == 0) {
1514 uint64_t txtype = TX_RMDIR;
1515 if (flags & FIGNORECASE)
1516 txtype |= TX_CI;
1517 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1518 B_FALSE);
1519 }
1520
1521 dmu_tx_commit(tx);
1522
1523 rw_exit(&zp->z_parent_lock);
1524 rw_exit(&zp->z_name_lock);
1525 out:
1526 zfs_dirent_unlock(dl);
1527
1528 zfs_znode_update_vfs(dzp);
1529 zfs_znode_update_vfs(zp);
1530 zrele(zp);
1531
1532 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1533 error = zil_commit(zilog, 0);
1534
1535 zfs_exit(zfsvfs, FTAG);
1536 return (error);
1537 }
1538
1539 /*
1540 * Read directory entries from the given directory cursor position and emit
1541 * name and position for each entry.
1542 *
1543 * IN: ip - inode of directory to read.
1544 * ctx - directory entry context.
1545 * cr - credentials of caller.
1546 *
1547 * RETURN: 0 if success
1548 * error code if failure
1549 *
1550 * Timestamps:
1551 * ip - atime updated
1552 *
1553 * Note that the low 4 bits of the cookie returned by zap is always zero.
1554 * This allows us to use the low range for "special" directory entries:
1555 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
1556 * we use the offset 2 for the '.zfs' directory.
1557 */
1558 int
zfs_readdir(struct inode * ip,struct dir_context * ctx,cred_t * cr)1559 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
1560 {
1561 (void) cr;
1562 znode_t *zp = ITOZ(ip);
1563 zfsvfs_t *zfsvfs = ITOZSB(ip);
1564 objset_t *os;
1565 zap_cursor_t zc;
1566 zap_attribute_t *zap;
1567 int error;
1568 uint8_t prefetch;
1569 uint8_t type;
1570 int done = 0;
1571 uint64_t parent;
1572 uint64_t offset; /* must be unsigned; checks for < 1 */
1573
1574 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1575 return (error);
1576
1577 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1578 &parent, sizeof (parent))) != 0)
1579 goto out;
1580
1581 /*
1582 * Quit if directory has been removed (posix)
1583 */
1584 if (zp->z_unlinked)
1585 goto out;
1586
1587 error = 0;
1588 os = zfsvfs->z_os;
1589 offset = ctx->pos;
1590 prefetch = zp->z_zn_prefetch;
1591 zap = zap_attribute_long_alloc();
1592
1593 /*
1594 * Initialize the iterator cursor.
1595 */
1596 if (offset <= 3) {
1597 /*
1598 * Start iteration from the beginning of the directory.
1599 */
1600 zap_cursor_init(&zc, os, zp->z_id);
1601 } else {
1602 /*
1603 * The offset is a serialized cursor.
1604 */
1605 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1606 }
1607
1608 /*
1609 * Transform to file-system independent format
1610 */
1611 while (!done) {
1612 uint64_t objnum;
1613 /*
1614 * Special case `.', `..', and `.zfs'.
1615 */
1616 if (offset == 0) {
1617 (void) strcpy(zap->za_name, ".");
1618 zap->za_normalization_conflict = 0;
1619 objnum = zp->z_id;
1620 type = DT_DIR;
1621 } else if (offset == 1) {
1622 (void) strcpy(zap->za_name, "..");
1623 zap->za_normalization_conflict = 0;
1624 objnum = parent;
1625 type = DT_DIR;
1626 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1627 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1628 zap->za_normalization_conflict = 0;
1629 objnum = ZFSCTL_INO_ROOT;
1630 type = DT_DIR;
1631 } else {
1632 /*
1633 * Grab next entry.
1634 */
1635 if ((error = zap_cursor_retrieve(&zc, zap))) {
1636 if (error == ENOENT)
1637 break;
1638 else
1639 goto update;
1640 }
1641
1642 /*
1643 * Allow multiple entries provided the first entry is
1644 * the object id. Non-zpl consumers may safely make
1645 * use of the additional space.
1646 *
1647 * XXX: This should be a feature flag for compatibility
1648 */
1649 if (zap->za_integer_length != 8 ||
1650 zap->za_num_integers == 0) {
1651 cmn_err(CE_WARN, "zap_readdir: bad directory "
1652 "entry, obj = %lld, offset = %lld, "
1653 "length = %d, num = %lld\n",
1654 (u_longlong_t)zp->z_id,
1655 (u_longlong_t)offset,
1656 zap->za_integer_length,
1657 (u_longlong_t)zap->za_num_integers);
1658 error = SET_ERROR(ENXIO);
1659 goto update;
1660 }
1661
1662 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1663 type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1664 }
1665
1666 done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
1667 objnum, type);
1668 if (done)
1669 break;
1670
1671 if (prefetch)
1672 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1673
1674 /*
1675 * Move to the next entry, fill in the previous offset.
1676 */
1677 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1678 zap_cursor_advance(&zc);
1679 offset = zap_cursor_serialize(&zc);
1680 } else {
1681 offset += 1;
1682 }
1683 ctx->pos = offset;
1684 }
1685 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1686
1687 update:
1688 zap_cursor_fini(&zc);
1689 zap_attribute_free(zap);
1690 if (error == ENOENT)
1691 error = 0;
1692 out:
1693 zfs_exit(zfsvfs, FTAG);
1694
1695 return (error);
1696 }
1697
1698 /*
1699 * Get the basic file attributes and place them in the provided kstat
1700 * structure. The inode is assumed to be the authoritative source
1701 * for most of the attributes. However, the znode currently has the
1702 * authoritative atime, blksize, and block count.
1703 *
1704 * IN: ip - inode of file.
1705 *
1706 * OUT: sp - kstat values.
1707 *
1708 * RETURN: 0 (always succeeds)
1709 */
1710 int
1711 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
zfs_getattr_fast(zidmap_t * user_ns,u32 request_mask,struct inode * ip,struct kstat * sp)1712 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
1713 struct kstat *sp)
1714 #else
1715 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1716 #endif
1717 {
1718 znode_t *zp = ITOZ(ip);
1719 zfsvfs_t *zfsvfs = ITOZSB(ip);
1720 uint32_t blksize;
1721 u_longlong_t nblocks;
1722 int error;
1723
1724 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1725 return (error);
1726
1727 mutex_enter(&zp->z_lock);
1728
1729 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1730 zpl_generic_fillattr(user_ns, request_mask, ip, sp);
1731 #else
1732 zpl_generic_fillattr(user_ns, ip, sp);
1733 #endif
1734 /*
1735 * +1 link count for root inode with visible '.zfs' directory.
1736 */
1737 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1738 if (sp->nlink < ZFS_LINK_MAX)
1739 sp->nlink++;
1740
1741 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1742 sp->blksize = blksize;
1743 sp->blocks = nblocks;
1744
1745 if (unlikely(zp->z_blksz == 0)) {
1746 /*
1747 * Block size hasn't been set; suggest maximal I/O transfers.
1748 */
1749 sp->blksize = zfsvfs->z_max_blksz;
1750 }
1751
1752 mutex_exit(&zp->z_lock);
1753
1754 /*
1755 * Required to prevent NFS client from detecting different inode
1756 * numbers of snapshot root dentry before and after snapshot mount.
1757 */
1758 if (zfsvfs->z_issnap) {
1759 if (ip->i_sb->s_root->d_inode == ip)
1760 sp->ino = ZFSCTL_INO_SNAPDIRS -
1761 dmu_objset_id(zfsvfs->z_os);
1762 }
1763
1764 zfs_exit(zfsvfs, FTAG);
1765
1766 return (0);
1767 }
1768
1769 /*
1770 * For the operation of changing file's user/group/project, we need to
1771 * handle not only the main object that is assigned to the file directly,
1772 * but also the ones that are used by the file via hidden xattr directory.
1773 *
1774 * Because the xattr directory may contains many EA entries, as to it may
1775 * be impossible to change all of them via the transaction of changing the
1776 * main object's user/group/project attributes. Then we have to change them
1777 * via other multiple independent transactions one by one. It may be not good
1778 * solution, but we have no better idea yet.
1779 */
1780 static int
zfs_setattr_dir(znode_t * dzp)1781 zfs_setattr_dir(znode_t *dzp)
1782 {
1783 struct inode *dxip = ZTOI(dzp);
1784 struct inode *xip = NULL;
1785 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
1786 objset_t *os = zfsvfs->z_os;
1787 zap_cursor_t zc;
1788 zap_attribute_t *zap;
1789 zfs_dirlock_t *dl;
1790 znode_t *zp = NULL;
1791 dmu_tx_t *tx = NULL;
1792 uint64_t uid, gid;
1793 sa_bulk_attr_t bulk[4];
1794 int count;
1795 int err;
1796
1797 zap = zap_attribute_alloc();
1798 zap_cursor_init(&zc, os, dzp->z_id);
1799 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
1800 count = 0;
1801 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
1802 err = ENXIO;
1803 break;
1804 }
1805
1806 err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
1807 ZEXISTS, NULL, NULL);
1808 if (err == ENOENT)
1809 goto next;
1810 if (err)
1811 break;
1812
1813 xip = ZTOI(zp);
1814 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1815 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1816 zp->z_projid == dzp->z_projid)
1817 goto next;
1818
1819 tx = dmu_tx_create(os);
1820 if (!(zp->z_pflags & ZFS_PROJID))
1821 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1822 else
1823 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1824
1825 err = dmu_tx_assign(tx, DMU_TX_WAIT);
1826 if (err)
1827 break;
1828
1829 mutex_enter(&dzp->z_lock);
1830
1831 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1832 xip->i_uid = dxip->i_uid;
1833 uid = zfs_uid_read(dxip);
1834 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1835 &uid, sizeof (uid));
1836 }
1837
1838 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1839 xip->i_gid = dxip->i_gid;
1840 gid = zfs_gid_read(dxip);
1841 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1842 &gid, sizeof (gid));
1843 }
1844
1845
1846 uint64_t projid = dzp->z_projid;
1847 if (zp->z_projid != projid) {
1848 if (!(zp->z_pflags & ZFS_PROJID)) {
1849 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
1850 if (unlikely(err == EEXIST)) {
1851 err = 0;
1852 } else if (err != 0) {
1853 goto sa_add_projid_err;
1854 } else {
1855 projid = ZFS_INVALID_PROJID;
1856 }
1857 }
1858
1859 if (projid != ZFS_INVALID_PROJID) {
1860 zp->z_projid = projid;
1861 SA_ADD_BULK_ATTR(bulk, count,
1862 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
1863 sizeof (zp->z_projid));
1864 }
1865 }
1866
1867 sa_add_projid_err:
1868 mutex_exit(&dzp->z_lock);
1869
1870 if (likely(count > 0)) {
1871 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1872 dmu_tx_commit(tx);
1873 } else if (projid == ZFS_INVALID_PROJID) {
1874 dmu_tx_commit(tx);
1875 } else {
1876 dmu_tx_abort(tx);
1877 }
1878 tx = NULL;
1879 if (err != 0 && err != ENOENT)
1880 break;
1881
1882 next:
1883 if (zp) {
1884 zrele(zp);
1885 zp = NULL;
1886 zfs_dirent_unlock(dl);
1887 }
1888 zap_cursor_advance(&zc);
1889 }
1890
1891 if (tx)
1892 dmu_tx_abort(tx);
1893 if (zp) {
1894 zrele(zp);
1895 zfs_dirent_unlock(dl);
1896 }
1897 zap_cursor_fini(&zc);
1898 zap_attribute_free(zap);
1899
1900 return (err == ENOENT ? 0 : err);
1901 }
1902
1903 /*
1904 * Set the file attributes to the values contained in the
1905 * vattr structure.
1906 *
1907 * IN: zp - znode of file to be modified.
1908 * vap - new attribute values.
1909 * If ATTR_XVATTR set, then optional attrs are being set
1910 * flags - ATTR_UTIME set if non-default time values provided.
1911 * - ATTR_NOACLCHECK (CIFS context only).
1912 * cr - credentials of caller.
1913 * mnt_ns - user namespace of the mount
1914 *
1915 * RETURN: 0 if success
1916 * error code if failure
1917 *
1918 * Timestamps:
1919 * ip - ctime updated, mtime updated if size changed.
1920 */
1921 int
zfs_setattr(znode_t * zp,vattr_t * vap,int flags,cred_t * cr,zidmap_t * mnt_ns)1922 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1923 {
1924 struct inode *ip;
1925 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1926 objset_t *os;
1927 zilog_t *zilog;
1928 dmu_tx_t *tx;
1929 vattr_t oldva;
1930 xvattr_t *tmpxvattr;
1931 uint_t mask = vap->va_mask;
1932 uint_t saved_mask = 0;
1933 int trim_mask = 0;
1934 uint64_t new_mode;
1935 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1936 uint64_t xattr_obj;
1937 uint64_t mtime[2], ctime[2], atime[2];
1938 uint64_t projid = ZFS_INVALID_PROJID;
1939 znode_t *attrzp;
1940 int need_policy = FALSE;
1941 int err, err2 = 0;
1942 zfs_fuid_info_t *fuidp = NULL;
1943 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
1944 xoptattr_t *xoap;
1945 zfs_acl_t *aclp;
1946 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1947 boolean_t fuid_dirtied = B_FALSE;
1948 boolean_t handle_eadir = B_FALSE;
1949 sa_bulk_attr_t *bulk, *xattr_bulk;
1950 int count = 0, xattr_count = 0, bulks = 8;
1951
1952 if (mask == 0)
1953 return (0);
1954
1955 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1956 return (err);
1957 ip = ZTOI(zp);
1958 os = zfsvfs->z_os;
1959
1960 /*
1961 * If this is a xvattr_t, then get a pointer to the structure of
1962 * optional attributes. If this is NULL, then we have a vattr_t.
1963 */
1964 xoap = xva_getxoptattr(xvap);
1965 if (xoap != NULL && (mask & ATTR_XVATTR)) {
1966 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1967 if (!dmu_objset_projectquota_enabled(os) ||
1968 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1969 zfs_exit(zfsvfs, FTAG);
1970 return (SET_ERROR(ENOTSUP));
1971 }
1972
1973 projid = xoap->xoa_projid;
1974 if (unlikely(projid == ZFS_INVALID_PROJID)) {
1975 zfs_exit(zfsvfs, FTAG);
1976 return (SET_ERROR(EINVAL));
1977 }
1978
1979 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1980 projid = ZFS_INVALID_PROJID;
1981 else
1982 need_policy = TRUE;
1983 }
1984
1985 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1986 (xoap->xoa_projinherit !=
1987 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1988 (!dmu_objset_projectquota_enabled(os) ||
1989 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1990 zfs_exit(zfsvfs, FTAG);
1991 return (SET_ERROR(ENOTSUP));
1992 }
1993 }
1994
1995 zilog = zfsvfs->z_log;
1996
1997 /*
1998 * Make sure that if we have ephemeral uid/gid or xvattr specified
1999 * that file system is at proper version level
2000 */
2001
2002 if (zfsvfs->z_use_fuids == B_FALSE &&
2003 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2004 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2005 (mask & ATTR_XVATTR))) {
2006 zfs_exit(zfsvfs, FTAG);
2007 return (SET_ERROR(EINVAL));
2008 }
2009
2010 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
2011 zfs_exit(zfsvfs, FTAG);
2012 return (SET_ERROR(EISDIR));
2013 }
2014
2015 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
2016 zfs_exit(zfsvfs, FTAG);
2017 return (SET_ERROR(EINVAL));
2018 }
2019
2020 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
2021 xva_init(tmpxvattr);
2022
2023 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2024 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
2025
2026 /*
2027 * Immutable files can only alter immutable bit and atime
2028 */
2029 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2030 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
2031 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2032 err = SET_ERROR(EPERM);
2033 goto out3;
2034 }
2035
2036 if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2037 err = SET_ERROR(EPERM);
2038 goto out3;
2039 }
2040
2041 /*
2042 * Verify timestamps doesn't overflow 32 bits.
2043 * ZFS can handle large timestamps, but 32bit syscalls can't
2044 * handle times greater than 2039. This check should be removed
2045 * once large timestamps are fully supported.
2046 */
2047 if (mask & (ATTR_ATIME | ATTR_MTIME)) {
2048 if (((mask & ATTR_ATIME) &&
2049 TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2050 ((mask & ATTR_MTIME) &&
2051 TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2052 err = SET_ERROR(EOVERFLOW);
2053 goto out3;
2054 }
2055 }
2056
2057 top:
2058 attrzp = NULL;
2059 aclp = NULL;
2060
2061 /* Can this be moved to before the top label? */
2062 if (zfs_is_readonly(zfsvfs)) {
2063 err = SET_ERROR(EROFS);
2064 goto out3;
2065 }
2066
2067 /*
2068 * First validate permissions
2069 */
2070
2071 if (mask & ATTR_SIZE) {
2072 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2073 mnt_ns);
2074 if (err)
2075 goto out3;
2076
2077 /*
2078 * XXX - Note, we are not providing any open
2079 * mode flags here (like FNDELAY), so we may
2080 * block if there are locks present... this
2081 * should be addressed in openat().
2082 */
2083 /* XXX - would it be OK to generate a log record here? */
2084 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2085 if (err)
2086 goto out3;
2087 }
2088
2089 if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2090 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2091 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2092 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2093 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2094 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2095 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2096 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2097 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2098 skipaclchk, cr, mnt_ns);
2099 }
2100
2101 if (mask & (ATTR_UID|ATTR_GID)) {
2102 int idmask = (mask & (ATTR_UID|ATTR_GID));
2103 int take_owner;
2104 int take_group;
2105 uid_t uid;
2106 gid_t gid;
2107
2108 /*
2109 * NOTE: even if a new mode is being set,
2110 * we may clear S_ISUID/S_ISGID bits.
2111 */
2112
2113 if (!(mask & ATTR_MODE))
2114 vap->va_mode = zp->z_mode;
2115
2116 /*
2117 * Take ownership or chgrp to group we are a member of
2118 */
2119
2120 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2121 vap->va_uid);
2122 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2123 vap->va_gid);
2124 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2125 take_group = (mask & ATTR_GID) &&
2126 zfs_groupmember(zfsvfs, gid, cr);
2127
2128 /*
2129 * If both ATTR_UID and ATTR_GID are set then take_owner and
2130 * take_group must both be set in order to allow taking
2131 * ownership.
2132 *
2133 * Otherwise, send the check through secpolicy_vnode_setattr()
2134 *
2135 */
2136
2137 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2138 take_owner && take_group) ||
2139 ((idmask == ATTR_UID) && take_owner) ||
2140 ((idmask == ATTR_GID) && take_group)) {
2141 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2142 skipaclchk, cr, mnt_ns) == 0) {
2143 /*
2144 * Remove setuid/setgid for non-privileged users
2145 */
2146 (void) secpolicy_setid_clear(vap, cr);
2147 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2148 } else {
2149 need_policy = TRUE;
2150 }
2151 } else {
2152 need_policy = TRUE;
2153 }
2154 }
2155
2156 mutex_enter(&zp->z_lock);
2157 oldva.va_mode = zp->z_mode;
2158 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2159 if (mask & ATTR_XVATTR) {
2160 /*
2161 * Update xvattr mask to include only those attributes
2162 * that are actually changing.
2163 *
2164 * the bits will be restored prior to actually setting
2165 * the attributes so the caller thinks they were set.
2166 */
2167 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2168 if (xoap->xoa_appendonly !=
2169 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2170 need_policy = TRUE;
2171 } else {
2172 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2173 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2174 }
2175 }
2176
2177 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2178 if (xoap->xoa_projinherit !=
2179 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2180 need_policy = TRUE;
2181 } else {
2182 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2183 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2184 }
2185 }
2186
2187 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2188 if (xoap->xoa_nounlink !=
2189 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2190 need_policy = TRUE;
2191 } else {
2192 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2193 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2194 }
2195 }
2196
2197 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2198 if (xoap->xoa_immutable !=
2199 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2200 need_policy = TRUE;
2201 } else {
2202 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2203 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2204 }
2205 }
2206
2207 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2208 if (xoap->xoa_nodump !=
2209 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2210 need_policy = TRUE;
2211 } else {
2212 XVA_CLR_REQ(xvap, XAT_NODUMP);
2213 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2214 }
2215 }
2216
2217 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2218 if (xoap->xoa_av_modified !=
2219 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2220 need_policy = TRUE;
2221 } else {
2222 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2223 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2224 }
2225 }
2226
2227 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2228 if ((!S_ISREG(ip->i_mode) &&
2229 xoap->xoa_av_quarantined) ||
2230 xoap->xoa_av_quarantined !=
2231 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2232 need_policy = TRUE;
2233 } else {
2234 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2235 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2236 }
2237 }
2238
2239 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2240 mutex_exit(&zp->z_lock);
2241 err = SET_ERROR(EPERM);
2242 goto out3;
2243 }
2244
2245 if (need_policy == FALSE &&
2246 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2247 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2248 need_policy = TRUE;
2249 }
2250 }
2251
2252 mutex_exit(&zp->z_lock);
2253
2254 if (mask & ATTR_MODE) {
2255 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2256 mnt_ns) == 0) {
2257 err = secpolicy_setid_setsticky_clear(ip, vap,
2258 &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2259 if (err)
2260 goto out3;
2261 trim_mask |= ATTR_MODE;
2262 } else {
2263 need_policy = TRUE;
2264 }
2265 }
2266
2267 if (need_policy) {
2268 /*
2269 * If trim_mask is set then take ownership
2270 * has been granted or write_acl is present and user
2271 * has the ability to modify mode. In that case remove
2272 * UID|GID and or MODE from mask so that
2273 * secpolicy_vnode_setattr() doesn't revoke it.
2274 */
2275
2276 if (trim_mask) {
2277 saved_mask = vap->va_mask;
2278 vap->va_mask &= ~trim_mask;
2279 }
2280 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2281 zfs_zaccess_unix, zp);
2282 if (err)
2283 goto out3;
2284
2285 if (trim_mask)
2286 vap->va_mask |= saved_mask;
2287 }
2288
2289 /*
2290 * secpolicy_vnode_setattr, or take ownership may have
2291 * changed va_mask
2292 */
2293 mask = vap->va_mask;
2294
2295 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2296 handle_eadir = B_TRUE;
2297 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2298 &xattr_obj, sizeof (xattr_obj));
2299
2300 if (err == 0 && xattr_obj) {
2301 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2302 if (err)
2303 goto out2;
2304 }
2305 if (mask & ATTR_UID) {
2306 new_kuid = zfs_fuid_create(zfsvfs,
2307 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2308 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2309 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2310 new_kuid)) {
2311 if (attrzp)
2312 zrele(attrzp);
2313 err = SET_ERROR(EDQUOT);
2314 goto out2;
2315 }
2316 }
2317
2318 if (mask & ATTR_GID) {
2319 new_kgid = zfs_fuid_create(zfsvfs,
2320 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2321 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2322 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2323 new_kgid)) {
2324 if (attrzp)
2325 zrele(attrzp);
2326 err = SET_ERROR(EDQUOT);
2327 goto out2;
2328 }
2329 }
2330
2331 if (projid != ZFS_INVALID_PROJID &&
2332 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2333 if (attrzp)
2334 zrele(attrzp);
2335 err = EDQUOT;
2336 goto out2;
2337 }
2338 }
2339 tx = dmu_tx_create(os);
2340
2341 if (mask & ATTR_MODE) {
2342 uint64_t pmode = zp->z_mode;
2343 uint64_t acl_obj;
2344 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2345
2346 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2347 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2348 err = EPERM;
2349 goto out;
2350 }
2351
2352 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2353 goto out;
2354
2355 mutex_enter(&zp->z_lock);
2356 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2357 /*
2358 * Are we upgrading ACL from old V0 format
2359 * to V1 format?
2360 */
2361 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2362 zfs_znode_acl_version(zp) ==
2363 ZFS_ACL_VERSION_INITIAL) {
2364 dmu_tx_hold_free(tx, acl_obj, 0,
2365 DMU_OBJECT_END);
2366 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2367 0, aclp->z_acl_bytes);
2368 } else {
2369 dmu_tx_hold_write(tx, acl_obj, 0,
2370 aclp->z_acl_bytes);
2371 }
2372 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2373 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2374 0, aclp->z_acl_bytes);
2375 }
2376 mutex_exit(&zp->z_lock);
2377 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2378 } else {
2379 if (((mask & ATTR_XVATTR) &&
2380 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2381 (projid != ZFS_INVALID_PROJID &&
2382 !(zp->z_pflags & ZFS_PROJID)))
2383 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2384 else
2385 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2386 }
2387
2388 if (attrzp) {
2389 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2390 }
2391
2392 fuid_dirtied = zfsvfs->z_fuid_dirty;
2393 if (fuid_dirtied)
2394 zfs_fuid_txhold(zfsvfs, tx);
2395
2396 zfs_sa_upgrade_txholds(tx, zp);
2397
2398 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2399 if (err)
2400 goto out;
2401
2402 count = 0;
2403 /*
2404 * Set each attribute requested.
2405 * We group settings according to the locks they need to acquire.
2406 *
2407 * Note: you cannot set ctime directly, although it will be
2408 * updated as a side-effect of calling this function.
2409 */
2410
2411 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2412 /*
2413 * For the existed object that is upgraded from old system,
2414 * its on-disk layout has no slot for the project ID attribute.
2415 * But quota accounting logic needs to access related slots by
2416 * offset directly. So we need to adjust old objects' layout
2417 * to make the project ID to some unified and fixed offset.
2418 */
2419 if (attrzp)
2420 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2421 if (err == 0)
2422 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2423
2424 if (unlikely(err == EEXIST))
2425 err = 0;
2426 else if (err != 0)
2427 goto out;
2428 else
2429 projid = ZFS_INVALID_PROJID;
2430 }
2431
2432 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2433 mutex_enter(&zp->z_acl_lock);
2434 mutex_enter(&zp->z_lock);
2435
2436 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2437 &zp->z_pflags, sizeof (zp->z_pflags));
2438
2439 if (attrzp) {
2440 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2441 mutex_enter(&attrzp->z_acl_lock);
2442 mutex_enter(&attrzp->z_lock);
2443 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2444 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2445 sizeof (attrzp->z_pflags));
2446 if (projid != ZFS_INVALID_PROJID) {
2447 attrzp->z_projid = projid;
2448 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2449 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2450 sizeof (attrzp->z_projid));
2451 }
2452 }
2453
2454 if (mask & (ATTR_UID|ATTR_GID)) {
2455
2456 if (mask & ATTR_UID) {
2457 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2458 new_uid = zfs_uid_read(ZTOI(zp));
2459 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2460 &new_uid, sizeof (new_uid));
2461 if (attrzp) {
2462 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2463 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2464 sizeof (new_uid));
2465 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2466 }
2467 }
2468
2469 if (mask & ATTR_GID) {
2470 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2471 new_gid = zfs_gid_read(ZTOI(zp));
2472 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2473 NULL, &new_gid, sizeof (new_gid));
2474 if (attrzp) {
2475 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2476 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2477 sizeof (new_gid));
2478 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2479 }
2480 }
2481 if (!(mask & ATTR_MODE)) {
2482 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2483 NULL, &new_mode, sizeof (new_mode));
2484 new_mode = zp->z_mode;
2485 }
2486 err = zfs_acl_chown_setattr(zp);
2487 ASSERT0(err);
2488 if (attrzp) {
2489 err = zfs_acl_chown_setattr(attrzp);
2490 ASSERT0(err);
2491 }
2492 }
2493
2494 if (mask & ATTR_MODE) {
2495 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2496 &new_mode, sizeof (new_mode));
2497 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2498 ASSERT3P(aclp, !=, NULL);
2499 err = zfs_aclset_common(zp, aclp, cr, tx);
2500 ASSERT0(err);
2501 if (zp->z_acl_cached)
2502 zfs_acl_free(zp->z_acl_cached);
2503 zp->z_acl_cached = aclp;
2504 aclp = NULL;
2505 }
2506
2507 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2508 zp->z_atime_dirty = B_FALSE;
2509 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
2510 ZFS_TIME_ENCODE(&tmp_atime, atime);
2511 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2512 &atime, sizeof (atime));
2513 }
2514
2515 if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2516 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2517 zpl_inode_set_mtime_to_ts(ZTOI(zp),
2518 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
2519
2520 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2521 mtime, sizeof (mtime));
2522 }
2523
2524 if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2525 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2526 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2527 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2528 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2529 ctime, sizeof (ctime));
2530 }
2531
2532 if (projid != ZFS_INVALID_PROJID) {
2533 zp->z_projid = projid;
2534 SA_ADD_BULK_ATTR(bulk, count,
2535 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2536 sizeof (zp->z_projid));
2537 }
2538
2539 if (attrzp && mask) {
2540 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2541 SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2542 sizeof (ctime));
2543 }
2544
2545 /*
2546 * Do this after setting timestamps to prevent timestamp
2547 * update from toggling bit
2548 */
2549
2550 if (xoap && (mask & ATTR_XVATTR)) {
2551
2552 /*
2553 * restore trimmed off masks
2554 * so that return masks can be set for caller.
2555 */
2556
2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2558 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2559 }
2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2561 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2562 }
2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2564 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2565 }
2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2567 XVA_SET_REQ(xvap, XAT_NODUMP);
2568 }
2569 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2570 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2571 }
2572 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2573 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2574 }
2575 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2576 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2577 }
2578
2579 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2580 ASSERT(S_ISREG(ip->i_mode));
2581
2582 zfs_xvattr_set(zp, xvap, tx);
2583 }
2584
2585 if (fuid_dirtied)
2586 zfs_fuid_sync(zfsvfs, tx);
2587
2588 if (mask != 0)
2589 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2590
2591 mutex_exit(&zp->z_lock);
2592 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2593 mutex_exit(&zp->z_acl_lock);
2594
2595 if (attrzp) {
2596 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2597 mutex_exit(&attrzp->z_acl_lock);
2598 mutex_exit(&attrzp->z_lock);
2599 }
2600 out:
2601 if (err == 0 && xattr_count > 0) {
2602 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2603 xattr_count, tx);
2604 ASSERT0(err2);
2605 }
2606
2607 if (aclp)
2608 zfs_acl_free(aclp);
2609
2610 if (fuidp) {
2611 zfs_fuid_info_free(fuidp);
2612 fuidp = NULL;
2613 }
2614
2615 if (err) {
2616 dmu_tx_abort(tx);
2617 if (attrzp)
2618 zrele(attrzp);
2619 if (err == ERESTART)
2620 goto top;
2621 } else {
2622 if (count > 0)
2623 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2624 dmu_tx_commit(tx);
2625 if (attrzp) {
2626 if (err2 == 0 && handle_eadir)
2627 err = zfs_setattr_dir(attrzp);
2628 zrele(attrzp);
2629 }
2630 zfs_znode_update_vfs(zp);
2631 }
2632
2633 out2:
2634 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
2635 err = zil_commit(zilog, 0);
2636
2637 out3:
2638 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2639 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2640 kmem_free(tmpxvattr, sizeof (xvattr_t));
2641 zfs_exit(zfsvfs, FTAG);
2642 return (err);
2643 }
2644
/*
 * One node in the singly linked list of parent locks that
 * zfs_rename_lock() builds and zfs_rename_unlock() tears down.
 * zl_znode is NULL when no znode hold was taken for this node.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2650
2651 /*
2652 * Drop locks and release vnodes that were held by zfs_rename_lock().
2653 */
2654 static void
zfs_rename_unlock(zfs_zlock_t ** zlpp)2655 zfs_rename_unlock(zfs_zlock_t **zlpp)
2656 {
2657 zfs_zlock_t *zl;
2658
2659 while ((zl = *zlpp) != NULL) {
2660 if (zl->zl_znode != NULL)
2661 zfs_zrele_async(zl->zl_znode);
2662 rw_exit(zl->zl_rwlock);
2663 *zlpp = zl->zl_next;
2664 kmem_free(zl, sizeof (*zl));
2665 }
2666 }
2667
2668 /*
2669 * Search back through the directory tree, using the ".." entries.
2670 * Lock each directory in the chain to prevent concurrent renames.
2671 * Fail any attempt to move a directory into one of its own descendants.
2672 * XXX - z_parent_lock can overlap with map or grow locks
2673 */
2674 static int
zfs_rename_lock(znode_t * szp,znode_t * tdzp,znode_t * sdzp,zfs_zlock_t ** zlpp)2675 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2676 {
2677 zfs_zlock_t *zl;
2678 znode_t *zp = tdzp;
2679 uint64_t rootid = ZTOZSB(zp)->z_root;
2680 uint64_t oidp = zp->z_id;
2681 krwlock_t *rwlp = &szp->z_parent_lock;
2682 krw_t rw = RW_WRITER;
2683
2684 /*
2685 * First pass write-locks szp and compares to zp->z_id.
2686 * Later passes read-lock zp and compare to zp->z_parent.
2687 */
2688 do {
2689 if (!rw_tryenter(rwlp, rw)) {
2690 /*
2691 * Another thread is renaming in this path.
2692 * Note that if we are a WRITER, we don't have any
2693 * parent_locks held yet.
2694 */
2695 if (rw == RW_READER && zp->z_id > szp->z_id) {
2696 /*
2697 * Drop our locks and restart
2698 */
2699 zfs_rename_unlock(&zl);
2700 *zlpp = NULL;
2701 zp = tdzp;
2702 oidp = zp->z_id;
2703 rwlp = &szp->z_parent_lock;
2704 rw = RW_WRITER;
2705 continue;
2706 } else {
2707 /*
2708 * Wait for other thread to drop its locks
2709 */
2710 rw_enter(rwlp, rw);
2711 }
2712 }
2713
2714 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2715 zl->zl_rwlock = rwlp;
2716 zl->zl_znode = NULL;
2717 zl->zl_next = *zlpp;
2718 *zlpp = zl;
2719
2720 if (oidp == szp->z_id) /* We're a descendant of szp */
2721 return (SET_ERROR(EINVAL));
2722
2723 if (oidp == rootid) /* We've hit the top */
2724 return (0);
2725
2726 if (rw == RW_READER) { /* i.e. not the first pass */
2727 int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2728 if (error)
2729 return (error);
2730 zl->zl_znode = zp;
2731 }
2732 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2733 &oidp, sizeof (oidp));
2734 rwlp = &zp->z_parent_lock;
2735 rw = RW_READER;
2736
2737 } while (zp->z_id != sdzp->z_id);
2738
2739 return (0);
2740 }
2741
2742 /*
2743 * Move an entry from the provided source directory to the target
2744 * directory. Change the entry name as indicated.
2745 *
2746 * IN: sdzp - Source directory containing the "old entry".
2747 * snm - Old entry name.
2748 * tdzp - Target directory to contain the "new entry".
2749 * tnm - New entry name.
2750 * cr - credentials of caller.
2751 * flags - case flags
2752 * rflags - RENAME_* flags
2753 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0).
2754 * mnt_ns - user namespace of the mount
2755 *
2756 * RETURN: 0 on success, error code on failure.
2757 *
2758 * Timestamps:
2759 * sdzp,tdzp - ctime|mtime updated
2760 */
2761 int
zfs_rename(znode_t * sdzp,char * snm,znode_t * tdzp,char * tnm,cred_t * cr,int flags,uint64_t rflags,vattr_t * wo_vap,zidmap_t * mnt_ns)2762 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2763 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2764 {
2765 znode_t *szp, *tzp;
2766 zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
2767 zilog_t *zilog;
2768 zfs_dirlock_t *sdl, *tdl;
2769 dmu_tx_t *tx;
2770 zfs_zlock_t *zl;
2771 int cmp, serr, terr;
2772 int error = 0;
2773 int zflg = 0;
2774 boolean_t waited = B_FALSE;
2775 /* Needed for whiteout inode creation. */
2776 boolean_t fuid_dirtied;
2777 zfs_acl_ids_t acl_ids;
2778 boolean_t have_acl = B_FALSE;
2779 znode_t *wzp = NULL;
2780
2781
2782 if (snm == NULL || tnm == NULL)
2783 return (SET_ERROR(EINVAL));
2784
2785 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2786 return (SET_ERROR(EINVAL));
2787
2788 /* Already checked by Linux VFS, but just to make sure. */
2789 if (rflags & RENAME_EXCHANGE &&
2790 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2791 return (SET_ERROR(EINVAL));
2792
2793 /*
2794 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2795 * right kind of vattr_t for the whiteout file. These are set
2796 * internally by ZFS so should never be incorrect.
2797 */
2798 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2799 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2800 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2801
2802 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2803 return (error);
2804 zilog = zfsvfs->z_log;
2805
2806 if ((error = zfs_verify_zp(tdzp)) != 0) {
2807 zfs_exit(zfsvfs, FTAG);
2808 return (error);
2809 }
2810
2811 /*
2812 * We check i_sb because snapshots and the ctldir must have different
2813 * super blocks.
2814 */
2815 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2816 zfsctl_is_node(ZTOI(tdzp))) {
2817 zfs_exit(zfsvfs, FTAG);
2818 return (SET_ERROR(EXDEV));
2819 }
2820
2821 if (zfsvfs->z_utf8 && u8_validate(tnm,
2822 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2823 zfs_exit(zfsvfs, FTAG);
2824 return (SET_ERROR(EILSEQ));
2825 }
2826
2827 if (flags & FIGNORECASE)
2828 zflg |= ZCILOOK;
2829
2830 top:
2831 szp = NULL;
2832 tzp = NULL;
2833 zl = NULL;
2834
2835 /*
2836 * This is to prevent the creation of links into attribute space
2837 * by renaming a linked file into/outof an attribute directory.
2838 * See the comment in zfs_link() for why this is considered bad.
2839 */
2840 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2841 zfs_exit(zfsvfs, FTAG);
2842 return (SET_ERROR(EINVAL));
2843 }
2844
2845 /*
2846 * Lock source and target directory entries. To prevent deadlock,
2847 * a lock ordering must be defined. We lock the directory with
2848 * the smallest object id first, or if it's a tie, the one with
2849 * the lexically first name.
2850 */
2851 if (sdzp->z_id < tdzp->z_id) {
2852 cmp = -1;
2853 } else if (sdzp->z_id > tdzp->z_id) {
2854 cmp = 1;
2855 } else {
2856 /*
2857 * First compare the two name arguments without
2858 * considering any case folding.
2859 */
2860 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2861
2862 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2863 ASSERT(error == 0 || !zfsvfs->z_utf8);
2864 if (cmp == 0) {
2865 /*
2866 * POSIX: "If the old argument and the new argument
2867 * both refer to links to the same existing file,
2868 * the rename() function shall return successfully
2869 * and perform no other action."
2870 */
2871 zfs_exit(zfsvfs, FTAG);
2872 return (0);
2873 }
2874 /*
2875 * If the file system is case-folding, then we may
2876 * have some more checking to do. A case-folding file
2877 * system is either supporting mixed case sensitivity
2878 * access or is completely case-insensitive. Note
2879 * that the file system is always case preserving.
2880 *
2881 * In mixed sensitivity mode case sensitive behavior
2882 * is the default. FIGNORECASE must be used to
2883 * explicitly request case insensitive behavior.
2884 *
2885 * If the source and target names provided differ only
2886 * by case (e.g., a request to rename 'tim' to 'Tim'),
2887 * we will treat this as a special case in the
2888 * case-insensitive mode: as long as the source name
2889 * is an exact match, we will allow this to proceed as
2890 * a name-change request.
2891 */
2892 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2893 (zfsvfs->z_case == ZFS_CASE_MIXED &&
2894 flags & FIGNORECASE)) &&
2895 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2896 &error) == 0) {
2897 /*
2898 * case preserving rename request, require exact
2899 * name matches
2900 */
2901 zflg |= ZCIEXACT;
2902 zflg &= ~ZCILOOK;
2903 }
2904 }
2905
2906 /*
2907 * If the source and destination directories are the same, we should
2908 * grab the z_name_lock of that directory only once.
2909 */
2910 if (sdzp == tdzp) {
2911 zflg |= ZHAVELOCK;
2912 rw_enter(&sdzp->z_name_lock, RW_READER);
2913 }
2914
2915 if (cmp < 0) {
2916 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2917 ZEXISTS | zflg, NULL, NULL);
2918 terr = zfs_dirent_lock(&tdl,
2919 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2920 } else {
2921 terr = zfs_dirent_lock(&tdl,
2922 tdzp, tnm, &tzp, zflg, NULL, NULL);
2923 serr = zfs_dirent_lock(&sdl,
2924 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2925 NULL, NULL);
2926 }
2927
2928 if (serr) {
2929 /*
2930 * Source entry invalid or not there.
2931 */
2932 if (!terr) {
2933 zfs_dirent_unlock(tdl);
2934 if (tzp)
2935 zrele(tzp);
2936 }
2937
2938 if (sdzp == tdzp)
2939 rw_exit(&sdzp->z_name_lock);
2940
2941 if (strcmp(snm, "..") == 0)
2942 serr = EINVAL;
2943 zfs_exit(zfsvfs, FTAG);
2944 return (serr);
2945 }
2946 if (terr) {
2947 zfs_dirent_unlock(sdl);
2948 zrele(szp);
2949
2950 if (sdzp == tdzp)
2951 rw_exit(&sdzp->z_name_lock);
2952
2953 if (strcmp(tnm, "..") == 0)
2954 terr = EINVAL;
2955 zfs_exit(zfsvfs, FTAG);
2956 return (terr);
2957 }
2958
2959 /*
2960 * If we are using project inheritance, means if the directory has
2961 * ZFS_PROJINHERIT set, then its descendant directories will inherit
2962 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2963 * such case, we only allow renames into our tree when the project
2964 * IDs are the same.
2965 */
2966 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2967 tdzp->z_projid != szp->z_projid) {
2968 error = SET_ERROR(EXDEV);
2969 goto out;
2970 }
2971
2972 /*
2973 * Must have write access at the source to remove the old entry
2974 * and write access at the target to create the new entry.
2975 * Note that if target and source are the same, this can be
2976 * done in a single check.
2977 */
2978 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2979 goto out;
2980
2981 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2982 /*
2983 * Check to make sure rename is valid.
2984 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2985 */
2986 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2987 goto out;
2988 }
2989
2990 /*
2991 * Does target exist?
2992 */
2993 if (tzp) {
2994 if (rflags & RENAME_NOREPLACE) {
2995 error = SET_ERROR(EEXIST);
2996 goto out;
2997 }
2998 /*
2999 * Source and target must be the same type (unless exchanging).
3000 */
3001 if (!(rflags & RENAME_EXCHANGE)) {
3002 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
3003 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
3004
3005 if (s_is_dir != t_is_dir) {
3006 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
3007 goto out;
3008 }
3009 }
3010 /*
3011 * POSIX dictates that when the source and target
3012 * entries refer to the same file object, rename
3013 * must do nothing and exit without error.
3014 */
3015 if (szp->z_id == tzp->z_id) {
3016 error = 0;
3017 goto out;
3018 }
3019 } else if (rflags & RENAME_EXCHANGE) {
3020 /* Target must exist for RENAME_EXCHANGE. */
3021 error = SET_ERROR(ENOENT);
3022 goto out;
3023 }
3024
3025 /* Set up inode creation for RENAME_WHITEOUT. */
3026 if (rflags & RENAME_WHITEOUT) {
3027 /*
3028 * Whiteout files are not regular files or directories, so to
3029 * match zfs_create() we do not inherit the project id.
3030 */
3031 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
3032
3033 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
3034 if (error)
3035 goto out;
3036
3037 if (!have_acl) {
3038 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
3039 &acl_ids, mnt_ns);
3040 if (error)
3041 goto out;
3042 have_acl = B_TRUE;
3043 }
3044
3045 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
3046 error = SET_ERROR(EDQUOT);
3047 goto out;
3048 }
3049 }
3050
3051 tx = dmu_tx_create(zfsvfs->z_os);
3052 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3053 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3054 dmu_tx_hold_zap(tx, sdzp->z_id,
3055 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
3056 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3057 if (sdzp != tdzp) {
3058 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3059 zfs_sa_upgrade_txholds(tx, tdzp);
3060 }
3061 if (tzp) {
3062 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3063 zfs_sa_upgrade_txholds(tx, tzp);
3064 }
3065 if (rflags & RENAME_WHITEOUT) {
3066 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3067 ZFS_SA_BASE_ATTR_SIZE);
3068
3069 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
3070 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3071 if (!zfsvfs->z_use_sa &&
3072 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3073 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3074 0, acl_ids.z_aclp->z_acl_bytes);
3075 }
3076 }
3077 fuid_dirtied = zfsvfs->z_fuid_dirty;
3078 if (fuid_dirtied)
3079 zfs_fuid_txhold(zfsvfs, tx);
3080 zfs_sa_upgrade_txholds(tx, szp);
3081 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3082 error = dmu_tx_assign(tx,
3083 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3084 if (error) {
3085 if (zl != NULL)
3086 zfs_rename_unlock(&zl);
3087 zfs_dirent_unlock(sdl);
3088 zfs_dirent_unlock(tdl);
3089
3090 if (sdzp == tdzp)
3091 rw_exit(&sdzp->z_name_lock);
3092
3093 if (error == ERESTART) {
3094 waited = B_TRUE;
3095 dmu_tx_wait(tx);
3096 dmu_tx_abort(tx);
3097 zrele(szp);
3098 if (tzp)
3099 zrele(tzp);
3100 goto top;
3101 }
3102 dmu_tx_abort(tx);
3103 zrele(szp);
3104 if (tzp)
3105 zrele(tzp);
3106 zfs_exit(zfsvfs, FTAG);
3107 return (error);
3108 }
3109
3110 /*
3111 * Unlink the source.
3112 */
3113 szp->z_pflags |= ZFS_AV_MODIFIED;
3114 if (tdzp->z_pflags & ZFS_PROJINHERIT)
3115 szp->z_pflags |= ZFS_PROJINHERIT;
3116
3117 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3118 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3119 VERIFY0(error);
3120
3121 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3122 if (error)
3123 goto commit;
3124
3125 /*
3126 * Unlink the target.
3127 */
3128 if (tzp) {
3129 int tzflg = zflg;
3130
3131 if (rflags & RENAME_EXCHANGE) {
3132 /* This inode will be re-linked soon. */
3133 tzflg |= ZRENAMING;
3134
3135 tzp->z_pflags |= ZFS_AV_MODIFIED;
3136 if (sdzp->z_pflags & ZFS_PROJINHERIT)
3137 tzp->z_pflags |= ZFS_PROJINHERIT;
3138
3139 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3140 (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3141 ASSERT0(error);
3142 }
3143 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3144 if (error)
3145 goto commit_link_szp;
3146 }
3147
3148 /*
3149 * Create the new target links:
3150 * * We always link the target.
3151 * * RENAME_EXCHANGE: Link the old target to the source.
3152 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3153 */
3154 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3155 if (error) {
3156 /*
3157 * If we have removed the existing target, a subsequent call to
3158 * zfs_link_create() to add back the same entry, but with a new
3159 * dnode (szp), should not fail.
3160 */
3161 ASSERT0P(tzp);
3162 goto commit_link_tzp;
3163 }
3164
3165 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3166 case RENAME_EXCHANGE:
3167 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3168 /*
3169 * The same argument as zfs_link_create() failing for
3170 * szp applies here, since the source directory must
3171 * have had an entry we are replacing.
3172 */
3173 ASSERT0(error);
3174 if (error)
3175 goto commit_unlink_td_szp;
3176 break;
3177 case RENAME_WHITEOUT:
3178 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3179 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3180 if (error) {
3181 zfs_znode_delete(wzp, tx);
3182 remove_inode_hash(ZTOI(wzp));
3183 goto commit_unlink_td_szp;
3184 }
3185 break;
3186 }
3187
3188 if (fuid_dirtied)
3189 zfs_fuid_sync(zfsvfs, tx);
3190
3191 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3192 case RENAME_EXCHANGE:
3193 zfs_log_rename_exchange(zilog, tx,
3194 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3195 tdzp, tdl->dl_name, szp);
3196 break;
3197 case RENAME_WHITEOUT:
3198 zfs_log_rename_whiteout(zilog, tx,
3199 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3200 tdzp, tdl->dl_name, szp, wzp);
3201 break;
3202 default:
3203 ASSERT0(rflags & ~RENAME_NOREPLACE);
3204 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3205 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3206 break;
3207 }
3208
3209 commit:
3210 dmu_tx_commit(tx);
3211 out:
3212 if (have_acl)
3213 zfs_acl_ids_free(&acl_ids);
3214
3215 zfs_znode_update_vfs(sdzp);
3216 if (sdzp == tdzp)
3217 rw_exit(&sdzp->z_name_lock);
3218
3219 if (sdzp != tdzp)
3220 zfs_znode_update_vfs(tdzp);
3221
3222 zfs_znode_update_vfs(szp);
3223 zrele(szp);
3224 if (wzp) {
3225 zfs_znode_update_vfs(wzp);
3226 zrele(wzp);
3227 }
3228 if (tzp) {
3229 zfs_znode_update_vfs(tzp);
3230 zrele(tzp);
3231 }
3232
3233 if (zl != NULL)
3234 zfs_rename_unlock(&zl);
3235
3236 zfs_dirent_unlock(sdl);
3237 zfs_dirent_unlock(tdl);
3238
3239 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3240 error = zil_commit(zilog, 0);
3241
3242 zfs_exit(zfsvfs, FTAG);
3243 return (error);
3244
3245 /*
3246 * Clean-up path for broken link state.
3247 *
3248 * At this point we are in a (very) bad state, so we need to do our
3249 * best to correct the state. In particular, all of the nlinks are
3250 * wrong because we were destroying and creating links with ZRENAMING.
3251 *
3252 * In some form, all of these operations have to resolve the state:
3253 *
3254 * * link_destroy() *must* succeed. Fortunately, this is very likely
3255 * since we only just created it.
3256 *
3257 * * link_create()s are allowed to fail (though they shouldn't because
3258 * we only just unlinked them and are putting the entries back
3259 * during clean-up). But if they fail, we can just forcefully drop
3260 * the nlink value to (at the very least) avoid broken nlink values
3261 * -- though in the case of non-empty directories we will have to
3262 * panic (otherwise we'd have a leaked directory with a broken ..).
3263 */
3264 commit_unlink_td_szp:
3265 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3266 commit_link_tzp:
3267 if (tzp) {
3268 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3269 VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3270 }
3271 commit_link_szp:
3272 if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3273 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3274 goto commit;
3275 }
3276
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dzp	- Directory to contain new symbolic link.
 *		name	- Name of directory entry in dzp.
 *		vap	- Attributes of new entry.
 *		link	- Contents (target path) of the new symlink.
 *		cr	- credentials of caller.
 *		flags	- case flags
 *		mnt_ns	- user namespace of the mount
 *
 *	OUT:	zpp	- Znode for new symbolic link.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dzp - ctime|mtime updated
 */
3295 int
zfs_symlink(znode_t * dzp,char * name,vattr_t * vap,char * link,znode_t ** zpp,cred_t * cr,int flags,zidmap_t * mnt_ns)3296 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3297 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3298 {
3299 znode_t *zp;
3300 zfs_dirlock_t *dl;
3301 dmu_tx_t *tx;
3302 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
3303 zilog_t *zilog;
3304 uint64_t len = strlen(link);
3305 int error;
3306 int zflg = ZNEW;
3307 zfs_acl_ids_t acl_ids;
3308 boolean_t fuid_dirtied;
3309 uint64_t txtype = TX_SYMLINK;
3310 boolean_t waited = B_FALSE;
3311
3312 ASSERT(S_ISLNK(vap->va_mode));
3313
3314 if (name == NULL)
3315 return (SET_ERROR(EINVAL));
3316
3317 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3318 return (error);
3319 zilog = zfsvfs->z_log;
3320
3321 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3322 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3323 zfs_exit(zfsvfs, FTAG);
3324 return (SET_ERROR(EILSEQ));
3325 }
3326 if (flags & FIGNORECASE)
3327 zflg |= ZCILOOK;
3328
3329 if (len > MAXPATHLEN) {
3330 zfs_exit(zfsvfs, FTAG);
3331 return (SET_ERROR(ENAMETOOLONG));
3332 }
3333
3334 if ((error = zfs_acl_ids_create(dzp, 0,
3335 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3336 zfs_exit(zfsvfs, FTAG);
3337 return (error);
3338 }
3339 top:
3340 *zpp = NULL;
3341
3342 /*
3343 * Attempt to lock directory; fail if entry already exists.
3344 */
3345 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3346 if (error) {
3347 zfs_acl_ids_free(&acl_ids);
3348 zfs_exit(zfsvfs, FTAG);
3349 return (error);
3350 }
3351
3352 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3353 zfs_acl_ids_free(&acl_ids);
3354 zfs_dirent_unlock(dl);
3355 zfs_exit(zfsvfs, FTAG);
3356 return (error);
3357 }
3358
3359 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3360 zfs_acl_ids_free(&acl_ids);
3361 zfs_dirent_unlock(dl);
3362 zfs_exit(zfsvfs, FTAG);
3363 return (SET_ERROR(EDQUOT));
3364 }
3365 tx = dmu_tx_create(zfsvfs->z_os);
3366 fuid_dirtied = zfsvfs->z_fuid_dirty;
3367 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3368 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3369 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3370 ZFS_SA_BASE_ATTR_SIZE + len);
3371 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3372 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3373 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3374 acl_ids.z_aclp->z_acl_bytes);
3375 }
3376 if (fuid_dirtied)
3377 zfs_fuid_txhold(zfsvfs, tx);
3378 error = dmu_tx_assign(tx,
3379 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
3380 if (error) {
3381 zfs_dirent_unlock(dl);
3382 if (error == ERESTART) {
3383 waited = B_TRUE;
3384 dmu_tx_wait(tx);
3385 dmu_tx_abort(tx);
3386 goto top;
3387 }
3388 zfs_acl_ids_free(&acl_ids);
3389 dmu_tx_abort(tx);
3390 zfs_exit(zfsvfs, FTAG);
3391 return (error);
3392 }
3393
3394 /*
3395 * Create a new object for the symlink.
3396 * for version 4 ZPL datasets the symlink will be an SA attribute
3397 */
3398 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3399
3400 if (fuid_dirtied)
3401 zfs_fuid_sync(zfsvfs, tx);
3402
3403 mutex_enter(&zp->z_lock);
3404 if (zp->z_is_sa)
3405 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3406 link, len, tx);
3407 else
3408 zfs_sa_symlink(zp, link, len, tx);
3409 mutex_exit(&zp->z_lock);
3410
3411 zp->z_size = len;
3412 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3413 &zp->z_size, sizeof (zp->z_size), tx);
3414 /*
3415 * Insert the new object into the directory.
3416 */
3417 error = zfs_link_create(dl, zp, tx, ZNEW);
3418 if (error != 0) {
3419 zfs_znode_delete(zp, tx);
3420 remove_inode_hash(ZTOI(zp));
3421 } else {
3422 if (flags & FIGNORECASE)
3423 txtype |= TX_CI;
3424 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3425
3426 zfs_znode_update_vfs(dzp);
3427 zfs_znode_update_vfs(zp);
3428 }
3429
3430 zfs_acl_ids_free(&acl_ids);
3431
3432 dmu_tx_commit(tx);
3433
3434 zfs_dirent_unlock(dl);
3435
3436 if (error == 0) {
3437 *zpp = zp;
3438
3439 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3440 error = zil_commit(zilog, 0);
3441 } else {
3442 zrele(zp);
3443 }
3444
3445 zfs_exit(zfsvfs, FTAG);
3446 return (error);
3447 }
3448
3449 /*
3450 * Return, in the buffer contained in the provided uio structure,
3451 * the symbolic path referred to by ip.
3452 *
3453 * IN: ip - inode of symbolic link
3454 * uio - structure to contain the link path.
3455 * cr - credentials of caller.
3456 *
3457 * RETURN: 0 if success
3458 * error code if failure
3459 *
3460 * Timestamps:
3461 * ip - atime updated
3462 */
3463 int
zfs_readlink(struct inode * ip,zfs_uio_t * uio,cred_t * cr)3464 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3465 {
3466 (void) cr;
3467 znode_t *zp = ITOZ(ip);
3468 zfsvfs_t *zfsvfs = ITOZSB(ip);
3469 int error;
3470
3471 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3472 return (error);
3473
3474 mutex_enter(&zp->z_lock);
3475 if (zp->z_is_sa)
3476 error = sa_lookup_uio(zp->z_sa_hdl,
3477 SA_ZPL_SYMLINK(zfsvfs), uio);
3478 else
3479 error = zfs_sa_readlink(zp, uio);
3480 mutex_exit(&zp->z_lock);
3481
3482 zfs_exit(zfsvfs, FTAG);
3483 return (error);
3484 }
3485
/*
 * Insert a new entry into directory tdzp referencing szp (hard link).
 *
 *	IN:	tdzp	- Directory to contain new entry.
 *		szp	- znode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		flags	- case flags (FIGNORECASE requests a case-insensitive
 *			  directory lookup and a TX_CI log record).
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdzp - ctime|mtime updated
 *	 szp - ctime updated
 *	(NOTE(review): the timestamp updates are not performed in this
 *	function body; presumably done by zfs_link_create() - confirm.)
 *
 * A source inode with i_nlink == 0 and I_LINKABLE set is treated as a
 * "tmpfile": on success it is removed from the unlinked set, no ZIL record
 * is written, and the function waits for the txg to sync instead (unless
 * sync is disabled on the dataset).
 */
int
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
    int flags)
{
	struct inode *sip = ZTOI(szp);
	znode_t		*tzp;
	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;
	boolean_t	is_tmpfile = 0;
	uint64_t	txg;

	/* Sampled once, up front; the value is reused across tx retries. */
	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));

	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

	if (name == NULL)
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (S_ISDIR(sip->i_mode)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* tdzp was verified by zfs_enter_verify_zp(); verify szp as well. */
	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are using project inheritance, meaning the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. In
	 * that case, we only allow hard link creation in our tree when the
	 * project IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * We check i_sb because snapshots and the ctldir must have different
	 * super blocks.
	 */
	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* On UTF-8-only datasets the new name must be valid UTF-8. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A caller that does not own the source file needs the
	 * secpolicy_basic_link() privilege to link it.
	 */
	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
	    cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* The caller must be allowed to add a file to the target directory. */
	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
	    zfs_init_idmap))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	/* A tmpfile will also be removed from the unlinked set below. */
	if (is_tmpfile)
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	error = dmu_tx_assign(tx,
	    (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
	if (error) {
		/*
		 * Drop the directory lock before waiting or bailing out.
		 * On ERESTART, wait for the tx, abort it and retry from
		 * "top" with DMU_TX_NOTHROTTLE set on the next assignment.
		 */
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	/* unmark z_unlinked so zfs_link_create will not reject */
	if (is_tmpfile)
		szp->z_unlinked = B_FALSE;
	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		/*
		 * tmpfile is created to be in z_unlinkedobj, so remove it.
		 * Also, we don't log in ZIL, because all previous file
		 * operations on the tmpfile are ignored by ZIL. Instead we
		 * always wait for txg to sync to make sure all previous
		 * operations are sync safe.
		 */
		if (is_tmpfile) {
			VERIFY0(zap_remove_int(zfsvfs->z_os,
			    zfsvfs->z_unlinkedobj, szp->z_id, tx));
		} else {
			if (flags & FIGNORECASE)
				txtype |= TX_CI;
			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
		}
	} else if (is_tmpfile) {
		/* restore z_unlinked since linking failed */
		szp->z_unlinked = B_TRUE;
	}
	/* Remember the txg before committing; the tmpfile path waits on it. */
	txg = dmu_tx_get_txg(tx);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		/* Regular link: commit the ZIL only when sync=always. */
		if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			error = zil_commit(zilog, 0);

		/*
		 * tmpfile link: wait for the txg to sync unless sync is
		 * disabled. With failmode=continue the wait is allowed to
		 * return on pool suspension (TXG_WAIT_SUSPEND); any failure
		 * of the wait is reported to the caller as EIO.
		 */
		if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
			txg_wait_flag_t wait_flags =
			    spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) ==
			    ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0;
			error = txg_wait_synced_flags(
			    dmu_objset_pool(zfsvfs->z_os), txg, wait_flags);
			if (error != 0) {
				ASSERT3U(error, ==, ESHUTDOWN);
				error = SET_ERROR(EIO);
			}
		}
	}

	/* Refresh the VFS inodes from the znodes, on success or failure. */
	zfs_znode_update_vfs(tdzp);
	zfs_znode_update_vfs(szp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
3696
/*
 * Finish writeback of a page. A non-zero err means the write failed; the
 * page is then marked dirty again before writeback is ended.
 */
static inline void
zfs_page_writeback_done(struct page *pp, int err)
{
	/*
	 * On failure the page must be re-dirtied. Its dirty flag was cleared
	 * before the IO was issued (in zfs_putpage() or write_cache_pages()),
	 * and the kernel only considers dirty pages for writeback. Left
	 * clean, the page could be evicted without ever reaching disk.
	 */
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	if (err != 0)
		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
	if (err != 0)
		__set_page_dirty_nobuffers(pp);
#endif

	ClearPageError(pp);
	end_page_writeback(pp);
}
3719
/*
 * ZIL completion callback for page writeback. zfs_putpage() hands this to
 * zfs_log_write() for syncing writes, with the page as the callback argument.
 * It runs once the ZIL itx has been written to the log or the whole txg
 * syncs, or if the ZIL crashes or the pool suspends; any failure arrives in
 * `err` and is forwarded to zfs_page_writeback_done().
 */
static void
zfs_putpage_commit_cb(void *arg, int err)
{
	struct page *pp = arg;

	zfs_page_writeback_done(pp, err);
}
3731
/*
 * Push a page out to disk. Once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
 *
 *	IN:	ip	 - page mapped for inode.
 *		pp	 - page to push (page is locked)
 *		wbc	 - writeback control data
 *		for_sync - does the caller intend to wait synchronously for the
 *			   page writeback to complete?
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - the inode's current mtime and ctime are written to the SA
 *
 * The page is always unlocked by the time this function returns (after a
 * successful zfs_enter_verify_zp()). For for_sync callers the page stays in
 * the writeback state until zfs_putpage_commit_cb() runs.
 */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
    boolean_t for_sync)
{
	znode_t		*zp = ITOZ(ip);
	zfsvfs_t	*zfsvfs = ITOZSB(ip);
	loff_t		offset;
	loff_t		pgoff;
	unsigned int	pglen;
	dmu_tx_t	*tx;
	caddr_t		va;
	int		err = 0;
	uint64_t	mtime[2], ctime[2];
	inode_timespec_t tmp_ts;
	sa_bulk_attr_t	bulk[3];
	int		cnt = 0;
	struct address_space *mapping;

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);

	ASSERT(PageLocked(pp));

	pgoff = page_offset(pp);	/* Page byte-offset in file */
	offset = i_size_read(ip);	/* File length in bytes */
	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);

	/* Page is beyond end of file */
	if (pgoff >= offset) {
		unlock_page(pp);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Truncate page length to end of file */
	if (pgoff + pglen > offset)
		pglen = offset - pgoff;

#if 0
	/*
	 * FIXME: Allow mmap writes past its quota.  The correct fix
	 * is to register a page_mkwrite() handler to count the page
	 * against its quota when it is about to be dirtied.
	 */
	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
	    KUID_TO_SUID(ip->i_uid)) ||
	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
	    KGID_TO_SGID(ip->i_gid)) ||
	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
	    zp->z_projid))) {
		err = EDQUOT;
	}
#endif

	/*
	 * The ordering here is critical and must adhere to the following
	 * rules in order to avoid deadlocking in either zfs_read() or
	 * zfs_free_range() due to a lock inversion.
	 *
	 * 1) The page must be unlocked prior to acquiring the range lock.
	 *    This is critical because zfs_read() calls find_lock_page()
	 *    which may block on the page lock while holding the range lock.
	 *
	 * 2) Before setting or clearing write back on a page the range lock
	 *    must be held in order to prevent a lock inversion with the
	 *    zfs_free_range() function.
	 *
	 * This presents a problem because upon entering this function the
	 * page lock is already held.  To safely acquire the range lock the
	 * page lock must be dropped.  This creates a window where another
	 * process could truncate, invalidate, dirty, or write out the page.
	 *
	 * Therefore, after successfully reacquiring the range and page locks
	 * the current page state is checked.  In the common case everything
	 * will be as is expected and it can be written out.  However, if
	 * the page state has changed it must be handled accordingly.
	 */
	mapping = pp->mapping;
	redirty_page_for_writepage(wbc, pp);
	unlock_page(pp);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    pgoff, pglen, RL_WRITER);
	lock_page(pp);

	/* Page mapping changed or it was no longer dirty, we're done */
	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Another process already started writeback on this page; for a
	 * syncing writeback, wait for that writeback to finish.
	 */
	if (PageWriteback(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);

		if (wbc->sync_mode != WB_SYNC_NONE) {
			if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
				folio_wait_bit(page_folio(pp), PG_writeback);
#else
				wait_on_page_bit(pp, PG_writeback);
#endif
		}

		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/* Clear the dirty flag now that the required locks are held */
	if (!clear_page_dirty_for_io(pp)) {
		unlock_page(pp);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	/*
	 * Counterpart for redirty_page_for_writepage() above.  This page
	 * was in fact not skipped and should not be counted as if it were.
	 */
	wbc->pages_skipped--;
	set_page_writeback(pp);
	unlock_page(pp);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		/* Passing the error re-dirties the page and ends writeback. */
		zfs_page_writeback_done(pp, err);
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);

		/*
		 * Don't return error for an async writeback; we've re-dirtied
		 * the page so it will be tried again some other time.
		 */
		return (for_sync ? err : 0);
	}

	/* Copy the page contents into the DMU under the assigned tx. */
	va = kmap(pp);
	ASSERT3U(pglen, <=, PAGE_SIZE);
	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
	kunmap(pp);

	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/* Preserve the mtime and ctime provided by the inode */
	tmp_ts = zpl_inode_get_mtime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, mtime);
	tmp_ts = zpl_inode_get_ctime(ip);
	ZFS_TIME_ENCODE(&tmp_ts, ctime);
	zp->z_atime_dirty = B_FALSE;
	zp->z_seq++;

	/*
	 * The result is not checked here. It is what this function returns
	 * when need_commit is false; otherwise err is overwritten by the
	 * zil_commit_flags() result below.
	 */
	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

	/*
	 * A note about for_sync vs wbc->sync_mode.
	 *
	 * for_sync indicates that this is a syncing writeback, that is, kernel
	 * caller expects the data to be durably stored before being notified.
	 * Often, but not always, the call was triggered by a userspace syncing
	 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
	 * means that the page should remain "locked" (in the writeback state)
	 * until it is definitely on disk (ie zil_commit() or spa_sync()).
	 * Otherwise, we can unlock and return as soon as it is on the
	 * in-memory ZIL.
	 *
	 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
	 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
	 * indicates this is a regular async writeback (eg a cache eviction)
	 * and so does not need a durability guarantee, while WB_SYNC_ALL
	 * indicates a syncing op that must be waited on (by convention, we
	 * test for !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability
	 * over performance should there ever be a new mode that we have not
	 * yet added support for).
	 *
	 * So, why a separate for_sync field? This is because zpl_writepages()
	 * calls zfs_putpage() multiple times for a single "logical" operation.
	 * It wants all the individual pages to be for_sync==TRUE ie only
	 * unlocked once durably stored, but it only wants one call to
	 * zil_commit() at the very end, once all the pages are synced. So,
	 * it repurposes sync_mode slightly to indicate who will issue and
	 * wait for the IO: for NONE, the caller to zfs_putpage() will do it,
	 * while for ALL, zfs_putpage should do it.
	 *
	 * Summary:
	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
	 *   sync_mode: NONE=caller will commit; ALL=we will commit
	 */
	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);

	/*
	 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
	 * because it is a policy flag that indicates "someone will call
	 * zil_commit() soon". for_sync=TRUE means exactly that; the only
	 * question is whether it will be us, or zpl_writepages().
	 */
	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);

	if (!for_sync) {
		/*
		 * Async writeback is logged and written to the DMU, so page
		 * can now be unlocked.
		 */
		zfs_page_writeback_done(pp, 0);
	}

	dmu_tx_commit(tx);

	zfs_rangelock_exit(lr);

	if (need_commit) {
		err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
		if (err != 0) {
			/* Note: the write kstats are not updated on this path. */
			zfs_exit(zfsvfs, FTAG);
			return (err);
		}
	}

	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);

	zfs_exit(zfsvfs, FTAG);
	return (err);
}
3986
3987 /*
3988 * Update the system attributes when the inode has been dirtied. For the
3989 * moment we only update the mode, atime, mtime, and ctime.
3990 */
3991 int
zfs_dirty_inode(struct inode * ip,int flags)3992 zfs_dirty_inode(struct inode *ip, int flags)
3993 {
3994 znode_t *zp = ITOZ(ip);
3995 zfsvfs_t *zfsvfs = ITOZSB(ip);
3996 dmu_tx_t *tx;
3997 uint64_t mode, atime[2], mtime[2], ctime[2];
3998 inode_timespec_t tmp_ts;
3999 sa_bulk_attr_t bulk[4];
4000 int error = 0;
4001 int cnt = 0;
4002
4003 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
4004 return (0);
4005
4006 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4007 return (error);
4008
4009 #ifdef I_DIRTY_TIME
4010 /*
4011 * This is the lazytime semantic introduced in Linux 4.0
4012 * This flag will only be called from update_time when lazytime is set.
4013 * (Note, I_DIRTY_SYNC will also set if not lazytime)
4014 * Fortunately mtime and ctime are managed within ZFS itself, so we
4015 * only need to dirty atime.
4016 */
4017 if (flags == I_DIRTY_TIME) {
4018 zp->z_atime_dirty = B_TRUE;
4019 goto out;
4020 }
4021 #endif
4022
4023 tx = dmu_tx_create(zfsvfs->z_os);
4024
4025 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4026 zfs_sa_upgrade_txholds(tx, zp);
4027
4028 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4029 if (error) {
4030 dmu_tx_abort(tx);
4031 goto out;
4032 }
4033
4034 mutex_enter(&zp->z_lock);
4035 zp->z_atime_dirty = B_FALSE;
4036
4037 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
4038 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
4039 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
4040 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
4041
4042 /* Preserve the mode, mtime and ctime provided by the inode */
4043 tmp_ts = zpl_inode_get_atime(ip);
4044 ZFS_TIME_ENCODE(&tmp_ts, atime);
4045 tmp_ts = zpl_inode_get_mtime(ip);
4046 ZFS_TIME_ENCODE(&tmp_ts, mtime);
4047 tmp_ts = zpl_inode_get_ctime(ip);
4048 ZFS_TIME_ENCODE(&tmp_ts, ctime);
4049 mode = ip->i_mode;
4050
4051 zp->z_mode = mode;
4052
4053 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
4054 mutex_exit(&zp->z_lock);
4055
4056 dmu_tx_commit(tx);
4057 out:
4058 zfs_exit(zfsvfs, FTAG);
4059 return (error);
4060 }
4061
4062 void
zfs_inactive(struct inode * ip)4063 zfs_inactive(struct inode *ip)
4064 {
4065 znode_t *zp = ITOZ(ip);
4066 zfsvfs_t *zfsvfs = ITOZSB(ip);
4067 uint64_t atime[2];
4068 int error;
4069 int need_unlock = 0;
4070
4071 /* Only read lock if we haven't already write locked, e.g. rollback */
4072 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
4073 need_unlock = 1;
4074 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4075 }
4076 if (zp->z_sa_hdl == NULL) {
4077 if (need_unlock)
4078 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4079 return;
4080 }
4081
4082 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
4083 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4084
4085 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4086 zfs_sa_upgrade_txholds(tx, zp);
4087 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4088 if (error) {
4089 dmu_tx_abort(tx);
4090 } else {
4091 inode_timespec_t tmp_atime;
4092 tmp_atime = zpl_inode_get_atime(ip);
4093 ZFS_TIME_ENCODE(&tmp_atime, atime);
4094 mutex_enter(&zp->z_lock);
4095 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4096 (void *)&atime, sizeof (atime), tx);
4097 zp->z_atime_dirty = B_FALSE;
4098 mutex_exit(&zp->z_lock);
4099 dmu_tx_commit(tx);
4100 }
4101 }
4102
4103 zfs_zinactive(zp);
4104 if (need_unlock)
4105 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4106 }
4107
4108 /*
4109 * Fill pages with data from the disk.
4110 */
4111 static int
zfs_fillpage(struct inode * ip,struct page * pp)4112 zfs_fillpage(struct inode *ip, struct page *pp)
4113 {
4114 znode_t *zp = ITOZ(ip);
4115 zfsvfs_t *zfsvfs = ITOZSB(ip);
4116 loff_t i_size = i_size_read(ip);
4117 u_offset_t io_off = page_offset(pp);
4118 size_t io_len = PAGE_SIZE;
4119
4120 ASSERT3U(io_off, <, i_size);
4121
4122 if (io_off + io_len > i_size)
4123 io_len = i_size - io_off;
4124
4125 void *va = kmap(pp);
4126 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
4127 io_len, va, DMU_READ_PREFETCH);
4128 if (io_len != PAGE_SIZE)
4129 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
4130 kunmap(pp);
4131
4132 if (error) {
4133 /* convert checksum errors into IO errors */
4134 if (error == ECKSUM)
4135 error = SET_ERROR(EIO);
4136
4137 SetPageError(pp);
4138 ClearPageUptodate(pp);
4139 } else {
4140 ClearPageError(pp);
4141 SetPageUptodate(pp);
4142 }
4143
4144 return (error);
4145 }
4146
/*
 * Uses zfs_fillpage to read data from the file and fill the page.
 *
 *	IN:	ip	 - inode of file to get data from.
 *		pp	 - page to read (locked on entry; the page lock may
 *			   be dropped and retaken while waiting for the
 *			   rangelock)
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	ip - atime updated
 *	(NOTE(review): no atime update is performed in this function body;
 *	confirm where it happens.)
 */
int
zfs_getpage(struct inode *ip, struct page *pp)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	znode_t *zp = ITOZ(ip);
	int error;
	loff_t i_size = i_size_read(ip);
	u_offset_t io_off = page_offset(pp);
	size_t io_len = PAGE_SIZE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT3U(io_off, <, i_size);

	/* Clamp the locked range to end of file. */
	if (io_off + io_len > i_size)
		io_len = i_size - io_off;

	/*
	 * It is important to hold the rangelock here because it is possible
	 * a Direct I/O write or block clone might be taking place at the same
	 * time that a page is being faulted in through filemap_fault(). With
	 * Direct I/O writes and block cloning db->db_data will be set to NULL
	 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
	 * rangelock is not held, then there is a race between faulting in a
	 * page and writing out a Direct I/O write or block cloning. Without
	 * the rangelock a NULL pointer dereference can occur in
	 * dmu_read_impl() for db->db_data during the memcpy operation when
	 * zfs_fillpage() calls dmu_read().
	 */
	zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
	    io_off, io_len, RL_READER);
	if (lr == NULL) {
		/*
		 * It is important to drop the page lock before grabbing the
		 * rangelock to avoid another deadlock between here and
		 * zfs_write() -> update_pages(). update_pages() holds both the
		 * rangelock and the page lock.
		 *
		 * An extra page reference is held across the window in
		 * which the page is unlocked.
		 */
		get_page(pp);
		unlock_page(pp);
		lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
		    io_len, RL_READER);
		lock_page(pp);
		put_page(pp);
	}
	error = zfs_fillpage(ip, pp);
	zfs_rangelock_exit(lr);

	/* Account a full page read, regardless of the clamped io_len. */
	if (error == 0)
		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
4214
4215 /*
4216 * Check ZFS specific permissions to memory map a section of a file.
4217 *
4218 * IN: ip - inode of the file to mmap
4219 * off - file offset
4220 * addrp - start address in memory region
4221 * len - length of memory region
4222 * vm_flags- address flags
4223 *
4224 * RETURN: 0 if success
4225 * error code if failure
4226 */
4227 int
zfs_map(struct inode * ip,offset_t off,caddr_t * addrp,size_t len,unsigned long vm_flags)4228 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4229 unsigned long vm_flags)
4230 {
4231 (void) addrp;
4232 znode_t *zp = ITOZ(ip);
4233 zfsvfs_t *zfsvfs = ITOZSB(ip);
4234 int error;
4235
4236 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4237 return (error);
4238
4239 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
4240 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4241 zfs_exit(zfsvfs, FTAG);
4242 return (SET_ERROR(EPERM));
4243 }
4244
4245 if ((vm_flags & (VM_READ | VM_EXEC)) &&
4246 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4247 zfs_exit(zfsvfs, FTAG);
4248 return (SET_ERROR(EACCES));
4249 }
4250
4251 if (off < 0 || len > MAXOFFSET_T - off) {
4252 zfs_exit(zfsvfs, FTAG);
4253 return (SET_ERROR(ENXIO));
4254 }
4255
4256 zfs_exit(zfsvfs, FTAG);
4257 return (0);
4258 }
4259
4260 /*
4261 * Free or allocate space in a file. Currently, this function only
4262 * supports the `F_FREESP' command. However, this command is somewhat
4263 * misnamed, as its functionality includes the ability to allocate as
4264 * well as free space.
4265 *
4266 * IN: zp - znode of file to free data in.
4267 * cmd - action to take (only F_FREESP supported).
4268 * bfp - section of file to free/alloc.
4269 * flag - current file open mode flags.
4270 * offset - current file offset.
4271 * cr - credentials of caller.
4272 *
4273 * RETURN: 0 on success, error code on failure.
4274 *
4275 * Timestamps:
4276 * zp - ctime|mtime updated
4277 */
4278 int
zfs_space(znode_t * zp,int cmd,flock64_t * bfp,int flag,offset_t offset,cred_t * cr)4279 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4280 offset_t offset, cred_t *cr)
4281 {
4282 (void) offset;
4283 zfsvfs_t *zfsvfs = ZTOZSB(zp);
4284 uint64_t off, len;
4285 int error;
4286
4287 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4288 return (error);
4289
4290 if (cmd != F_FREESP) {
4291 zfs_exit(zfsvfs, FTAG);
4292 return (SET_ERROR(EINVAL));
4293 }
4294
4295 /*
4296 * Callers might not be able to detect properly that we are read-only,
4297 * so check it explicitly here.
4298 */
4299 if (zfs_is_readonly(zfsvfs)) {
4300 zfs_exit(zfsvfs, FTAG);
4301 return (SET_ERROR(EROFS));
4302 }
4303
4304 if (bfp->l_len < 0) {
4305 zfs_exit(zfsvfs, FTAG);
4306 return (SET_ERROR(EINVAL));
4307 }
4308
4309 /*
4310 * Permissions aren't checked on Solaris because on this OS
4311 * zfs_space() can only be called with an opened file handle.
4312 * On Linux we can get here through truncate_range() which
4313 * operates directly on inodes, so we need to check access rights.
4314 */
4315 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4316 zfs_init_idmap))) {
4317 zfs_exit(zfsvfs, FTAG);
4318 return (error);
4319 }
4320
4321 off = bfp->l_start;
4322 len = bfp->l_len; /* 0 means from off to end of file */
4323
4324 error = zfs_freesp(zp, off, len, flag, TRUE);
4325
4326 zfs_exit(zfsvfs, FTAG);
4327 return (error);
4328 }
4329
4330 int
zfs_fid(struct inode * ip,fid_t * fidp)4331 zfs_fid(struct inode *ip, fid_t *fidp)
4332 {
4333 znode_t *zp = ITOZ(ip);
4334 zfsvfs_t *zfsvfs = ITOZSB(ip);
4335 uint32_t gen;
4336 uint64_t gen64;
4337 uint64_t object = zp->z_id;
4338 zfid_short_t *zfid;
4339 int size, i, error;
4340
4341 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4342 return (error);
4343
4344 if (fidp->fid_len < SHORT_FID_LEN) {
4345 fidp->fid_len = SHORT_FID_LEN;
4346 zfs_exit(zfsvfs, FTAG);
4347 return (SET_ERROR(ENOSPC));
4348 }
4349
4350 if ((error = zfs_verify_zp(zp)) != 0) {
4351 zfs_exit(zfsvfs, FTAG);
4352 return (error);
4353 }
4354
4355 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4356 &gen64, sizeof (uint64_t))) != 0) {
4357 zfs_exit(zfsvfs, FTAG);
4358 return (error);
4359 }
4360
4361 gen = (uint32_t)gen64;
4362
4363 size = SHORT_FID_LEN;
4364
4365 zfid = (zfid_short_t *)fidp;
4366
4367 zfid->zf_len = size;
4368
4369 for (i = 0; i < sizeof (zfid->zf_object); i++)
4370 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4371
4372 /* Must have a non-zero generation number to distinguish from .zfs */
4373 if (gen == 0)
4374 gen = 1;
4375 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4376 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4377
4378 zfs_exit(zfsvfs, FTAG);
4379 return (0);
4380 }
4381
#if defined(_KERNEL)
/* Kernel builds only: export the ZPL operation entry points. */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/* Tunable: zfs_delete_blocks, an unsigned long with permissions 0644. */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
#endif
4409