1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2025, Klara, Inc.
29 */
30
31 /* Portions Copyright 2007 Jeremy Teo */
32 /* Portions Copyright 2010 Robert Milkowski */
33
34 #include <sys/param.h>
35 #include <sys/time.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/resource.h>
39 #include <security/mac/mac_framework.h>
40 #include <sys/vfs.h>
41 #include <sys/endian.h>
42 #include <sys/vm.h>
43 #include <sys/vnode.h>
44 #include <sys/smr.h>
45 #include <sys/dirent.h>
46 #include <sys/file.h>
47 #include <sys/stat.h>
48 #include <sys/kmem.h>
49 #include <sys/taskq.h>
50 #include <sys/uio.h>
51 #include <sys/atomic.h>
52 #include <sys/namei.h>
53 #include <sys/mman.h>
54 #include <sys/cmn_err.h>
55 #include <sys/kdb.h>
56 #include <sys/sysproto.h>
57 #include <sys/errno.h>
58 #include <sys/unistd.h>
59 #include <sys/zfs_dir.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/fs/zfs.h>
62 #include <sys/dmu.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/spa.h>
65 #include <sys/txg.h>
66 #include <sys/dbuf.h>
67 #include <sys/zap.h>
68 #include <sys/sa.h>
69 #include <sys/policy.h>
70 #include <sys/sunddi.h>
71 #include <sys/filio.h>
72 #include <sys/sid.h>
73 #include <sys/zfs_ctldir.h>
74 #include <sys/zfs_fuid.h>
75 #include <sys/zfs_quota.h>
76 #include <sys/zfs_sa.h>
77 #include <sys/zfs_rlock.h>
78 #include <sys/zfs_project.h>
79 #include <sys/bio.h>
80 #include <sys/buf.h>
81 #include <sys/sched.h>
82 #include <sys/acl.h>
83 #include <sys/vmmeter.h>
84 #include <vm/vm_param.h>
85 #include <sys/zil.h>
86 #include <sys/zfs_vnops.h>
87 #include <sys/module.h>
88 #include <sys/sysent.h>
89 #include <sys/dmu_impl.h>
90 #include <sys/brt.h>
91 #include <sys/zfeature.h>
92
93 #include <vm/vm_object.h>
94
95 #include <sys/extattr.h>
96 #include <sys/priv.h>
97
98 #ifndef VN_OPEN_INVFS
99 #define VN_OPEN_INVFS 0x0
100 #endif
101
102 VFS_SMR_DECLARE;
103
104 #ifdef DEBUG_VFS_LOCKS
105 #define VNCHECKREF(vp) \
106 VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \
107 ("%s: wrong ref counts", __func__));
108 #else
109 #define VNCHECKREF(vp)
110 #endif
111
112 #if __FreeBSD_version >= 1400045
113 typedef uint64_t cookie_t;
114 #else
115 typedef ulong_t cookie_t;
116 #endif
117
118 static int zfs_check_attrname(const char *name);
119
120 /*
121 * Programming rules.
122 *
123 * Each vnode op performs some logical unit of work. To do this, the ZPL must
124 * properly lock its in-core state, create a DMU transaction, do the work,
125 * record this work in the intent log (ZIL), commit the DMU transaction,
126 * and wait for the intent log to commit if it is a synchronous operation.
127 * Moreover, the vnode ops must work in both normal and log replay context.
128 * The ordering of events is important to avoid deadlocks and references
129 * to freed memory. The example below illustrates the following Big Rules:
130 *
131 * (1) A check must be made in each zfs thread for a mounted file system.
132 * This is done avoiding races using zfs_enter(zfsvfs).
133 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes
134 * must be checked with zfs_verify_zp(zp). Both of these macros
135 * can return EIO from the calling function.
136 *
137 * (2) VN_RELE() should always be the last thing except for zil_commit()
138 * (if necessary) and zfs_exit(). This is for 3 reasons:
139 * First, if it's the last reference, the vnode/znode
140 * can be freed, so the zp may point to freed memory. Second, the last
141 * reference will call zfs_zinactive(), which may induce a lot of work --
142 * pushing cached pages (which acquires range locks) and syncing out
143 * cached atime changes. Third, zfs_zinactive() may require a new tx,
144 * which could deadlock the system if you were already holding one.
145 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
146 *
147 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
148 * as they can span dmu_tx_assign() calls.
149 *
150 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
151 * dmu_tx_assign(). This is critical because we don't want to block
152 * while holding locks.
153 *
154 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
155 * This reduces lock contention and CPU usage when we must wait (note
156 * that if throughput is constrained by the storage, nearly every
157 * transaction must wait).
158 *
159 * Note, in particular, that if a lock is sometimes acquired before
160 * the tx assigns, and sometimes after (e.g. z_lock), then failing
161 * to use a non-blocking assign can deadlock the system. The scenario:
162 *
163 * Thread A has grabbed a lock before calling dmu_tx_assign().
164 * Thread B is in an already-assigned tx, and blocks for this lock.
165 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
166 * txg_wait_open() forever, because the previous txg can't quiesce
167 * until B's tx commits.
168 *
169 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
170 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
171 * again. On subsequent calls to dmu_tx_assign(), pass
172 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
173 * this operation has already called dmu_tx_wait(). This will ensure
174 * that we don't retry forever, waiting a short bit each time.
175 *
176 * (5) If the operation succeeded, generate the intent log entry for it
177 * before dropping locks. This ensures that the ordering of events
178 * in the intent log matches the order in which they actually occurred.
179 * During ZIL replay the zfs_log_* functions will update the sequence
180 * number to indicate the zil transaction has replayed.
181 *
182 * (6) At the end of each vnode op, the DMU tx must always commit,
183 * regardless of whether there were any errors.
184 *
185 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
186 * to ensure that synchronous semantics are provided when necessary.
187 *
188 * In general, this is how things should be ordered in each vnode op:
189 *
190 * zfs_enter(zfsvfs); // exit if unmounted
191 * top:
192 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
193 * rw_enter(...); // grab any other locks you need
194 * tx = dmu_tx_create(...); // get DMU tx
195 * dmu_tx_hold_*(); // hold each object you might modify
196 * error = dmu_tx_assign(tx,
197 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
198 * if (error) {
199 * rw_exit(...); // drop locks
200 * zfs_dirent_unlock(dl); // unlock directory entry
201 * VN_RELE(...); // release held vnodes
202 * if (error == ERESTART) {
203 * waited = B_TRUE;
204 * dmu_tx_wait(tx);
205 * dmu_tx_abort(tx);
206 * goto top;
207 * }
208 * dmu_tx_abort(tx); // abort DMU tx
209 * zfs_exit(zfsvfs); // finished in zfs
210 * return (error); // really out of space
211 * }
212 * error = do_real_work(); // do whatever this VOP does
213 * if (error == 0)
214 * zfs_log_*(...); // on success, make ZIL entry
215 * dmu_tx_commit(tx); // commit DMU tx -- error or not
216 * rw_exit(...); // drop locks
217 * zfs_dirent_unlock(dl); // unlock directory entry
218 * VN_RELE(...); // release held vnodes
219 * zil_commit(zilog, foid); // synchronous when necessary
220 * zfs_exit(zfsvfs); // finished in zfs
221 * return (error); // done, report error
222 */
223 static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr)224 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
225 {
226 (void) cr;
227 znode_t *zp = VTOZ(*vpp);
228 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
229 int error;
230
231 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
232 return (error);
233
234 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
235 ((flag & FAPPEND) == 0)) {
236 zfs_exit(zfsvfs, FTAG);
237 return (SET_ERROR(EPERM));
238 }
239
240 /*
241 * Keep a count of the synchronous opens in the znode. On first
242 * synchronous open we must convert all previous async transactions
243 * into sync to keep correct ordering.
244 */
245 if (flag & O_SYNC) {
246 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
247 zil_async_to_sync(zfsvfs->z_log, zp->z_id);
248 }
249
250 zfs_exit(zfsvfs, FTAG);
251 return (0);
252 }
253
254 static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr)255 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
256 {
257 (void) offset, (void) cr;
258 znode_t *zp = VTOZ(vp);
259 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
260 int error;
261
262 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
263 return (error);
264
265 /* Decrement the synchronous opens in the znode */
266 if ((flag & O_SYNC) && (count == 1))
267 atomic_dec_32(&zp->z_sync_cnt);
268
269 zfs_exit(zfsvfs, FTAG);
270 return (0);
271 }
272
273 static int
zfs_ioctl_getxattr(vnode_t * vp,zfsxattr_t * fsx)274 zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx)
275 {
276 znode_t *zp = VTOZ(vp);
277
278 memset(fsx, 0, sizeof (*fsx));
279 fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ?
280 ZFS_PROJINHERIT_FL : 0;
281 fsx->fsx_projid = zp->z_projid;
282
283 return (0);
284 }
285
/*
 * Translate FS_IOC-style attribute flags from a ZFS_IOC_FSSETXATTR
 * request into an xvattr suitable for zfs_setattr().  Only the
 * project-inherit flag is currently supported; any other bit yields
 * EOPNOTSUPP.  An XVA request bit is set only for a flag whose state
 * actually differs from the znode's current z_pflags.
 */
static int
zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = VTOZ(vp)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(ZFS_PROJINHERIT_FL))
		return (SET_ERROR(EOPNOTSUPP));

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

/*
 * Request a change for the given attribute only when the ioctl flag and
 * the current on-disk flag disagree, i.e. the bit is being toggled in
 * either direction.
 */
#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {		\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));				\
		(xfield) = ((ioctl_flags & (iflag)) != 0);		\
	}								\
} while (0)

	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}
313
314 static int
zfs_ioctl_setxattr(vnode_t * vp,zfsxattr_t * fsx,cred_t * cr)315 zfs_ioctl_setxattr(vnode_t *vp, zfsxattr_t *fsx, cred_t *cr)
316 {
317 znode_t *zp = VTOZ(vp);
318 xvattr_t xva;
319 xoptattr_t *xoap;
320 int err;
321
322 if (!zpl_is_valid_projid(fsx->fsx_projid))
323 return (SET_ERROR(EINVAL));
324
325 err = zfs_ioctl_setflags(vp, fsx->fsx_xflags, &xva);
326 if (err)
327 return (err);
328
329 xoap = xva_getxoptattr(&xva);
330 XVA_SET_REQ(&xva, XAT_PROJID);
331 xoap->xoa_projid = fsx->fsx_projid;
332
333 err = zfs_setattr(zp, (vattr_t *)&xva, 0, cr, NULL);
334
335 return (err);
336 }
337
338 static int
zfs_ioctl(vnode_t * vp,ulong_t com,intptr_t data,int flag,cred_t * cred,int * rvalp)339 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
340 int *rvalp)
341 {
342 (void) flag, (void) cred, (void) rvalp;
343 loff_t off;
344 int error;
345
346 switch (com) {
347 case _FIOFFS:
348 {
349 return (0);
350
351 /*
352 * The following two ioctls are used by bfu. Faking out,
353 * necessary to avoid bfu errors.
354 */
355 }
356 case _FIOGDIO:
357 case _FIOSDIO:
358 {
359 return (0);
360 }
361
362 case F_SEEK_DATA:
363 case F_SEEK_HOLE:
364 {
365 off = *(offset_t *)data;
366 error = vn_lock(vp, LK_SHARED);
367 if (error)
368 return (error);
369 /* offset parameter is in/out */
370 error = zfs_holey(VTOZ(vp), com, &off);
371 VOP_UNLOCK(vp);
372 if (error)
373 return (error);
374 *(offset_t *)data = off;
375 return (0);
376 }
377 case ZFS_IOC_FSGETXATTR: {
378 zfsxattr_t *fsx = (zfsxattr_t *)data;
379 error = vn_lock(vp, LK_SHARED);
380 if (error)
381 return (error);
382 error = zfs_ioctl_getxattr(vp, fsx);
383 VOP_UNLOCK(vp);
384 return (error);
385 }
386 case ZFS_IOC_FSSETXATTR: {
387 zfsxattr_t *fsx = (zfsxattr_t *)data;
388 error = vn_lock(vp, LK_EXCLUSIVE);
389 if (error)
390 return (error);
391 error = zfs_ioctl_setxattr(vp, fsx, cred);
392 VOP_UNLOCK(vp);
393 return (error);
394 }
395 case ZFS_IOC_REWRITE: {
396 zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
397 if ((flag & FWRITE) == 0)
398 return (SET_ERROR(EBADF));
399 error = vn_lock(vp, LK_SHARED);
400 if (error)
401 return (error);
402 error = zfs_rewrite(VTOZ(vp), args->off, args->len,
403 args->flags, args->arg);
404 VOP_UNLOCK(vp);
405 return (error);
406 }
407 }
408 return (SET_ERROR(ENOTTY));
409 }
410
411 static vm_page_t
page_busy(vnode_t * vp,int64_t start,int64_t off,int64_t nbytes)412 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
413 {
414 vm_object_t obj;
415 vm_page_t pp;
416 int64_t end;
417
418 /*
419 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
420 * aligned boundaries, if the range is not aligned. As a result a
421 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
422 * It may happen that all DEV_BSIZE subranges are marked clean and thus
423 * the whole page would be considered clean despite have some
424 * dirty data.
425 * For this reason we should shrink the range to DEV_BSIZE aligned
426 * boundaries before calling vm_page_clear_dirty.
427 */
428 end = rounddown2(off + nbytes, DEV_BSIZE);
429 off = roundup2(off, DEV_BSIZE);
430 nbytes = end - off;
431
432 obj = vp->v_object;
433 vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
434 VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
435 VM_ALLOC_IGN_SBUSY);
436 if (pp != NULL) {
437 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
438 vm_object_pip_add(obj, 1);
439 pmap_remove_write(pp);
440 if (nbytes != 0)
441 vm_page_clear_dirty(pp, off, nbytes);
442 }
443 return (pp);
444 }
445
446 static void
page_unbusy(vm_page_t pp)447 page_unbusy(vm_page_t pp)
448 {
449
450 vm_page_sunbusy(pp);
451 vm_object_pip_wakeup(pp->object);
452 }
453
454 static vm_page_t
page_hold(vnode_t * vp,int64_t start)455 page_hold(vnode_t *vp, int64_t start)
456 {
457 vm_object_t obj;
458 vm_page_t m;
459
460 obj = vp->v_object;
461 vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
462 VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
463 VM_ALLOC_NOBUSY);
464 return (m);
465 }
466
467 static void
page_unhold(vm_page_t pp)468 page_unhold(vm_page_t pp)
469 {
470 vm_page_unwire(pp, PQ_ACTIVE);
471 }
472
473 /*
474 * When a file is memory mapped, we must keep the IO data synchronized
475 * between the DMU cache and the memory mapped pages. What this means:
476 *
477 * On Write: If we find a memory mapped page, we write to *both*
478 * the page and the dmu buffer.
479 */
480 void
update_pages(znode_t * zp,int64_t start,int len,objset_t * os)481 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
482 {
483 vm_object_t obj;
484 struct sf_buf *sf;
485 vnode_t *vp = ZTOV(zp);
486 caddr_t va;
487 int off;
488
489 ASSERT3P(vp->v_mount, !=, NULL);
490 obj = vp->v_object;
491 ASSERT3P(obj, !=, NULL);
492
493 off = start & PAGEOFFSET;
494 vm_object_pip_add(obj, 1);
495 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
496 vm_page_t pp;
497 int nbytes = imin(PAGESIZE - off, len);
498
499 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
500 va = zfs_map_page(pp, &sf);
501 (void) dmu_read(os, zp->z_id, start + off, nbytes,
502 va + off, DMU_READ_PREFETCH);
503 zfs_unmap_page(sf);
504 page_unbusy(pp);
505 }
506 len -= nbytes;
507 off = 0;
508 }
509 vm_object_pip_wakeup(obj);
510 }
511
512 /*
513 * Read with UIO_NOCOPY flag means that sendfile(2) requests
514 * ZFS to populate a range of page cache pages with data.
515 *
516 * NOTE: this function could be optimized to pre-allocate
517 * all pages in advance, drain exclusive busy on all of them,
518 * map them into contiguous KVA region and populate them
519 * in one single dmu_read() call.
520 */
521 int
mappedread_sf(znode_t * zp,int nbytes,zfs_uio_t * uio)522 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
523 {
524 vnode_t *vp = ZTOV(zp);
525 objset_t *os = zp->z_zfsvfs->z_os;
526 struct sf_buf *sf;
527 vm_object_t obj;
528 vm_page_t pp;
529 int64_t start;
530 caddr_t va;
531 int len = nbytes;
532 int error = 0;
533
534 ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
535 ASSERT3P(vp->v_mount, !=, NULL);
536 obj = vp->v_object;
537 ASSERT3P(obj, !=, NULL);
538 ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
539
540 for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
541 int bytes = MIN(PAGESIZE, len);
542
543 pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
544 VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
545 if (vm_page_none_valid(pp)) {
546 va = zfs_map_page(pp, &sf);
547 error = dmu_read(os, zp->z_id, start, bytes, va,
548 DMU_READ_PREFETCH);
549 if (bytes != PAGESIZE && error == 0)
550 memset(va + bytes, 0, PAGESIZE - bytes);
551 zfs_unmap_page(sf);
552 if (error == 0) {
553 vm_page_valid(pp);
554 vm_page_activate(pp);
555 vm_page_sunbusy(pp);
556 } else {
557 zfs_vmobject_wlock(obj);
558 if (!vm_page_wired(pp) && pp->valid == 0 &&
559 vm_page_busy_tryupgrade(pp))
560 vm_page_free(pp);
561 else {
562 vm_page_deactivate_noreuse(pp);
563 vm_page_sunbusy(pp);
564 }
565 zfs_vmobject_wunlock(obj);
566 }
567 } else {
568 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
569 vm_page_sunbusy(pp);
570 }
571 if (error)
572 break;
573 zfs_uio_advance(uio, bytes);
574 len -= bytes;
575 }
576 return (error);
577 }
578
579 /*
580 * When a file is memory mapped, we must keep the IO data synchronized
581 * between the DMU cache and the memory mapped pages. What this means:
582 *
583 * On Read: We "read" preferentially from memory mapped pages,
584 * else we default from the dmu buffer.
585 *
586 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
587 * the file is memory mapped.
588 */
589 int
mappedread(znode_t * zp,int nbytes,zfs_uio_t * uio)590 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
591 {
592 vnode_t *vp = ZTOV(zp);
593 vm_object_t obj;
594 int64_t start;
595 int len = nbytes;
596 int off;
597 int error = 0;
598
599 ASSERT3P(vp->v_mount, !=, NULL);
600 obj = vp->v_object;
601 ASSERT3P(obj, !=, NULL);
602
603 start = zfs_uio_offset(uio);
604 off = start & PAGEOFFSET;
605 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
606 vm_page_t pp;
607 uint64_t bytes = MIN(PAGESIZE - off, len);
608
609 if ((pp = page_hold(vp, start))) {
610 struct sf_buf *sf;
611 caddr_t va;
612
613 va = zfs_map_page(pp, &sf);
614 error = vn_io_fault_uiomove(va + off, bytes,
615 GET_UIO_STRUCT(uio));
616 zfs_unmap_page(sf);
617 page_unhold(pp);
618 } else {
619 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
620 uio, bytes, DMU_READ_PREFETCH);
621 }
622 len -= bytes;
623 off = 0;
624 if (error)
625 break;
626 }
627 return (error);
628 }
629
630 int
zfs_write_simple(znode_t * zp,const void * data,size_t len,loff_t pos,size_t * presid)631 zfs_write_simple(znode_t *zp, const void *data, size_t len,
632 loff_t pos, size_t *presid)
633 {
634 int error = 0;
635 ssize_t resid;
636
637 error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
638 UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
639
640 if (error) {
641 return (SET_ERROR(error));
642 } else if (presid == NULL) {
643 if (resid != 0) {
644 error = SET_ERROR(EIO);
645 }
646 } else {
647 *presid = resid;
648 }
649 return (error);
650 }
651
652 void
zfs_zrele_async(znode_t * zp)653 zfs_zrele_async(znode_t *zp)
654 {
655 vnode_t *vp = ZTOV(zp);
656 objset_t *os = ITOZSB(vp)->z_os;
657
658 VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
659 }
660
/*
 * vn_vget_ino_gen() callback used for ".." lookups: 'arg' is the
 * already-referenced parent vnode; lock it with the requested flags,
 * dropping the reference if the lock cannot be obtained.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int err;

	*vpp = arg;
	err = vn_lock(*vpp, lkflags);
	if (err != 0)
		vrele(*vpp);
	return (err);
}
672
/*
 * Lock the vnode 'vp' produced by a lookup in directory 'dvp',
 * following the FreeBSD lookup locking protocol for the "", "." and
 * ".." special cases.  On failure the reference held on the vnode
 * being locked is dropped.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
	int error;
	int ltype;

	/* Outside of ZIL replay the directory must already be locked. */
	if (zfsvfs->z_replay == B_FALSE)
		ASSERT_VOP_LOCKED(dvp, __func__);

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/*
		 * Self-lookup: dvp and vp are the same vnode, which is
		 * already locked.  Take a new reference and convert the
		 * lock type if the caller asked for a different one.
		 */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (VN_IS_DOOMED(dvp)) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child: just lock it, dropping the ref on error. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}
734
735 /*
736 * Lookup an entry in a directory, or an extended attribute directory.
737 * If it exists, return a held vnode reference for it.
738 *
739 * IN: dvp - vnode of directory to search.
740 * nm - name of entry to lookup.
741 * pnp - full pathname to lookup [UNUSED].
742 * flags - LOOKUP_XATTR set if looking for an attribute.
743 * rdir - root directory vnode [UNUSED].
744 * cr - credentials of caller.
745 * ct - caller context
746 *
747 * OUT: vpp - vnode of located entry, NULL if not found.
748 *
749 * RETURN: 0 on success, error code on failure.
750 *
751 * Timestamps:
752 * NA
753 */
754 static int
zfs_lookup(vnode_t * dvp,const char * nm,vnode_t ** vpp,struct componentname * cnp,int nameiop,cred_t * cr,int flags,boolean_t cached)755 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
756 struct componentname *cnp, int nameiop, cred_t *cr, int flags,
757 boolean_t cached)
758 {
759 znode_t *zdp = VTOZ(dvp);
760 znode_t *zp;
761 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
762 seqc_t dvp_seqc;
763 int error = 0;
764
765 /*
766 * Fast path lookup, however we must skip DNLC lookup
767 * for case folding or normalizing lookups because the
768 * DNLC code only stores the passed in name. This means
769 * creating 'a' and removing 'A' on a case insensitive
770 * file system would work, but DNLC still thinks 'a'
771 * exists and won't let you create it again on the next
772 * pass through fast path.
773 */
774 if (!(flags & LOOKUP_XATTR)) {
775 if (dvp->v_type != VDIR) {
776 return (SET_ERROR(ENOTDIR));
777 } else if (zdp->z_sa_hdl == NULL) {
778 return (SET_ERROR(EIO));
779 }
780 }
781
782 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
783 const char *, nm);
784
785 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
786 return (error);
787
788 dvp_seqc = vn_seqc_read_notmodify(dvp);
789
790 *vpp = NULL;
791
792 if (flags & LOOKUP_XATTR) {
793 /*
794 * If the xattr property is off, refuse the lookup request.
795 */
796 if (!(zfsvfs->z_flags & ZSB_XATTR)) {
797 zfs_exit(zfsvfs, FTAG);
798 return (SET_ERROR(EOPNOTSUPP));
799 }
800
801 /*
802 * We don't allow recursive attributes..
803 * Maybe someday we will.
804 */
805 if (zdp->z_pflags & ZFS_XATTR) {
806 zfs_exit(zfsvfs, FTAG);
807 return (SET_ERROR(EINVAL));
808 }
809
810 if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
811 zfs_exit(zfsvfs, FTAG);
812 return (error);
813 }
814 *vpp = ZTOV(zp);
815
816 /*
817 * Do we have permission to get into attribute directory?
818 */
819 if (flags & LOOKUP_NAMED_ATTR)
820 error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
821 B_FALSE, cr, NULL);
822 else
823 error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
824 NULL);
825 if (error) {
826 vrele(ZTOV(zp));
827 }
828
829 zfs_exit(zfsvfs, FTAG);
830 return (error);
831 }
832
833 /*
834 * Check accessibility of directory if we're not coming in via
835 * VOP_CACHEDLOOKUP.
836 */
837 if (!cached) {
838 #ifdef NOEXECCHECK
839 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
840 cnp->cn_flags &= ~NOEXECCHECK;
841 } else
842 #endif
843 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
844 NULL))) {
845 zfs_exit(zfsvfs, FTAG);
846 return (error);
847 }
848 }
849
850 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
851 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
852 zfs_exit(zfsvfs, FTAG);
853 return (SET_ERROR(EILSEQ));
854 }
855
856
857 /*
858 * First handle the special cases.
859 */
860 if ((cnp->cn_flags & ISDOTDOT) != 0) {
861 /*
862 * If we are a snapshot mounted under .zfs, return
863 * the vp for the snapshot directory.
864 */
865 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
866 struct componentname cn;
867 vnode_t *zfsctl_vp;
868 int ltype;
869
870 zfs_exit(zfsvfs, FTAG);
871 ltype = VOP_ISLOCKED(dvp);
872 VOP_UNLOCK(dvp);
873 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
874 &zfsctl_vp);
875 if (error == 0) {
876 cn.cn_nameptr = "snapshot";
877 cn.cn_namelen = strlen(cn.cn_nameptr);
878 cn.cn_nameiop = cnp->cn_nameiop;
879 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
880 cn.cn_lkflags = cnp->cn_lkflags;
881 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
882 vput(zfsctl_vp);
883 }
884 vn_lock(dvp, ltype | LK_RETRY);
885 return (error);
886 }
887 }
888 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
889 zfs_exit(zfsvfs, FTAG);
890 if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED)
891 return (SET_ERROR(ENOENT));
892 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
893 return (SET_ERROR(ENOTSUP));
894 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
895 return (error);
896 }
897
898 /*
899 * The loop is retry the lookup if the parent-child relationship
900 * changes during the dot-dot locking complexities.
901 */
902 for (;;) {
903 uint64_t parent;
904
905 error = zfs_dirlook(zdp, nm, &zp);
906 if (error == 0)
907 *vpp = ZTOV(zp);
908
909 zfs_exit(zfsvfs, FTAG);
910 if (error != 0)
911 break;
912
913 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
914 if (error != 0) {
915 /*
916 * If we've got a locking error, then the vnode
917 * got reclaimed because of a force unmount.
918 * We never enter doomed vnodes into the name cache.
919 */
920 *vpp = NULL;
921 return (error);
922 }
923
924 if ((cnp->cn_flags & ISDOTDOT) == 0)
925 break;
926
927 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
928 vput(ZTOV(zp));
929 *vpp = NULL;
930 return (error);
931 }
932 if (zdp->z_sa_hdl == NULL) {
933 error = SET_ERROR(EIO);
934 } else {
935 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
936 &parent, sizeof (parent));
937 }
938 if (error != 0) {
939 zfs_exit(zfsvfs, FTAG);
940 vput(ZTOV(zp));
941 break;
942 }
943 if (zp->z_id == parent) {
944 zfs_exit(zfsvfs, FTAG);
945 break;
946 }
947 vput(ZTOV(zp));
948 }
949
950 if (error != 0)
951 *vpp = NULL;
952
953 /* Translate errors and add SAVENAME when needed. */
954 if (cnp->cn_flags & ISLASTCN) {
955 switch (nameiop) {
956 case CREATE:
957 case RENAME:
958 if (error == ENOENT) {
959 error = EJUSTRETURN;
960 #if __FreeBSD_version < 1400068
961 cnp->cn_flags |= SAVENAME;
962 #endif
963 break;
964 }
965 zfs_fallthrough;
966 case DELETE:
967 #if __FreeBSD_version < 1400068
968 if (error == 0)
969 cnp->cn_flags |= SAVENAME;
970 #endif
971 break;
972 }
973 }
974
975 if ((cnp->cn_flags & ISDOTDOT) != 0) {
976 /*
977 * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
978 * handle races. In particular different callers may end up
979 * with different vnodes and will try to add conflicting
980 * entries to the namecache.
981 *
982 * While finding different result may be acceptable in face
983 * of concurrent modification, adding conflicting entries
984 * trips over an assert in the namecache.
985 *
986 * Ultimately let an entry through once everything settles.
987 */
988 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
989 cnp->cn_flags &= ~MAKEENTRY;
990 }
991 }
992
993 /* Insert name into cache (as non-existent) if appropriate. */
994 if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
995 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
996 cache_enter(dvp, NULL, cnp);
997
998 /* Insert name into cache if appropriate. */
999 if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
1000 error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1001 if (!(cnp->cn_flags & ISLASTCN) ||
1002 (nameiop != DELETE && nameiop != RENAME)) {
1003 cache_enter(dvp, *vpp, cnp);
1004 }
1005 }
1006
1007 return (error);
1008 }
1009
1010 static inline bool
is_nametoolong(zfsvfs_t * zfsvfs,const char * name)1011 is_nametoolong(zfsvfs_t *zfsvfs, const char *name)
1012 {
1013 size_t dlen = strlen(name);
1014 return ((!zfsvfs->z_longname && dlen >= ZAP_MAXNAMELEN) ||
1015 dlen >= ZAP_MAXNAMELEN_NEW);
1016 }
1017
1018 /*
1019 * Attempt to create a new entry in a directory. If the entry
1020 * already exists, truncate the file if permissible, else return
1021 * an error. Return the vp of the created or trunc'd file.
1022 *
1023 * IN: dvp - vnode of directory to put new file entry in.
1024 * name - name of new file entry.
1025 * vap - attributes of new file.
1026 * excl - flag indicating exclusive or non-exclusive mode.
1027 * mode - mode to open file with.
1028 * cr - credentials of caller.
1029 * flag - large file flag [UNUSED].
1030 * ct - caller context
1031 * vsecp - ACL to be set
1032 * mnt_ns - Unused on FreeBSD
1033 *
1034 * OUT: vpp - vnode of created or trunc'd entry.
1035 *
1036 * RETURN: 0 on success, error code on failure.
1037 *
1038 * Timestamps:
1039 * dvp - ctime|mtime updated if new entry created
1040 * vp - ctime|mtime always, atime if new
1041 */
int
zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
    znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
	(void) excl, (void) mode, (void) flag;
	znode_t *zp;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	objset_t *os;
	dmu_tx_t *tx;
	int error;
	uid_t uid = crgetuid(cr);
	gid_t gid = crgetgid(cr);
	uint64_t projid = ZFS_DEFAULT_PROJID;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	uint64_t txtype;
#ifdef DEBUG_VFS_LOCKS
	vnode_t *dvp = ZTOV(dzp);
#endif

	/* Reject names longer than this dataset's ZAP name limit. */
	if (is_nametoolong(zfsvfs, name))
		return (SET_ERROR(ENAMETOOLONG));

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* With utf8only=on, the new name must be valid UTF-8. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	/* Optional attributes require a policy check before use. */
	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	*zpp = NULL;

	/* Strip the sticky bit if the caller may not set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: lookup must NOT find an existing entry (else EEXIST). */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	ASSERT0P(zp);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/* Compute the ACL/FUID ids the new node will be created with. */
	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids, NULL)) != 0)
		goto out;

	/* Files and directories inherit the parent's project id. */
	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
		projid = zfs_inherit_projid(dzp);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve();

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	/* Large ACLs that don't fit in the SA spill into their own object. */
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
	if (error != 0) {
		/*
		 * Since, we failed to add the directory entry for it,
		 * delete the newly created dnode.
		 */
		zfs_znode_delete(zp, tx);
		VOP_UNLOCK(ZTOV(zp));
		zrele(zp);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
		getnewvnode_drop_reserve();
		goto out;
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Log the create for ZIL replay. */
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	VNCHECKREF(dvp);
	if (error == 0) {
		*zpp = zp;
	}

	/* sync=always: make the new file durable before returning. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
1202
1203 /*
1204 * Remove an entry from a directory.
1205 *
1206 * IN: dvp - vnode of directory to remove entry from.
1207 * name - name of entry to remove.
1208 * cr - credentials of caller.
1209 * ct - caller context
1210 * flags - case flags
1211 *
1212 * RETURN: 0 on success, error code on failure.
1213 *
1214 * Timestamps:
1215 * dvp - ctime|mtime
1216 * vp - ctime (if nlink > 0)
1217 */
static int
zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked;
	uint64_t	txtype;
	int		error;


	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zp = VTOZ(vp);
	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zilog = zfsvfs->z_log;

	xattr_obj = 0;
	xzp = NULL;

	/* Caller must have delete permission on the entry. */
	if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): "ct" is not declared in this function; it appears
	 * vnevent_remove() is a no-op macro on FreeBSD that discards its
	 * arguments -- confirm against sys/vnode.h.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	/* Also hold the xattr directory if the file has one. */
	if (xzp) {
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Entry removed but object still referenced: park it in the
		 * unlinked set; actual deletion happens on last vnode release.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}
	/* XXX check changes to linux vnops */
	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	/* sync=always: make the removal durable before returning. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
1332
1333
/*
 * Look up "name" in directory dzp on behalf of an internal caller
 * (zfs_remove(), zfs_rmdir()), returning a locked, referenced vnode
 * in *vpp on success.
 */
static int
zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
    struct componentname *cnp, int nameiop)
{
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	int		error;

	/*
	 * Build a minimal componentname so the regular lookup paths
	 * (name cache or zfs_lookup()) can be reused here.
	 */
	cnp->cn_nameptr = __DECONST(char *, name);
	cnp->cn_namelen = strlen(name);
	cnp->cn_nameiop = nameiop;
	cnp->cn_flags = ISLASTCN;
#if __FreeBSD_version < 1400068
	/* Older FreeBSD requires SAVENAME for DELETE/RENAME lookups. */
	cnp->cn_flags |= SAVENAME;
#endif
	cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
	cnp->cn_cred = kcred;
#if __FreeBSD_version < 1400037
	cnp->cn_thread = curthread;
#endif

	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
		/* Go through the name cache; it calls VOP_LOOKUP on miss. */
		struct vop_lookup_args a;

		a.a_gen.a_desc = &vop_lookup_desc;
		a.a_dvp = ZTOV(dzp);
		a.a_vpp = vpp;
		a.a_cnp = cnp;
		error = vfs_cache_lookup(&a);
	} else {
		/* Name cache disabled or ZIL replay: look up directly. */
		error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
		    B_FALSE);
	}
#ifdef ZFS_DEBUG
	if (error) {
		printf("got error %d on name %s on op %d\n", error, name,
		    nameiop);
		kdb_backtrace();
	}
#endif
	return (error);
}
1375
1376 int
zfs_remove(znode_t * dzp,const char * name,cred_t * cr,int flags)1377 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
1378 {
1379 vnode_t *vp;
1380 int error;
1381 struct componentname cn;
1382
1383 if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1384 return (error);
1385
1386 error = zfs_remove_(ZTOV(dzp), vp, name, cr);
1387 vput(vp);
1388 return (error);
1389 }
1390 /*
1391 * Create a new directory and insert it into dvp using the name
1392 * provided. Return a pointer to the inserted directory.
1393 *
1394 * IN: dvp - vnode of directory to add subdir to.
1395 * dirname - name of new directory.
1396 * vap - attributes of new directory.
1397 * cr - credentials of caller.
1398 * ct - caller context
1399 * flags - case flags
1400 * vsecp - ACL to be set
1401 * mnt_ns - Unused on FreeBSD
1402 *
1403 * OUT: vpp - vnode of created directory.
1404 *
1405 * RETURN: 0 on success, error code on failure.
1406 *
1407 * Timestamps:
1408 * dvp - ctime|mtime updated
1409 * vp - ctime|mtime|atime updated
1410 */
int
zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
    cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
	(void) flags, (void) vsecp;
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	uid_t		uid = crgetuid(cr);
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT3U(vap->va_type, ==, VDIR);

	/* Reject names longer than this dataset's ZAP name limit. */
	if (is_nametoolong(zfsvfs, dirname))
		return (SET_ERROR(ENAMETOOLONG));

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/* Subdirectories are not allowed inside xattr directories. */
	if (dzp->z_pflags & ZFS_XATTR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* With utf8only=on, the new name must be valid UTF-8. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	/* Optional attributes require a policy check before use. */
	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/* Compute the ACL/FUID ids the new directory will be created with. */
	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids, NULL)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*zpp = NULL;

	if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	ASSERT0P(zp);

	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
	    mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	getnewvnode_reserve();
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	/* Large ACLs that don't fit in the SA spill into their own object. */
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	/*
	 * Now put new name in parent dir.
	 */
	error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
	if (error != 0) {
		/* Undo the create; commit still required to release holds. */
		zfs_znode_delete(zp, tx);
		VOP_UNLOCK(ZTOV(zp));
		zrele(zp);
		goto out;
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	*zpp = zp;

	/* Log the mkdir for ZIL replay. */
	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

out:
	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	/* sync=always: make the new directory durable before returning. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
1564
1565 /*
1566 * Remove a directory subdir entry. If the current working
1567 * directory is the same as the subdir to be removed, the
1568 * remove will fail.
1569 *
1570 * IN: dvp - vnode of directory to remove from.
1571 * name - name of directory to be removed.
1572 * cwd - vnode of current working directory.
1573 * cr - credentials of caller.
1574 * ct - caller context
1575 * flags - case flags
1576 *
1577 * RETURN: 0 on success, error code on failure.
1578 *
1579 * Timestamps:
1580 * dvp - ctime|mtime updated
1581 */
static int
zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	if ((error = zfs_verify_zp(zp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zilog = zfsvfs->z_log;


	/* Caller must have delete permission on the entry. */
	if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
		goto out;
	}

	/* Only directories may be removed via rmdir. */
	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): "ct" is not declared in this function; it appears
	 * vnevent_rmdir() is a no-op macro on FreeBSD that discards its
	 * arguments -- confirm against sys/vnode.h.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	/* Hold the parent ZAP, both SA handles and the unlinked set. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/* Remove the directory entry (fails if dir is not empty). */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name,
		    ZFS_NO_OBJECT, B_FALSE);
	}

	dmu_tx_commit(tx);

	/* Purge stale name-cache entries for the removed directory. */
	if (zfsvfs->z_use_namecache)
		cache_vop_rmdir(dvp, vp);
out:
	/* sync=always: make the rmdir durable before returning. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
1645
1646 int
zfs_rmdir(znode_t * dzp,const char * name,znode_t * cwd,cred_t * cr,int flags)1647 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
1648 {
1649 struct componentname cn;
1650 vnode_t *vp;
1651 int error;
1652
1653 if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1654 return (error);
1655
1656 error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
1657 vput(vp);
1658 return (error);
1659 }
1660
1661 /*
1662 * Read as many directory entries as will fit into the provided
1663 * buffer from the given directory cursor position (specified in
1664 * the uio structure).
1665 *
1666 * IN: vp - vnode of directory to read.
1667 * uio - structure supplying read location, range info,
1668 * and return buffer.
1669 * cr - credentials of caller.
1670 * ct - caller context
1671 *
1672 * OUT: uio - updated offset and range, buffer filled.
1673 * eofp - set to true if end-of-file detected.
1674 * ncookies- number of entries in cookies
1675 * cookies - offsets to directory entries
1676 *
1677 * RETURN: 0 on success, error code on failure.
1678 *
1679 * Timestamps:
1680 * vp - atime updated
1681 *
1682 * Note that the low 4 bits of the cookie returned by zap is always zero.
1683 * This allows us to use the low range for "special" directory entries:
1684 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
1685 * we use the offset 2 for the '.zfs' directory.
1686 */
static int
zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
    int *ncookies, cookie_t **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	ssize_t		orig_resid;
	zap_cursor_t	zc;
	zap_attribute_t	*zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	uint8_t		type;
	int		ncooks;
	cookie_t	*cooks = NULL;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Needed for the objnum reported for the ".." entry. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = (zp->z_unlinked != 0)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = zfs_uio_offset(uio);
	orig_resid = zfs_uio_resid(uio);
	prefetch = zp->z_zn_prefetch;
	zap = zap_attribute_long_alloc();

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * Fast path: a single kernel-space iovec can be filled directly;
	 * otherwise stage entries in a bounce buffer and uiomove them out.
	 */
	iovp = GET_UIO_STRUCT(uio)->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
		    sizeof (((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap->za_name, ".");
			zap->za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap->za_name, "..");
			zap->za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
			zap->za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if ((error = zap_cursor_retrieve(&zc, zap))) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap->za_integer_length != 8 ||
			    zap->za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap->za_first_integer);
		}

		reclen = DIRENT64_RECLEN(strlen(zap->za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		/*
		 * Add normal entry:
		 */
		odp->d_ino = objnum;
		odp->d_reclen = reclen;
		odp->d_namlen = strlen(zap->za_name);
		/* NOTE: d_off is the offset for the *next* entry. */
		next = &odp->d_off;
		strlcpy(odp->d_name, zap->za_name, odp->d_namlen + 1);
		odp->d_type = type;
		dirent_terminate(odp);
		odp = (dirent64_t *)((intptr_t)odp + reclen);

		outcount += reclen;

		ASSERT3S(outcount, <=, bufsize);

		/* Warm the dnode cache for a likely upcoming stat(). */
		if (prefetch)
			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Fill the offset right after advancing the cursor. */
		if (next != NULL)
			*next = offset;
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
		/* Fast path: entries were written in place; just advance. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		zfs_uio_resid(uio) -= outcount;
	} else if ((error =
	    zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
		/*
		 * Reset the pointer.
		 */
		offset = zfs_uio_offset(uio);
	}

update:
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);
	if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
		kmem_free(outbuf, bufsize);

	/* ENOENT from the cursor is EOF, unless nothing was ever copied. */
	if (error == ENOENT)
		error = orig_resid == zfs_uio_resid(uio) ? EINVAL : 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	zfs_uio_setoffset(uio, offset);
	zfs_exit(zfsvfs, FTAG);
	/* On failure, release the cookie array handed to the caller. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
1940
1941 /*
1942 * Get the requested file attributes and place them in the provided
1943 * vattr structure.
1944 *
1945 * IN: vp - vnode of file.
1946 * vap - va_mask identifies requested attributes.
1947 * If AT_XVATTR set, then optional attrs are requested
1948 * flags - ATTR_NOACLCHECK (CIFS server context)
1949 * cr - credentials of caller.
1950 *
1951 * OUT: vap - attribute values.
1952 *
1953 * RETURN: 0 (always succeeds).
1954 */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];
	int count = 0;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Map FUIDs to POSIX ids for this caller's credentials. */
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch the timestamps (and rdev for devices) in one SA lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr, NULL))) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
	vn_fsid(vp, vap);
	vap->va_nodeid = zp->z_id;
	vap->va_nlink = zp->z_links;
	/* Count the hidden .zfs directory as a link on the root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
	    zp->z_links < ZFS_LINK_MAX)
		vap->va_nlink++;
	vap->va_size = zp->z_size;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
	else
		vap->va_rdev = NODEV;
	vap->va_gen = zp->z_gen;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
			xoap->xoa_projinherit =
			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
			xoap->xoa_projid = zp->z_projid;
			XVA_SET_RTN(xvap, XAT_PROJID);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
2152
2153 /*
2154 * For the operation of changing file's user/group/project, we need to
2155 * handle not only the main object that is assigned to the file directly,
2156 * but also the ones that are used by the file via hidden xattr directory.
2157 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them within the single transaction that
 * changes the main object's user/group/project attributes. We therefore
 * have to change them via multiple independent transactions, one by one.
 * This may not be a good solution, but we have no better idea yet.
2163 */
static int
zfs_setattr_dir(znode_t *dzp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	zap_cursor_t zc;
	zap_attribute_t *zap;
	znode_t *zp = NULL;
	dmu_tx_t *tx = NULL;
	uint64_t uid, gid;
	sa_bulk_attr_t bulk[4];
	int count;
	int err;

	/*
	 * Walk every entry in the xattr directory (a ZAP object) and bring
	 * each xattr object's uid/gid/projid in line with the parent dzp.
	 * Each entry is updated in its own transaction (see the comment
	 * above this function).
	 */
	zap = zap_attribute_alloc();
	zap_cursor_init(&zc, os, dzp->z_id);
	while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
		count = 0;
		/*
		 * A well-formed xattr dir entry is a single 8-byte integer
		 * (the xattr object number); anything else is corruption.
		 */
		if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
			err = ENXIO;
			break;
		}

		err = zfs_dirent_lookup(dzp, zap->za_name, &zp, ZEXISTS);
		if (err == ENOENT)
			goto next;	/* entry raced away; skip it */
		if (err)
			break;

		/* Nothing to do if this xattr already matches the parent. */
		if (zp->z_uid == dzp->z_uid &&
		    zp->z_gid == dzp->z_gid &&
		    zp->z_projid == dzp->z_projid)
			goto next;

		tx = dmu_tx_create(os);
		/*
		 * If the object has no project ID slot yet, the SA layout
		 * may need to grow (sa_add_projid below), so hold the SA
		 * with the "may change size" flag.
		 */
		if (!(zp->z_pflags & ZFS_PROJID))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);

		err = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (err)
			break;	/* tx aborted in the cleanup path below */

		mutex_enter(&dzp->z_lock);

		if (zp->z_uid != dzp->z_uid) {
			uid = dzp->z_uid;
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &uid, sizeof (uid));
			zp->z_uid = uid;
		}

		if (zp->z_gid != dzp->z_gid) {
			gid = dzp->z_gid;
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
			    &gid, sizeof (gid));
			zp->z_gid = gid;
		}

		uint64_t projid = dzp->z_projid;
		if (zp->z_projid != projid) {
			if (!(zp->z_pflags & ZFS_PROJID)) {
				/*
				 * Old-layout object: add the project ID SA.
				 * EEXIST means someone else already added
				 * it, which is fine; on success projid is
				 * consumed and set to INVALID so the bulk
				 * update below is skipped.
				 */
				err = sa_add_projid(zp->z_sa_hdl, tx, projid);
				if (unlikely(err == EEXIST)) {
					err = 0;
				} else if (err != 0) {
					goto sa_add_projid_err;
				} else {
					projid = ZFS_INVALID_PROJID;
				}
			}

			if (projid != ZFS_INVALID_PROJID) {
				zp->z_projid = projid;
				SA_ADD_BULK_ATTR(bulk, count,
				    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
				    sizeof (zp->z_projid));
			}
		}

sa_add_projid_err:
		mutex_exit(&dzp->z_lock);

		/*
		 * Commit if we queued bulk updates, or if sa_add_projid()
		 * already modified the object (projid was consumed above);
		 * otherwise nothing was dirtied and the tx can be aborted.
		 */
		if (likely(count > 0)) {
			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
			dmu_tx_commit(tx);
		} else if (projid == ZFS_INVALID_PROJID) {
			dmu_tx_commit(tx);
		} else {
			dmu_tx_abort(tx);
		}
		tx = NULL;
		if (err != 0 && err != ENOENT)
			break;

next:
		if (zp) {
			zrele(zp);
			zp = NULL;
		}
		zap_cursor_advance(&zc);
	}

	/* Clean up anything left over from an early break. */
	if (tx)
		dmu_tx_abort(tx);
	if (zp) {
		zrele(zp);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(zap);

	/* ENOENT here just means we iterated off the end; not an error. */
	return (err == ENOENT ? 0 : err);
}
2278
2279 /*
2280 * Set the file attributes to the values contained in the
2281 * vattr structure.
2282 *
2283 * IN: zp - znode of file to be modified.
2284 * vap - new attribute values.
2285 * If AT_XVATTR set, then optional attrs are being set
2286 * flags - ATTR_UTIME set if non-default time values provided.
2287 * - ATTR_NOACLCHECK (CIFS context only).
2288 * cr - credentials of caller.
2289 * mnt_ns - Unused on FreeBSD
2290 *
2291 * RETURN: 0 on success, error code on failure.
2292 *
2293 * Timestamps:
2294 * vp - ctime updated, mtime updated if size changed.
2295 */
int
zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	zilog_t	*zilog;
	dmu_tx_t	*tx;
	vattr_t	oldva;
	xvattr_t	tmpxvattr;
	uint_t	mask = vap->va_mask;
	uint_t	saved_mask = 0;
	uint64_t	saved_mode;
	int	trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_uid, new_gid;
	uint64_t	xattr_obj;
	uint64_t	mtime[2], ctime[2];
	uint64_t	projid = ZFS_INVALID_PROJID;
	znode_t	*attrzp;
	int	need_policy = FALSE;
	int	err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t	fuid_dirtied = B_FALSE;
	/* handle_eadir: propagate uid/gid/projid into the xattr dir too */
	boolean_t	handle_eadir = B_FALSE;
	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
	int	count = 0, xattr_count = 0;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (SET_ERROR(EINVAL));

	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (err);

	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
	 */

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EOVERFLOW));
		}
	}
	if (xoap != NULL && (mask & AT_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
		    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EOVERFLOW));
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
			/* Project quotas only apply to files and dirs. */
			if (!dmu_objset_projectquota_enabled(os) ||
			    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
				zfs_exit(zfsvfs, FTAG);
				return (SET_ERROR(EOPNOTSUPP));
			}

			projid = xoap->xoa_projid;
			if (unlikely(projid == ZFS_INVALID_PROJID)) {
				zfs_exit(zfsvfs, FTAG);
				return (SET_ERROR(EINVAL));
			}

			/*
			 * projid == ZFS_INVALID_PROJID from here on means
			 * "no project ID change needed".
			 */
			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
				projid = ZFS_INVALID_PROJID;
			else
				need_policy = TRUE;
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
		    (xoap->xoa_projinherit !=
		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
		    (!dmu_objset_projectquota_enabled(os) ||
		    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EOPNOTSUPP));
		}
	}

	attrzp = NULL;
	aclp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			zfs_exit(zfsvfs, FTAG);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr, mnt_ns);
	}

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = zp->z_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr, mnt_ns) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy =  TRUE;
			}
		} else {
			need_policy =  TRUE;
		}
	}

	oldva.va_mode = zp->z_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * the bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
			if (xoap->xoa_projinherit !=
			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
				XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			zfs_exit(zfsvfs, FTAG);
			return (SET_ERROR(EPERM));
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
		    mnt_ns) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				zfs_exit(zfsvfs, FTAG);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			zfs_exit(zfsvfs, FTAG);
			return (err);
		}

		if (trim_mask) {
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
		handle_eadir = B_TRUE;
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj, sizeof (xattr_obj));

		if (err == 0 && xattr_obj) {
			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
			if (err == 0) {
				/*
				 * Lock the xattr dir vnode exclusively; on
				 * failure drop the zget reference before
				 * bailing.
				 */
				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
				if (err != 0)
					vrele(ZTOV(attrzp));
			}
			if (err)
				goto out2;
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			if (new_uid != zp->z_uid &&
			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
			    new_uid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			if (new_gid != zp->z_gid &&
			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
			    new_gid)) {
				if (attrzp)
					vput(ZTOV(attrzp));
				err = SET_ERROR(EDQUOT);
				goto out2;
			}
		}

		if (projid != ZFS_INVALID_PROJID &&
		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
			if (attrzp)
				vput(ZTOV(attrzp));
			err = SET_ERROR(EDQUOT);
			goto out2;
		}
	}
	tx = dmu_tx_create(os);

	if (mask & AT_MODE) {
		uint64_t pmode = zp->z_mode;
		uint64_t acl_obj;
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
			err = SET_ERROR(EPERM);
			goto out;
		}

		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
			goto out;

		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
			/*
			 * Are we upgrading ACL from old V0 format
			 * to V1 format?
			 */
			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
			    zfs_znode_acl_version(zp) ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx, acl_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx, acl_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	} else {
		/*
		 * Adding a scanstamp or a project ID slot may grow the SA
		 * layout, so hold with the "may change size" flag in those
		 * cases.
		 */
		if (((mask & AT_XVATTR) &&
		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
		    (projid != ZFS_INVALID_PROJID &&
		    !(zp->z_pflags & ZFS_PROJID)))
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		else
			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	}

	if (attrzp) {
		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
	}

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);

	zfs_sa_upgrade_txholds(tx, zp);

	err = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (err)
		goto out;

	count = 0;
	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
		/*
		 * For the existed object that is upgraded from old system,
		 * its on-disk layout has no slot for the project ID attribute.
		 * But quota accounting logic needs to access related slots by
		 * offset directly. So we need to adjust old objects' layout
		 * to make the project ID to some unified and fixed offset.
		 */
		if (attrzp)
			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
		if (err == 0)
			err = sa_add_projid(zp->z_sa_hdl, tx, projid);

		if (unlikely(err == EEXIST))
			err = 0;
		else if (err != 0)
			goto out;
		else
			/* projid consumed; skip the bulk PROJID update. */
			projid = ZFS_INVALID_PROJID;
	}

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_enter(&zp->z_acl_lock);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_enter(&attrzp->z_acl_lock);
		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
		    sizeof (attrzp->z_pflags));
		if (projid != ZFS_INVALID_PROJID) {
			attrzp->z_projid = projid;
			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
			    sizeof (attrzp->z_projid));
		}
	}

	if (mask & (AT_UID|AT_GID)) {

		if (mask & AT_UID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
			    &new_uid, sizeof (new_uid));
			zp->z_uid = new_uid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
				    sizeof (new_uid));
				attrzp->z_uid = new_uid;
			}
		}

		if (mask & AT_GID) {
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
			    NULL, &new_gid, sizeof (new_gid));
			zp->z_gid = new_gid;
			if (attrzp) {
				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
				    sizeof (new_gid));
				attrzp->z_gid = new_gid;
			}
		}
		if (!(mask & AT_MODE)) {
			/*
			 * SA_ADD_BULK_ATTR only records the pointer; new_mode
			 * is assigned afterwards and is read later by
			 * sa_bulk_update(), so the mode is re-written
			 * unchanged on a pure chown/chgrp.
			 */
			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
			    NULL, &new_mode, sizeof (new_mode));
			new_mode = zp->z_mode;
		}
		err = zfs_acl_chown_setattr(zp);
		ASSERT0(err);
		if (attrzp) {
			vn_seqc_write_begin(ZTOV(attrzp));
			err = zfs_acl_chown_setattr(attrzp);
			vn_seqc_write_end(ZTOV(attrzp));
			ASSERT0(err);
		}
	}

	if (mask & AT_MODE) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
		    &new_mode, sizeof (new_mode));
		zp->z_mode = new_mode;
		ASSERT3P(aclp, !=, NULL);
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT0(err);
		if (zp->z_acl_cached)
			zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = aclp;
		aclp = NULL;	/* ownership transferred to the znode cache */
	}


	if (mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
		    &zp->z_atime, sizeof (zp->z_atime));
	}

	if (mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    mtime, sizeof (mtime));
	}

	if (projid != ZFS_INVALID_PROJID) {
		zp->z_projid = projid;
		SA_ADD_BULK_ATTR(bulk, count,
		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
		    sizeof (zp->z_projid));
	}

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
		    NULL, mtime, sizeof (mtime));
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	} else if (mask != 0) {
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, sizeof (ctime));
		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
		if (attrzp) {
			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
			    SA_ZPL_CTIME(zfsvfs), NULL,
			    &ctime, sizeof (ctime));
			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
			    mtime, ctime);
		}
	}

	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
			xoap->xoa_createtime = vap->va_birthtime;
		/*
		 * restore trimmed off masks
		 * so that return masks can be set for caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
			ASSERT3S(vp->v_type, ==, VREG);

		zfs_xvattr_set(zp, xvap, tx);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (mask & (AT_UID|AT_GID|AT_MODE))
		mutex_exit(&zp->z_acl_lock);

	if (attrzp) {
		if (mask & (AT_UID|AT_GID|AT_MODE))
			mutex_exit(&attrzp->z_acl_lock);
	}
out:
	/*
	 * Cleanup: flush the xattr dir's bulk updates (if any), release
	 * attrzp, free a not-yet-cached ACL, then commit or abort the tx
	 * depending on err.
	 */
	if (err == 0 && attrzp) {
		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
		    xattr_count, tx);
		ASSERT0(err2);
	}

	if (attrzp)
		vput(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	if (err) {
		dmu_tx_abort(tx);
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
		if (attrzp) {
			if (err2 == 0 && handle_eadir)
				/* Propagate uid/gid/projid to each xattr. */
				err = zfs_setattr_dir(attrzp);
		}
	}

out2:
	if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
		err = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (err);
}
3019
3020 /*
3021 * Look up the directory entries corresponding to the source and target
3022 * directory/name pairs.
3023 */
static int
zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
    znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
    znode_t **tzpp)
{
	zfsvfs_t *zfsvfs;
	znode_t *szp, *tzp;
	int error;

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
		return (error);
	if ((error = zfs_verify_zp(tdzp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		/* Renaming "." or ".." is always invalid, not ENOENT. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	*szpp = szp;

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0);
	if (error != 0) {
		/* Drop the source reference taken above before bailing. */
		vrele(ZTOV(szp));
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	*tzpp = tzp;
out:
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
3082
3083 /*
3084 * We acquire all but fdvp locks using non-blocking acquisitions. If we
3085 * fail to acquire any lock in the path we will drop all held locks,
3086 * acquire the new lock in a blocking fashion, and then release it and
3087 * restart the rename. This acquire/release step ensures that we do not
3088 * spin on a lock waiting for release. On error release all vnode locks
3089 * and decrement references the way tmpfs_rename() would do.
3090 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	int		error;

	/*
	 * Caller holds tdvp (and possibly *tvpp) locked; start from a clean
	 * slate so we can take all locks in a deadlock-free order below.
	 */
	VOP_UNLOCK(tdvp);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp);

relock:
	/* sdvp first, blocking; every other lock is tried non-blocking. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		if (error != EBUSY)
			goto out;
		/*
		 * EBUSY: acquire tdvp blocking, then immediately release it
		 * and restart — this waits out the holder without spinning,
		 * and the restart re-establishes the lock order.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp);
		goto relock;
	}
	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);

	/* Re-resolve source and target entries under the directory locks. */
	error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		VOP_UNLOCK(tdvp);
		goto out;
	}
	svp = ZTOV(szp);
	tvp = tzp != NULL ? ZTOV(tzp) : NULL;

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		VOP_UNLOCK(tdvp);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Same wait-then-restart dance as for tdvp above. */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Swap the caller's source vnode ref for the re-resolved one. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp);
			VOP_UNLOCK(tdvp);
			VOP_UNLOCK(*svpp);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			/* vput: drop both the lock and the reference. */
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	/* Success: sdvp, tdvp, *svpp and (if any) *tvpp are all locked. */
	return (0);

out:
	return (error);
}
3197
3198 /*
3199 * Note that we must use VRELE_ASYNC in this function as it walks
3200 * up the directory tree and vrele may need to acquire an exclusive
3201 * lock if a last reference to a vnode is dropped.
3202 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	/*
	 * Reject a rename that would move a directory szp under one of its
	 * own descendants: walk from tdzp up to the root (or sdzp) and fail
	 * with EINVAL if szp is found on the way.
	 */
	zfsvfs = tdzp->z_zfsvfs;
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	zp = tdzp;
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			/* szp is an ancestor of tdzp: illegal rename. */
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		/*
		 * Release the previous step asynchronously (see the comment
		 * above this function); tdzp belongs to the caller.
		 */
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}
3252
3253 static int
3254 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3255 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3256 cred_t *cr);
3257
3258 /*
3259 * Move an entry from the provided source directory to the target
3260 * directory. Change the entry name as indicated.
3261 *
3262 * IN: sdvp - Source directory containing the "old entry".
3263 * scnp - Old entry name.
3264 * tdvp - Target directory to contain the "new entry".
3265 * tcnp - New entry name.
3266 * cr - credentials of caller.
3267 * INOUT: svpp - Source file
3268 * tvpp - Target file, may point to NULL initially
3269 *
3270 * RETURN: 0 on success, error code on failure.
3271 *
3272 * Timestamps:
3273 * sdvp,tdvp - ctime|mtime updated
3274 */
static int
zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	int	error;

	ASSERT_VOP_ELOCKED(tdvp, __func__);
	if (*tvpp != NULL)
		ASSERT_VOP_ELOCKED(*tvpp, __func__);

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/* The .zfs control tree cannot be a rename target. */
	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr);
	/*
	 * After relock all four vnodes are locked; drop the source pair
	 * here, and the target pair at out: (shared with the early-error
	 * path, where only the target pair is held).
	 */
	VOP_UNLOCK(sdvp);
	VOP_UNLOCK(*svpp);
out:
	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp);

	return (error);
}
3318
/*
 * Second half of rename: all four vnodes are locked on entry (see
 * zfs_do_rename()) and remain locked on return; the caller unlocks them.
 */
static int
zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs;
	zilog_t *zilog;
	znode_t *tdzp, *sdzp, *tzp, *szp;
	const char *snm = scnp->cn_nameptr;
	const char *tnm = tcnp->cn_nameptr;
	int error;

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;

	/* Guard against teardown/unmount for both directory znodes. */
	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	if ((error = zfs_verify_zp(sdzp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zilog = zfsvfs->z_log;

	/* Reject a target name that is not valid UTF-8 on a UTF-8-only fs. */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto out;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto out;
	}

	/* Refuse to move a mount point, or to rename over one. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	szp = VTOZ(*svpp);
	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (tzp != NULL) {
		if ((error = zfs_verify_zp(tzp)) != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * If we are using project inheritance, means if the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
	 * such case, we only allow renames into our tree when the project
	 * IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
		goto out;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			/* NOTE(review): plain EINVAL, not SET_ERROR(). */
			error = EINVAL;
			goto out;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if ((error = zfs_rename_check(szp, sdzp, tdzp)))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto out;
			} else {
				/* Directory over directory: purge caches. */
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto out;
			}
		}
	}

	/*
	 * Open a seqc write section on every vnode we are about to
	 * modify; each is paired with vn_seqc_write_end() at out_seq.
	 */
	vn_seqc_write_begin(*svpp);
	vn_seqc_write_begin(sdvp);
	if (*tvpp != NULL)
		vn_seqc_write_begin(*tvpp);
	if (tdvp != *tvpp)
		vn_seqc_write_begin(tdvp);

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Hold everything the rename may touch: both directory ZAPs, the
	 * SAs of all involved znodes, and the unlinked set (in case the
	 * displaced target drops its last link).
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out_seq;
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL));
			}
		}
		if (error == 0) {
			cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
		}
	}

	dmu_tx_commit(tx);

out_seq:
	vn_seqc_write_end(*svpp);
	vn_seqc_write_end(sdvp);
	if (*tvpp != NULL)
		vn_seqc_write_end(*tvpp);
	if (tdvp != *tvpp)
		vn_seqc_write_end(tdvp);

out:
	/* Honor sync=always by committing the rename record immediately. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}
3548
/*
 * Rename directory entry sname in sdzp to entry tname in tdzp.
 *
 * IN:	sdzp	- Source directory znode.
 *	sname	- Source entry name.
 *	tdzp	- Target directory znode.
 *	tname	- Target entry name.
 *	cr	- Credentials of caller.
 *	flags	- Unused here.
 *	rflags	- Must be 0; RENAME_* extensions are not supported.
 *	wo_vap	- Must be NULL; whiteouts are not supported.
 *	mnt_ns	- Unused on FreeBSD.
 *
 * RETURN: 0 on success, error code on failure.
 */
int
zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
    cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
{
	struct componentname scn, tcn;
	vnode_t *sdvp, *tdvp;
	vnode_t *svp, *tvp;
	int error;
	svp = tvp = NULL;

	if (is_nametoolong(tdzp->z_zfsvfs, tname))
		return (SET_ERROR(ENAMETOOLONG));

	if (rflags != 0 || wo_vap != NULL)
		return (SET_ERROR(EINVAL));

	sdvp = ZTOV(sdzp);
	tdvp = ZTOV(tdzp);
	/* Look up the source entry; on success svp is held and locked. */
	error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
	/*
	 * NOTE(review): sdvp is only unlocked outside of ZIL replay —
	 * presumably the lookup does not lock it during replay; confirm
	 * against zfs_lookup_internal().
	 */
	if (sdzp->z_zfsvfs->z_replay == B_FALSE)
		VOP_UNLOCK(sdvp);
	if (error != 0)
		goto fail;
	VOP_UNLOCK(svp);

	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
	error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
	/* EJUSTRETURN means the target name does not exist yet. */
	if (error == EJUSTRETURN)
		tvp = NULL;
	else if (error != 0) {
		VOP_UNLOCK(tdvp);
		goto fail;
	}

	error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr);
fail:
	/* Drop the references obtained by the lookups above. */
	if (svp != NULL)
		vrele(svp);
	if (tvp != NULL)
		vrele(tvp);

	return (error);
}
3592
3593 /*
3594 * Insert the indicated symbolic reference entry into the directory.
3595 *
 * IN:	dzp	- Directory to contain new symbolic link.
 *	name	- Name of the directory entry for the new symlink.
 *	vap	- Attributes of new entry.
 *	link	- Path the symlink will point to.
 *	cr	- credentials of caller.
 *	flags	- case flags (unused on FreeBSD)
 *	mnt_ns	- Unused on FreeBSD
 *
 * OUT:	zpp	- znode of the created symlink.
3603 *
3604 * RETURN: 0 on success, error code on failure.
3605 *
3606 * Timestamps:
3607 * dvp - ctime|mtime updated
3608 */
int
zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
    const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
{
	(void) flags;
	znode_t *zp;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zilog_t *zilog;
	uint64_t len = strlen(link);
	int error;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;
	uint64_t txtype = TX_SYMLINK;

	ASSERT3S(vap->va_type, ==, VLNK);

	if (is_nametoolong(zfsvfs, name))
		return (SET_ERROR(ENAMETOOLONG));

	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/* Entry name must be valid UTF-8 on a UTF-8-only filesystem. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	/* The link target itself is bounded by MAXPATHLEN. */
	if (len > MAXPATHLEN) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids, NULL)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/* Reserve a vnode before entering the transaction. */
	getnewvnode_reserve();
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attribute or as data. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    __DECONST(void *, link), len, tx);
	else
		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
	if (error != 0) {
		/* Undo the create: destroy the znode and drop the vnode. */
		zfs_znode_delete(zp, tx);
		VOP_UNLOCK(ZTOV(zp));
		zrele(zp);
	} else {
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	}

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (error == 0) {
		*zpp = zp;

		/* Honor sync=always by committing immediately. */
		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
			error = zil_commit(zilog, 0);
	}

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
3741
3742 /*
3743 * Return, in the buffer contained in the provided uio structure,
3744 * the symbolic path referred to by vp.
3745 *
3746 * IN: vp - vnode of symbolic link.
3747 * uio - structure to contain the link path.
3748 * cr - credentials of caller.
3749 * ct - caller context
3750 *
3751 * OUT: uio - structure containing the link path.
3752 *
3753 * RETURN: 0 on success, error code on failure.
3754 *
3755 * Timestamps:
3756 * vp - atime updated
3757 */
3758 static int
zfs_readlink(vnode_t * vp,zfs_uio_t * uio,cred_t * cr,caller_context_t * ct)3759 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
3760 {
3761 (void) cr, (void) ct;
3762 znode_t *zp = VTOZ(vp);
3763 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3764 int error;
3765
3766 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3767 return (error);
3768
3769 if (zp->z_is_sa)
3770 error = sa_lookup_uio(zp->z_sa_hdl,
3771 SA_ZPL_SYMLINK(zfsvfs), uio);
3772 else
3773 error = zfs_sa_readlink(zp, uio);
3774
3775 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3776
3777 zfs_exit(zfsvfs, FTAG);
3778 return (error);
3779 }
3780
3781 /*
3782 * Insert a new entry into directory tdvp referencing svp.
3783 *
 * IN:	tdzp	- Directory to contain new entry.
 *	szp	- znode of the existing file to link to.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
3788 *
3789 * RETURN: 0 on success, error code on failure.
3790 *
3791 * Timestamps:
3792 * tdvp - ctime|mtime updated
3793 * svp - ctime updated
3794 */
int
zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
    int flags)
{
	(void) flags;
	znode_t *tzp;
	zfsvfs_t *zfsvfs = tdzp->z_zfsvfs;
	zilog_t *zilog;
	dmu_tx_t *tx;
	int error;
	uint64_t parent;
	uid_t owner;

	ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);

	if (is_nametoolong(zfsvfs, name))
		return (SET_ERROR(ENAMETOOLONG));

	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (ZTOV(szp)->v_type == VDIR) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_verify_zp(szp)) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * If we are using project inheritance, means if the directory has
	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
	 * such case, we only allow hard link creation in our tree when the
	 * project IDs are the same.
	 */
	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
	    tdzp->z_projid != szp->z_projid) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* Immutable, append-only, and read-only files cannot be linked. */
	if (szp->z_pflags & (ZFS_APPENDONLY |
	    ZFS_IMMUTABLE | ZFS_READONLY)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/* Entry name must be valid UTF-8 on a UTF-8-only filesystem. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}


	/* Only the file's owner (or privileged callers) may link to it. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	/* Hold the source SA, the directory ZAP, and possible SA upgrades. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, tdzp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	error = zfs_link_create(tdzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(ZTOV(szp), ct);
	}

	/* Honor sync=always by committing the link record immediately. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
3931
3932 /*
3933 * Free or allocate space in a file. Currently, this function only
3934 * supports the `F_FREESP' command. However, this command is somewhat
3935 * misnamed, as its functionality includes the ability to allocate as
3936 * well as free space.
3937 *
3938 * IN: ip - inode of file to free data in.
3939 * cmd - action to take (only F_FREESP supported).
3940 * bfp - section of file to free/alloc.
3941 * flag - current file open mode flags.
3942 * offset - current file offset.
3943 * cr - credentials of caller.
3944 *
3945 * RETURN: 0 on success, error code on failure.
3946 *
3947 * Timestamps:
3948 * ip - ctime|mtime updated
3949 */
int
zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
    offset_t offset, cred_t *cr)
{
	(void) offset;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t off, len;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/* Only the F_FREESP command is implemented. */
	if (cmd != F_FREESP) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/* A negative length is never valid. */
	if (bfp->l_len < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Permissions aren't checked on Solaris because on this OS
	 * zfs_space() can only be called with an opened file handle.
	 * On Linux we can get here through truncate_range() which
	 * operates directly on inodes, so we need to check access rights.
	 */
	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	off = bfp->l_start;
	len = bfp->l_len; /* 0 means from off to end of file */

	error = zfs_freesp(zp, off, len, flag, TRUE);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
4000
/*
 * Vnode inactivation: flush a dirty atime if needed, or recycle the
 * vnode outright when the znode is gone or the file was unlinked.
 */
static void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	(void) cr, (void) ct;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
		vrecycle(vp);
		return;
	}

	/* Persist a dirty atime before the vnode goes inactive. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
}
4046
4047
/*
 * ZFS file IDs are returned through the generic struct fid; make sure
 * both the short and long forms actually fit inside it.
 */
_Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
    "struct zfid_short bigger than struct fid");
_Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
    "struct zfid_long bigger than struct fid");
4052
/*
 * Build an NFS-style file identifier for vp: object number plus
 * generation, extended with the objset id when this filesystem is not
 * its own parent (z_parent != zfsvfs — presumably a snapshot; confirm).
 */
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	(void) ct;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint32_t gen;
	uint64_t gen64;
	uint64_t object = zp->z_id;
	zfid_short_t *zfid;
	int size, i, error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	gen = (uint32_t)gen64;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
	fidp->fid_len = size;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Serialize the object number least-significant byte first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t *zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
4109
4110 static int
zfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)4111 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4112 caller_context_t *ct)
4113 {
4114 znode_t *zp;
4115 zfsvfs_t *zfsvfs;
4116 int error;
4117
4118 switch (cmd) {
4119 case _PC_LINK_MAX:
4120 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
4121 return (0);
4122
4123 case _PC_FILESIZEBITS:
4124 *valp = 64;
4125 return (0);
4126 case _PC_MIN_HOLE_SIZE:
4127 *valp = (int)SPA_MINBLOCKSIZE;
4128 return (0);
4129 case _PC_ACL_EXTENDED:
4130 #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
4131 zp = VTOZ(vp);
4132 zfsvfs = zp->z_zfsvfs;
4133 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4134 return (error);
4135 *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
4136 zfs_exit(zfsvfs, FTAG);
4137 #else
4138 *valp = 0;
4139 #endif
4140 return (0);
4141
4142 case _PC_ACL_NFS4:
4143 zp = VTOZ(vp);
4144 zfsvfs = zp->z_zfsvfs;
4145 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4146 return (error);
4147 *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
4148 zfs_exit(zfsvfs, FTAG);
4149 return (0);
4150
4151 case _PC_ACL_PATH_MAX:
4152 *valp = ACL_MAX_ENTRIES;
4153 return (0);
4154
4155 default:
4156 return (EOPNOTSUPP);
4157 }
4158 }
4159
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zfs_locked_range_t *lr;
	vm_object_t object;
	off_t start, end, obj_size;
	uint_t blksz;
	int pgsin_b, pgsin_a;
	int error;

	if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
		return (zfs_vm_pagerret_error);

	object = ma[0]->object;
	start = IDX_TO_OFF(ma[0]->pindex);
	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

	/*
	 * Lock a range covering all required and optional pages.
	 * Note that we need to handle the case of the block size growing.
	 */
	for (;;) {
		uint64_t len;

		blksz = zp->z_blksz;
		len = roundup(end, blksz) - rounddown(start, blksz);

		lr = zfs_rangelock_tryenter(&zp->z_rangelock,
		    rounddown(start, blksz), len, RL_READER);
		if (lr == NULL) {
			/*
			 * Avoid a deadlock with update_pages().  We need to
			 * hold the range lock when copying from the DMU, so
			 * give up the busy lock to allow update_pages() to
			 * proceed.  We might need to allocate new pages, which
			 * isn't quite right since this allocation isn't subject
			 * to the page fault handler's OOM logic, but this is
			 * the best we can do for now.
			 */
			for (int i = 0; i < count; i++)
				vm_page_xunbusy(ma[i]);

			lr = zfs_rangelock_enter(&zp->z_rangelock,
			    rounddown(start, blksz), len, RL_READER);

			/* Re-acquire (and re-busy) the pages we gave up. */
			zfs_vmobject_wlock(object);
			(void) vm_page_grab_pages(object, OFF_TO_IDX(start),
			    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
			    ma, count);
			zfs_vmobject_wunlock(object);
		}
		/* Retry with the new block size if it changed meanwhile. */
		if (blksz == zp->z_blksz)
			break;
		zfs_rangelock_exit(lr);
	}

	zfs_vmobject_wlock(object);
	obj_size = object->un_pager.vnp.vnp_size;
	zfs_vmobject_wunlock(object);
	/* Requests entirely past EOF are invalid. */
	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (zfs_vm_pagerret_bad);
	}

	/* Clamp read-behind to the start of the first block. */
	pgsin_b = 0;
	if (rbehind != NULL) {
		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
		pgsin_b = MIN(*rbehind, pgsin_b);
	}

	/* Clamp read-ahead to the end of the last block and to EOF. */
	pgsin_a = 0;
	if (rahead != NULL) {
		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
		pgsin_a = MIN(*rahead, pgsin_a);
	}

	/*
	 * NB: we need to pass the exact byte size of the data that we expect
	 * to read after accounting for the file size.  This is required because
	 * ZFS will panic if we request DMU to read beyond the end of the last
	 * allocated block.
	 */
	for (int i = 0; i < count; i++) {
		int dummypgsin, count1, j, last_size;

		/* Skip runs of pages that are already valid. */
		if (vm_page_any_valid(ma[i])) {
			ASSERT(vm_page_all_valid(ma[i]));
			continue;
		}
		for (j = i + 1; j < count; j++) {
			if (vm_page_any_valid(ma[j])) {
				ASSERT(vm_page_all_valid(ma[j]));
				break;
			}
		}
		count1 = j - i;
		dummypgsin = 0;
		/* Only the final page of the request may be partial. */
		last_size = j == count ?
		    MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE;
		error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1,
		    i == 0 ? &pgsin_b : &dummypgsin,
		    j == count ? &pgsin_a : &dummypgsin,
		    last_size);
		if (error != 0)
			break;
		i += count1 - 1;
	}

	zfs_rangelock_exit(lr);
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);

	zfs_exit(zfsvfs, FTAG);

	if (error != 0)
		return (zfs_vm_pagerret_error);

	VM_CNT_INC(v_vnodein);
	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
	if (rbehind != NULL)
		*rbehind = pgsin_b;
	if (rahead != NULL)
		*rahead = pgsin_a;
	return (zfs_vm_pagerret_ok);
}
4292
#ifndef _SYS_SYSPROTO_H_
/* Fallback declaration of the VOP_GETPAGES argument structure. */
struct vop_getpages_args {
	struct vnode *a_vp;
	vm_page_t *a_m;
	int a_count;
	int *a_rbehind;
	int *a_rahead;
};
#endif

/* VOP_GETPAGES entry point: unpack the argument struct for zfs_getpages(). */
static int
zfs_freebsd_getpages(struct vop_getpages_args *ap)
{

	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead));
}
4310
/*
 * State handed from zfs_putpages() to zfs_putpage_commit_cb(): the set of
 * busied pages awaiting a ZIL commit.  Freed by the callback.
 */
typedef struct {
	uint_t pca_npages;	/* number of entries in pca_pages[] */
	vm_page_t pca_pages[];	/* pages to undirty/unbusy on commit */
} putpage_commit_arg_t;
4315
4316 static void
zfs_putpage_commit_cb(void * arg,int err)4317 zfs_putpage_commit_cb(void *arg, int err)
4318 {
4319 putpage_commit_arg_t *pca = arg;
4320 vm_object_t object = pca->pca_pages[0]->object;
4321
4322 zfs_vmobject_wlock(object);
4323
4324 for (uint_t i = 0; i < pca->pca_npages; i++) {
4325 vm_page_t pp = pca->pca_pages[i];
4326
4327 if (err == 0) {
4328 /*
4329 * Writeback succeeded, so undirty the page. If it
4330 * fails, we leave it in the same state it was. That's
4331 * most likely dirty, so it will get tried again some
4332 * other time.
4333 */
4334 vm_page_undirty(pp);
4335 }
4336
4337 vm_page_sunbusy(pp);
4338 }
4339
4340 vm_object_pip_wakeupn(object, pca->pca_npages);
4341
4342 zfs_vmobject_wunlock(object);
4343
4344 kmem_free(pca,
4345 offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
4346 }
4347
4348 static int
zfs_putpages(struct vnode * vp,vm_page_t * ma,size_t len,int flags,int * rtvals)4349 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4350 int *rtvals)
4351 {
4352 znode_t *zp = VTOZ(vp);
4353 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4354 zfs_locked_range_t *lr;
4355 dmu_tx_t *tx;
4356 struct sf_buf *sf;
4357 vm_object_t object;
4358 vm_page_t m;
4359 caddr_t va;
4360 size_t tocopy;
4361 size_t lo_len;
4362 vm_ooffset_t lo_off;
4363 vm_ooffset_t off;
4364 uint_t blksz;
4365 int ncount;
4366 int pcount;
4367 int err;
4368 int i;
4369
4370 object = vp->v_object;
4371 KASSERT(ma[0]->object == object, ("mismatching object"));
4372 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4373
4374 pcount = btoc(len);
4375 ncount = pcount;
4376 for (i = 0; i < pcount; i++)
4377 rtvals[i] = zfs_vm_pagerret_error;
4378
4379 if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
4380 return (zfs_vm_pagerret_error);
4381
4382 off = IDX_TO_OFF(ma[0]->pindex);
4383 blksz = zp->z_blksz;
4384 lo_off = rounddown(off, blksz);
4385 lo_len = roundup(len + (off - lo_off), blksz);
4386 lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
4387
4388 zfs_vmobject_wlock(object);
4389 if (len + off > object->un_pager.vnp.vnp_size) {
4390 if (object->un_pager.vnp.vnp_size > off) {
4391 int pgoff;
4392
4393 len = object->un_pager.vnp.vnp_size - off;
4394 ncount = btoc(len);
4395 if ((pgoff = (int)len & PAGE_MASK) != 0) {
4396 /*
4397 * If the object is locked and the following
4398 * conditions hold, then the page's dirty
4399 * field cannot be concurrently changed by a
4400 * pmap operation.
4401 */
4402 m = ma[ncount - 1];
4403 vm_page_assert_sbusied(m);
4404 KASSERT(!pmap_page_is_write_mapped(m),
4405 ("zfs_putpages: page %p is not read-only",
4406 m));
4407 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4408 pgoff);
4409 }
4410 } else {
4411 len = 0;
4412 ncount = 0;
4413 }
4414 if (ncount < pcount) {
4415 for (i = ncount; i < pcount; i++) {
4416 rtvals[i] = zfs_vm_pagerret_bad;
4417 }
4418 }
4419 }
4420 zfs_vmobject_wunlock(object);
4421
4422 boolean_t commit = (flags & (zfs_vm_pagerput_sync |
4423 zfs_vm_pagerput_inval)) != 0 ||
4424 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;
4425
4426 if (ncount == 0)
4427 goto out;
4428
4429 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
4430 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
4431 (zp->z_projid != ZFS_DEFAULT_PROJID &&
4432 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4433 zp->z_projid))) {
4434 goto out;
4435 }
4436
4437 tx = dmu_tx_create(zfsvfs->z_os);
4438 dmu_tx_hold_write(tx, zp->z_id, off, len);
4439
4440 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4441 zfs_sa_upgrade_txholds(tx, zp);
4442 err = dmu_tx_assign(tx, DMU_TX_WAIT);
4443 if (err != 0) {
4444 dmu_tx_abort(tx);
4445 goto out;
4446 }
4447
4448 if (zp->z_blksz < PAGE_SIZE) {
4449 vm_ooffset_t woff = off;
4450 size_t wlen = len;
4451 for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
4452 tocopy = MIN(PAGE_SIZE, wlen);
4453 va = zfs_map_page(ma[i], &sf);
4454 dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx);
4455 zfs_unmap_page(sf);
4456 }
4457 } else {
4458 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4459 }
4460
4461 if (err == 0) {
4462 uint64_t mtime[2], ctime[2];
4463 sa_bulk_attr_t bulk[3];
4464 int count = 0;
4465
4466 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4467 &mtime, 16);
4468 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4469 &ctime, 16);
4470 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4471 &zp->z_pflags, 8);
4472 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
4473 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4474 ASSERT0(err);
4475
4476 if (commit) {
4477 /*
4478 * Caller requested that we commit immediately. We set
4479 * a callback on the log entry, to be called once its
4480 * on disk after the call to zil_commit() below. The
4481 * pages will be undirtied and unbusied there.
4482 */
4483 putpage_commit_arg_t *pca = kmem_alloc(
4484 offsetof(putpage_commit_arg_t, pca_pages[ncount]),
4485 KM_SLEEP);
4486 pca->pca_npages = ncount;
4487 memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
4488
4489 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
4490 B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
4491
4492 for (i = 0; i < ncount; i++)
4493 rtvals[i] = zfs_vm_pagerret_pend;
4494 } else {
4495 /*
4496 * Caller just wants the page written back somewhere,
4497 * but doesn't need it committed yet. We've already
4498 * written it back to the DMU, so we just need to put
4499 * it on the async log, then undirty the page and
4500 * return.
4501 *
4502 * We cannot use a callback here, because it would keep
4503 * the page busy (locked) until it is eventually
4504 * written down at txg sync.
4505 */
4506 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
4507 B_FALSE, B_FALSE, NULL, NULL);
4508
4509 zfs_vmobject_wlock(object);
4510 for (i = 0; i < ncount; i++) {
4511 rtvals[i] = zfs_vm_pagerret_ok;
4512 vm_page_undirty(ma[i]);
4513 }
4514 zfs_vmobject_wunlock(object);
4515 }
4516
4517 VM_CNT_INC(v_vnodeout);
4518 VM_CNT_ADD(v_vnodepgsout, ncount);
4519 }
4520 dmu_tx_commit(tx);
4521
4522 out:
4523 zfs_rangelock_exit(lr);
4524 if (commit) {
4525 err = zil_commit(zfsvfs->z_log, zp->z_id);
4526 if (err != 0) {
4527 zfs_exit(zfsvfs, FTAG);
4528 return (err);
4529 }
4530 }
4531
4532 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
4533
4534 zfs_exit(zfsvfs, FTAG);
4535 return (rtvals[0]);
4536 }
4537
4538 #ifndef _SYS_SYSPROTO_H_
4539 struct vop_putpages_args {
4540 struct vnode *a_vp;
4541 vm_page_t *a_m;
4542 int a_count;
4543 int a_sync;
4544 int *a_rtvals;
4545 };
4546 #endif
4547
4548 static int
zfs_freebsd_putpages(struct vop_putpages_args * ap)4549 zfs_freebsd_putpages(struct vop_putpages_args *ap)
4550 {
4551
4552 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4553 ap->a_rtvals));
4554 }
4555
4556 #ifndef _SYS_SYSPROTO_H_
4557 struct vop_bmap_args {
4558 struct vnode *a_vp;
4559 daddr_t a_bn;
4560 struct bufobj **a_bop;
4561 daddr_t *a_bnp;
4562 int *a_runp;
4563 int *a_runb;
4564 };
4565 #endif
4566
4567 static int
zfs_freebsd_bmap(struct vop_bmap_args * ap)4568 zfs_freebsd_bmap(struct vop_bmap_args *ap)
4569 {
4570
4571 if (ap->a_bop != NULL)
4572 *ap->a_bop = &ap->a_vp->v_bufobj;
4573 if (ap->a_bnp != NULL)
4574 *ap->a_bnp = ap->a_bn;
4575 if (ap->a_runp != NULL)
4576 *ap->a_runp = 0;
4577 if (ap->a_runb != NULL)
4578 *ap->a_runb = 0;
4579
4580 return (0);
4581 }
4582
4583 #ifndef _SYS_SYSPROTO_H_
4584 struct vop_open_args {
4585 struct vnode *a_vp;
4586 int a_mode;
4587 struct ucred *a_cred;
4588 struct thread *a_td;
4589 };
4590 #endif
4591
4592 static int
zfs_freebsd_open(struct vop_open_args * ap)4593 zfs_freebsd_open(struct vop_open_args *ap)
4594 {
4595 vnode_t *vp = ap->a_vp;
4596 znode_t *zp = VTOZ(vp);
4597 int error;
4598
4599 error = zfs_open(&vp, ap->a_mode, ap->a_cred);
4600 if (error == 0)
4601 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4602 return (error);
4603 }
4604
4605 #ifndef _SYS_SYSPROTO_H_
4606 struct vop_close_args {
4607 struct vnode *a_vp;
4608 int a_fflag;
4609 struct ucred *a_cred;
4610 struct thread *a_td;
4611 };
4612 #endif
4613
4614 static int
zfs_freebsd_close(struct vop_close_args * ap)4615 zfs_freebsd_close(struct vop_close_args *ap)
4616 {
4617
4618 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred));
4619 }
4620
4621 #ifndef _SYS_SYSPROTO_H_
4622 struct vop_ioctl_args {
4623 struct vnode *a_vp;
4624 ulong_t a_command;
4625 caddr_t a_data;
4626 int a_fflag;
4627 struct ucred *cred;
4628 struct thread *td;
4629 };
4630 #endif
4631
4632 static int
zfs_freebsd_ioctl(struct vop_ioctl_args * ap)4633 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
4634 {
4635
4636 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4637 ap->a_fflag, ap->a_cred, NULL));
4638 }
4639
4640 static int
ioflags(int ioflags)4641 ioflags(int ioflags)
4642 {
4643 int flags = 0;
4644
4645 if (ioflags & IO_APPEND)
4646 flags |= O_APPEND;
4647 if (ioflags & IO_NDELAY)
4648 flags |= O_NONBLOCK;
4649 if (ioflags & IO_DIRECT)
4650 flags |= O_DIRECT;
4651 if (ioflags & IO_SYNC)
4652 flags |= O_SYNC;
4653
4654 return (flags);
4655 }
4656
#ifndef _SYS_SYSPROTO_H_
struct vop_read_args {
	struct vnode *a_vp;
	struct uio *a_uio;
	int a_ioflag;
	struct ucred *a_cred;
};
#endif

/*
 * VOP_READ: translate the FreeBSD ioflag bits and pass the request to the
 * common zfs_read() implementation.
 */
static int
zfs_freebsd_read(struct vop_read_args *ap)
{
	zfs_uio_t uio;
	int error = 0;
	zfs_uio_init(&uio, ap->a_uio);
	error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
	    ap->a_cred);
	/*
	 * XXX We occasionally get an EFAULT for Direct I/O reads on
	 * FreeBSD 13. This still needs to be resolved. The EFAULT comes
	 * from:
	 * zfs_uio_get__dio_pages_alloc() ->
	 * zfs_uio_get_dio_pages_impl() ->
	 * zfs_uio_iov_step() ->
	 * zfs_uio_get_user_pages().
	 * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
	 * read fails to map in the user pages (returning EFAULT) the
	 * Direct I/O request is broken up into two separate IO requests
	 * and issued separately using Direct I/O.
	 */
#ifdef ZFS_DEBUG
	if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
		/* Debug hook for the EFAULT case above; normally compiled out. */
#if 0
		printf("%s(%d): Direct I/O read returning EFAULT "
		    "uio = %p, zfs_uio_offset(uio) = %lu "
		    "zfs_uio_resid(uio) = %lu\n",
		    __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
		    zfs_uio_resid(&uio));
#endif
	}

#endif
	return (error);
}
4701
4702 #ifndef _SYS_SYSPROTO_H_
4703 struct vop_write_args {
4704 struct vnode *a_vp;
4705 struct uio *a_uio;
4706 int a_ioflag;
4707 struct ucred *a_cred;
4708 };
4709 #endif
4710
4711 static int
zfs_freebsd_write(struct vop_write_args * ap)4712 zfs_freebsd_write(struct vop_write_args *ap)
4713 {
4714 zfs_uio_t uio;
4715 zfs_uio_init(&uio, ap->a_uio);
4716 return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4717 ap->a_cred));
4718 }
4719
4720 /*
4721 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
4722 * the comment above cache_fplookup for details.
4723 */
4724 static int
zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args * v)4725 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
4726 {
4727 vnode_t *vp;
4728 znode_t *zp;
4729 uint64_t pflags;
4730
4731 vp = v->a_vp;
4732 zp = VTOZ_SMR(vp);
4733 if (__predict_false(zp == NULL))
4734 return (EAGAIN);
4735 pflags = atomic_load_64(&zp->z_pflags);
4736 if (pflags & ZFS_AV_QUARANTINED)
4737 return (EAGAIN);
4738 if (pflags & ZFS_XATTR)
4739 return (EAGAIN);
4740 if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
4741 return (EAGAIN);
4742 return (0);
4743 }
4744
/*
 * Lockless symlink resolution for the fast-path lookup: only succeeds if
 * the target string has already been cached on the znode; otherwise fall
 * back (EAGAIN) to the locked path.
 */
static int
zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
{
	vnode_t *vp;
	znode_t *zp;
	char *target;

	vp = v->a_vp;
	/* SMR-protected peek; NULL means the znode is being torn down. */
	zp = VTOZ_SMR(vp);
	if (__predict_false(zp == NULL)) {
		return (EAGAIN);
	}

	/* consume ordering pairs with the release store of the cached string */
	target = atomic_load_consume_ptr(&zp->z_cached_symlink);
	if (target == NULL) {
		return (EAGAIN);
	}
	return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
}
4764
#ifndef _SYS_SYSPROTO_H_
struct vop_access_args {
	struct vnode *a_vp;
	accmode_t a_accmode;
	struct ucred *a_cred;
	struct thread *a_td;
};
#endif

/*
 * VOP_ACCESS: ZFS checks VREAD/VWRITE/VEXEC/VAPPEND itself; the remaining
 * bits (notably VADMIN) are delegated to the generic vaccess().
 */
static int
zfs_freebsd_access(struct vop_access_args *ap)
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	accmode_t accmode;
	int error = 0;


	/* Fast path for a pure execute check (common during path lookup). */
	if (ap->a_accmode == VEXEC) {
		if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
			return (0);
	}

	/*
	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND,
	 */
	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
	if (accmode != 0) {
#if __FreeBSD_version >= 1500040
		/* For named attributes, do the checks. */
		if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
			error = zfs_access(zp, accmode, V_NAMEDATTR,
			    ap->a_cred);
		else
#endif
			error = zfs_access(zp, accmode, 0, ap->a_cred);
	}

	/*
	 * VADMIN has to be handled by vaccess().
	 */
	if (error == 0) {
		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
		if (accmode != 0) {
			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
			    zp->z_gid, accmode, ap->a_cred);
		}
	}

	/*
	 * For VEXEC, ensure that at least one execute bit is set for
	 * non-directories.
	 */
	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
		error = EACCES;
	}

	return (error);
}
4825
#ifndef _SYS_SYSPROTO_H_
/* Fallback declaration when sysproto.h does not provide the VOP args. */
struct vop_lookup_args {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
};
#endif
4833
#if __FreeBSD_version >= 1500040
/*
 * Look up (and, when CREATENAMED is set, create) the hidden extended
 * attribute directory of dvp that backs named attributes. On success
 * *vpp holds the directory, locked if LOCKLEAF was requested in cnp.
 */
static int
zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
    struct vnode **vpp)
{
	struct vnode *xvp;
	int error, flags;

	*vpp = NULL;
	flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
	if ((cnp->cn_flags & CREATENAMED) != 0)
		flags |= CREATE_XATTR_DIR;
	error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
	    B_FALSE);
	if (error == 0) {
		if ((cnp->cn_flags & LOCKLEAF) != 0)
			error = vn_lock(xvp, cnp->cn_lkflags);
		if (error == 0) {
			/* Tag the vnode as a named attribute directory. */
			vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
			*vpp = xvp;
		} else {
			/* vn_lock failed: drop the reference from the lookup. */
			vrele(xvp);
		}
	}
	return (error);
}
4860
4861 static ssize_t
zfs_readdir_named(struct vnode * vp,char * buf,ssize_t blen,off_t * offp,int * eofflagp,struct ucred * cred,struct thread * td)4862 zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
4863 int *eofflagp, struct ucred *cred, struct thread *td)
4864 {
4865 struct uio io;
4866 struct iovec iv;
4867 zfs_uio_t uio;
4868 int error;
4869
4870 io.uio_offset = *offp;
4871 io.uio_segflg = UIO_SYSSPACE;
4872 io.uio_rw = UIO_READ;
4873 io.uio_td = td;
4874 iv.iov_base = buf;
4875 iv.iov_len = blen;
4876 io.uio_iov = &iv;
4877 io.uio_iovcnt = 1;
4878 io.uio_resid = blen;
4879 zfs_uio_init(&uio, &io);
4880 error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
4881 if (error != 0)
4882 return (-1);
4883 *offp = io.uio_offset;
4884 return (blen - io.uio_resid);
4885 }
4886
/*
 * Return true if vp has at least one named attribute, i.e. its attribute
 * directory exists and contains an entry other than "." and ".." whose
 * name is not in a forbidden xattr namespace. Requires MNT_NAMEDATTR on
 * the mount.
 */
static bool
zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
{
	struct componentname cn;
	struct vnode *xvp;
	struct dirent *dp;
	off_t offs;
	ssize_t rsize;
	char *buf, *cp, *endcp;
	int eofflag, error;
	bool ret;

	MNT_ILOCK(vp->v_mount);
	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
		MNT_IUNLOCK(vp->v_mount);
		return (false);
	}
	MNT_IUNLOCK(vp->v_mount);

	/* Now see if a named attribute directory exists. */
	cn.cn_flags = LOCKLEAF;
	cn.cn_lkflags = LK_SHARED;
	cn.cn_cred = cred;
	error = zfs_lookup_nameddir(vp, &cn, &xvp);
	if (error != 0)
		return (false);

	/* It exists, so see if there is any entry other than "." and "..". */
	buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
	ret = false;
	offs = 0;
	do {
		rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
		    cred, curthread);
		if (rsize <= 0)
			break;
		cp = buf;
		endcp = &buf[rsize];
		/* Walk the packed dirent records in this buffer. */
		while (cp < endcp) {
			dp = (struct dirent *)cp;
			/* Accept any live entry whose name is not "." or "..". */
			if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
			    dp->d_type == DT_UNKNOWN) &&
			    !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
			    ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
			    (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
			    dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
				ret = true;
				break;
			}
			cp += dp->d_reclen;
		}
	} while (!ret && rsize > 0 && eofflag == 0);
	vput(xvp);
	free(buf, M_TEMP);
	return (ret);
}
4943
/*
 * Common lookup implementation, with named attribute (O_NAMEDATTR)
 * support. When the last component is being opened with O_NAMEDATTR and
 * dvp is a regular directory, the lookup is redirected into dvp's named
 * attribute directory.
 */
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];
	int error;
	struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
	bool is_nameddir, needs_nameddir, opennamed = false;

	/*
	 * These variables are used to handle the named attribute cases:
	 * opennamed - Is true when this is a call from open with O_NAMEDATTR
	 *    specified and it is the last component.
	 * is_nameddir - Is true when the directory is a named attribute dir.
	 * needs_nameddir - Is set when the lookup needs to look for/create
	 *    a named attribute directory. It is only set when
	 *    is_nameddir is false and opennamed is true.
	 * xvp - Is the directory that the lookup needs to be done in.
	 *    Usually dvp, unless needs_nameddir is true where it is the
	 *    result of the first non-named directory lookup.
	 * Note that name caching must be disabled for named attribute
	 * handling.
	 */
	needs_nameddir = false;
	xvp = dvp;
	opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
	    (OPENNAMED | ISLASTCN);
	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
	/* Named attribute dirs are flat: no subpaths, no "..". */
	if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
		return (ENOATTR);
	if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
		return (ENOATTR);
	if (opennamed || is_nameddir)
		cnp->cn_flags &= ~MAKEENTRY;
	if (opennamed && !is_nameddir)
		needs_nameddir = true;
	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
	error = 0;
	*vpp = NULL;
	if (needs_nameddir) {
		/* zfs_lookup_nameddir() wants dvp exclusively locked. */
		if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
		error = zfs_lookup_nameddir(dvp, cnp, &xvp);
		if (error == 0)
			is_nameddir = true;
	}
	if (error == 0) {
		if (!needs_nameddir || cnp->cn_namelen != 1 ||
		    *cnp->cn_nameptr != '.') {
			/* zfs_lookup() needs a NUL-terminated name copy. */
			strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
			    sizeof (nm)));
			error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
			    cnp->cn_cred, 0, cached);
			/* Tag entries found inside a named attribute dir. */
			if (is_nameddir && error == 0 &&
			    (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
			    (cnp->cn_flags & ISDOTDOT) == 0) {
				if ((*vpp)->v_type == VDIR)
					vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
				else
					vn_irflag_set_cond(*vpp,
					    VIRF_NAMEDATTR);
			}
			if (needs_nameddir && xvp != *vpp)
				vput(xvp);
		} else {
			/*
			 * Lookup of "." when a named attribute dir is needed.
			 */
			*vpp = xvp;
		}
	}
	return (error);
}
5017 #else
/*
 * Pre-named-attribute variant: copy the component name into a
 * NUL-terminated buffer and forward to the common zfs_lookup().
 */
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, 0, cached));
}
5030 #endif
5031
/*
 * VOP_CACHEDLOOKUP: the args structs share the same layout, so delegate
 * to zfs_freebsd_lookup() with caching enabled.
 */
static int
zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
{

	return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
}
5038
#ifndef _SYS_SYSPROTO_H_
struct vop_lookup_args {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
};
#endif

/*
 * VOP_LOOKUP entry: use the namecache when the dataset allows it (and the
 * lookup is not for a named attribute, which bypasses the cache);
 * otherwise do a direct uncached lookup.
 */
static int
zfs_cache_lookup(struct vop_lookup_args *ap)
{
	zfsvfs_t *zfsvfs;

	zfsvfs = ap->a_dvp->v_mount->mnt_data;
#if __FreeBSD_version >= 1500040
	if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
#else
	if (zfsvfs->z_use_namecache)
#endif
		return (vfs_cache_lookup(ap));
	else
		return (zfs_freebsd_lookup(ap, B_FALSE));
}
5062
#ifndef _SYS_SYSPROTO_H_
struct vop_create_args {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
};
#endif

/*
 * VOP_CREATE: create a regular file (or, with OPENNAMED, a named
 * attribute inside dvp's attribute directory) and return its vnode.
 */
static int
zfs_freebsd_create(struct vop_create_args *ap)
{
	zfsvfs_t *zfsvfs;
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	znode_t *zp = NULL;
	int rc, mode;
	struct vnode *dvp = ap->a_dvp;
#if __FreeBSD_version >= 1500040
	struct vnode *xvp;
	bool is_nameddir;
#endif

#if __FreeBSD_version < 1400068
	ASSERT(cnp->cn_flags & SAVENAME);
#endif

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;
	zfsvfs = ap->a_dvp->v_mount->mnt_data;
	*ap->a_vpp = NULL;

	rc = 0;
#if __FreeBSD_version >= 1500040
	xvp = NULL;
	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
	if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
		/* Needs a named attribute directory. */
		rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
		if (rc == 0) {
			/* Create inside the attribute directory instead. */
			dvp = xvp;
			is_nameddir = true;
		}
	}
	/* Reject reserved/forbidden attribute names. */
	if (is_nameddir && rc == 0)
		rc = zfs_check_attrname(cnp->cn_nameptr);
#endif

	if (rc == 0)
		rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
		    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
#if __FreeBSD_version >= 1500040
	if (xvp != NULL)
		vput(xvp);
#endif
	if (rc == 0) {
		*ap->a_vpp = ZTOV(zp);
#if __FreeBSD_version >= 1500040
		if (is_nameddir)
			vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
#endif
	}
	if (zfsvfs->z_use_namecache &&
	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);

	return (rc);
}
5131
#ifndef _SYS_SYSPROTO_H_
struct vop_remove_args {
	struct vnode *a_dvp;
	struct vnode *a_vp;
	struct componentname *a_cnp;
};
#endif

/*
 * VOP_REMOVE: unlink a_vp from a_dvp. When removing from a named
 * attribute directory, reject forbidden attribute names first.
 */
static int
zfs_freebsd_remove(struct vop_remove_args *ap)
{
	int error = 0;

#if __FreeBSD_version < 1400068
	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
#endif

#if __FreeBSD_version >= 1500040
	if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
		error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
#endif

	if (error == 0)
		error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
		    ap->a_cnp->cn_cred);
	return (error);
}
5159
5160 #ifndef _SYS_SYSPROTO_H_
5161 struct vop_mkdir_args {
5162 struct vnode *a_dvp;
5163 struct vnode **a_vpp;
5164 struct componentname *a_cnp;
5165 struct vattr *a_vap;
5166 };
5167 #endif
5168
5169 static int
zfs_freebsd_mkdir(struct vop_mkdir_args * ap)5170 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
5171 {
5172 vattr_t *vap = ap->a_vap;
5173 znode_t *zp = NULL;
5174 int rc;
5175
5176 #if __FreeBSD_version < 1400068
5177 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5178 #endif
5179
5180 vattr_init_mask(vap);
5181 *ap->a_vpp = NULL;
5182
5183 rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
5184 ap->a_cnp->cn_cred, 0, NULL, NULL);
5185
5186 if (rc == 0)
5187 *ap->a_vpp = ZTOV(zp);
5188 return (rc);
5189 }
5190
5191 #ifndef _SYS_SYSPROTO_H_
5192 struct vop_rmdir_args {
5193 struct vnode *a_dvp;
5194 struct vnode *a_vp;
5195 struct componentname *a_cnp;
5196 };
5197 #endif
5198
5199 static int
zfs_freebsd_rmdir(struct vop_rmdir_args * ap)5200 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
5201 {
5202 struct componentname *cnp = ap->a_cnp;
5203
5204 #if __FreeBSD_version < 1400068
5205 ASSERT(cnp->cn_flags & SAVENAME);
5206 #endif
5207
5208 return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5209 }
5210
5211 #ifndef _SYS_SYSPROTO_H_
5212 struct vop_readdir_args {
5213 struct vnode *a_vp;
5214 struct uio *a_uio;
5215 struct ucred *a_cred;
5216 int *a_eofflag;
5217 int *a_ncookies;
5218 cookie_t **a_cookies;
5219 };
5220 #endif
5221
5222 static int
zfs_freebsd_readdir(struct vop_readdir_args * ap)5223 zfs_freebsd_readdir(struct vop_readdir_args *ap)
5224 {
5225 zfs_uio_t uio;
5226 zfs_uio_init(&uio, ap->a_uio);
5227 return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
5228 ap->a_ncookies, ap->a_cookies));
5229 }
5230
#ifndef _SYS_SYSPROTO_H_
struct vop_fsync_args {
	struct vnode *a_vp;
	int a_waitfor;
	struct thread *a_td;
};
#endif

/*
 * VOP_FSYNC: flush dirty mmap()'d pages through the pager first, then run
 * the common zfs_fsync() which commits the ZIL.
 */
static int
zfs_freebsd_fsync(struct vop_fsync_args *ap)
{
	vnode_t *vp = ap->a_vp;
	int err = 0;

	/*
	 * Push any dirty mmap()'d data out to the DMU and ZIL, ready for
	 * zil_commit() to be called in zfs_fsync().
	 */
	if (vm_object_mightbedirty(vp->v_object)) {
		zfs_vmobject_wlock(vp->v_object);
		if (!vm_object_page_clean(vp->v_object, 0, 0, 0))
			err = SET_ERROR(EIO);
		zfs_vmobject_wunlock(vp->v_object);
		if (err) {
			/*
			 * Unclear what state things are in. zfs_putpages()
			 * will ensure the pages remain dirty if they haven't
			 * been written down to the DMU, but because there may
			 * be nothing logged, we can't assume that zfs_sync()
			 * -> zil_commit() will give us a useful error. It's
			 * safest if we just error out here.
			 */
			return (err);
		}
	}

	return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred));
}
5269
#ifndef _SYS_SYSPROTO_H_
struct vop_getattr_args {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
};
#endif

/*
 * VOP_GETATTR: fetch attributes via zfs_getattr() using an xvattr so the
 * extended ZFS flags come back too, then fold those into the BSD
 * chflags(2) va_flags word.
 */
static int
zfs_freebsd_getattr(struct vop_getattr_args *ap)
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	ulong_t fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;

#if __FreeBSD_version >= 1500040
	/* Surface named-attribute involvement via bsdflags. */
	if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
		vap->va_bsdflags |= SFBSD_NAMEDATTR;
#endif
	return (0);
}
5346
#ifndef _SYS_SYSPROTO_H_
struct vop_setattr_args {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
};
#endif

/*
 * VOP_SETATTR: translate BSD chflags(2) file flags into ZFS xvattr flags,
 * enforce the system-flag modification policy, then hand the update to
 * the common zfs_setattr().
 */
static int
zfs_freebsd_setattr(struct vop_setattr_args *ap)
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	ulong_t fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	zflags = VTOZ(vp)->z_pflags;

	if (vap->va_flags != VNOVAL) {
		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
		int error;

		if (zfsvfs->z_use_fuids == B_FALSE)
			return (EOPNOTSUPP);

		fflags = vap->va_flags;
		/*
		 * XXX KDM
		 * We need to figure out whether it makes sense to allow
		 * UF_REPARSE through, since we don't really have other
		 * facilities to handle reparse points and zfs_setattr()
		 * doesn't currently allow setting that attribute anyway.
		 */
		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
		    UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
		    UF_OFFLINE|UF_SPARSE)) != 0)
			return (EOPNOTSUPP);
		/*
		 * Unprivileged processes are not permitted to unset system
		 * flags, or modify flags if any system flags are set.
		 * Privileged non-jail processes may not modify system flags
		 * if securelevel > 0 and any existing system flags are set.
		 * Privileged jail processes behave like privileged non-jail
		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
		 * otherwise, they behave like unprivileged processes.
		 */
		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				error = securelevel_gt(cred, 0);
				if (error != 0)
					return (error);
			}
		} else {
			/*
			 * Callers may only modify the file flags on
			 * objects they have VADMIN rights for.
			 */
			if ((error = VOP_ACCESS(vp, VADMIN, cred,
			    curthread)) != 0)
				return (error);
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY |
			    ZFS_NOUNLINK)) {
				return (EPERM);
			}
			if (fflags &
			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
				return (EPERM);
			}
		}

/* Request an xvattr change for any flag whose BSD/ZFS state differs. */
#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?. */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
		    xvap.xva_xoptattrs.xoa_archive);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
		    xvap.xva_xoptattrs.xoa_readonly);
		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
		    xvap.xva_xoptattrs.xoa_system);
		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
		    xvap.xva_xoptattrs.xoa_hidden);
		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
		    xvap.xva_xoptattrs.xoa_reparse);
		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
		    xvap.xva_xoptattrs.xoa_offline);
		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
		    xvap.xva_xoptattrs.xoa_sparse);
#undef	FLAG_CHANGE
	}
	if (vap->va_birthtime.tv_sec != VNOVAL) {
		xvap.xva_vattr.va_mask |= AT_XVATTR;
		XVA_SET_REQ(&xvap, XAT_CREATETIME);
	}
	return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
}
5467
#ifndef _SYS_SYSPROTO_H_
struct vop_rename_args {
	struct vnode *a_fdvp;
	struct vnode *a_fvp;
	struct componentname *a_fcnp;
	struct vnode *a_tdvp;
	struct vnode *a_tvp;
	struct componentname *a_tcnp;
};
#endif

/*
 * VOP_RENAME: validate attribute names when renaming inside a named
 * attribute directory, then run the common rename. Per the VOP_RENAME
 * contract, all four vnode references are released here on every path.
 */
static int
zfs_freebsd_rename(struct vop_rename_args *ap)
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error = 0;

#if __FreeBSD_version < 1400068
	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
#endif

#if __FreeBSD_version >= 1500040
	if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
		error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
		if (error == 0)
			error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
	}
#endif

	if (error == 0)
		error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
		    ap->a_tcnp, ap->a_fcnp->cn_cred);

	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);

	return (error);
}
5513
#ifndef _SYS_SYSPROTO_H_
struct vop_symlink_args {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
	char *a_target;
};
#endif

/*
 * VOP_SYMLINK: create the symlink via the common zfs_symlink(), then seed
 * the namecache's symlink target cache on the new znode.
 */
static int
zfs_freebsd_symlink(struct vop_symlink_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	znode_t *zp = NULL;
	char *symlink;
	size_t symlink_len;
	int rc;

#if __FreeBSD_version < 1400068
	ASSERT(cnp->cn_flags & SAVENAME);
#endif

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
	vattr_init_mask(vap);
	*ap->a_vpp = NULL;

	rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
	    ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
	if (rc == 0) {
		*ap->a_vpp = ZTOV(zp);
		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
		MPASS(zp->z_cached_symlink == NULL);
		symlink_len = strlen(ap->a_target);
		symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
		if (symlink != NULL) {
			memcpy(symlink, ap->a_target, symlink_len);
			symlink[symlink_len] = '\0';
			/* Release store pairs with the consume load in the fast path. */
			atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
			    (uintptr_t)symlink);
		}
	}
	return (rc);
}
5559
#ifndef _SYS_SYSPROTO_H_
struct vop_readlink_args {
	struct vnode *a_vp;
	struct uio *a_uio;
	struct ucred *a_cred;
};
#endif

/*
 * VOP_READLINK: read the symlink target, opportunistically populating
 * the namecache symlink cache when the read lands in a single
 * kernel-space buffer.
 */
static int
zfs_freebsd_readlink(struct vop_readlink_args *ap)
{
	zfs_uio_t uio;
	int error;
	znode_t *zp = VTOZ(ap->a_vp);
	char *symlink, *base;
	size_t symlink_len;
	bool trycache;

	zfs_uio_init(&uio, ap->a_uio);
	/*
	 * Only try to cache when the request is one contiguous
	 * kernel-space iovec we can copy the result from afterwards.
	 */
	trycache = false;
	if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
	    zfs_uio_iovcnt(&uio) == 1) {
		base = zfs_uio_iovbase(&uio, 0);
		symlink_len = zfs_uio_iovlen(&uio, 0);
		trycache = true;
	}
	error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
	if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
	    error != 0 || !trycache) {
		return (error);
	}
	/* Bytes actually read = original iovec length minus residual. */
	symlink_len -= zfs_uio_resid(&uio);
	symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
	if (symlink != NULL) {
		memcpy(symlink, base, symlink_len);
		symlink[symlink_len] = '\0';
		/*
		 * A concurrent reader may install the cache first; free
		 * our copy if the compare-and-set loses the race.
		 */
		if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
		    (uintptr_t)NULL, (uintptr_t)symlink)) {
			cache_symlink_free(symlink, symlink_len + 1);
		}
	}
	return (error);
}
5603
5604 #ifndef _SYS_SYSPROTO_H_
5605 struct vop_link_args {
5606 struct vnode *a_tdvp;
5607 struct vnode *a_vp;
5608 struct componentname *a_cnp;
5609 };
5610 #endif
5611
5612 static int
zfs_freebsd_link(struct vop_link_args * ap)5613 zfs_freebsd_link(struct vop_link_args *ap)
5614 {
5615 struct componentname *cnp = ap->a_cnp;
5616 vnode_t *vp = ap->a_vp;
5617 vnode_t *tdvp = ap->a_tdvp;
5618
5619 if (tdvp->v_mount != vp->v_mount)
5620 return (EXDEV);
5621
5622 #if __FreeBSD_version < 1400068
5623 ASSERT(cnp->cn_flags & SAVENAME);
5624 #endif
5625
5626 return (zfs_link(VTOZ(tdvp), VTOZ(vp),
5627 cnp->cn_nameptr, cnp->cn_cred, 0));
5628 }
5629
5630 #ifndef _SYS_SYSPROTO_H_
5631 struct vop_inactive_args {
5632 struct vnode *a_vp;
5633 struct thread *a_td;
5634 };
5635 #endif
5636
5637 static int
zfs_freebsd_inactive(struct vop_inactive_args * ap)5638 zfs_freebsd_inactive(struct vop_inactive_args *ap)
5639 {
5640 vnode_t *vp = ap->a_vp;
5641
5642 zfs_inactive(vp, curthread->td_ucred, NULL);
5643 return (0);
5644 }
5645
#ifndef _SYS_SYSPROTO_H_
struct vop_need_inactive_args {
	struct vnode *a_vp;
	struct thread *a_td;
};
#endif

/*
 * VOP_NEED_INACTIVE: decide cheaply whether VOP_INACTIVE must run for
 * this vnode.  Returns non-zero when inactivation is required.
 */
static int
zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int need;

	/* Dirty pages must be flushed through inactivation. */
	if (vn_need_pageq_flush(vp))
		return (1);

	/*
	 * If the teardown lock cannot be taken without blocking, err on
	 * the side of requiring inactivation rather than sleeping here.
	 */
	if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
		return (1);
	need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);

	return (need);
}
5671
#ifndef _SYS_SYSPROTO_H_
struct vop_reclaim_args {
	struct vnode *a_vp;
	struct thread *a_td;
};
#endif

/*
 * VOP_RECLAIM: detach and dispose of the znode backing this vnode.
 */
static int
zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT3P(zp, !=, NULL);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
	if (zp->z_sa_hdl == NULL)
		/* SA handle already torn down (forced unmount): just free. */
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);

	/* Sever the vnode -> znode link. */
	vp->v_data = NULL;
	return (0);
}
5703
5704 #ifndef _SYS_SYSPROTO_H_
5705 struct vop_fid_args {
5706 struct vnode *a_vp;
5707 struct fid *a_fid;
5708 };
5709 #endif
5710
5711 static int
zfs_freebsd_fid(struct vop_fid_args * ap)5712 zfs_freebsd_fid(struct vop_fid_args *ap)
5713 {
5714
5715 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5716 }
5717
5718
#ifndef _SYS_SYSPROTO_H_
struct vop_pathconf_args {
	struct vnode *a_vp;
	int a_name;
	register_t *a_retval;
} *ap;
#endif

/*
 * VOP_PATHCONF: query per-file configuration limits.  Delegates to the
 * common ZFS layer first and fills in FreeBSD-specific names it does
 * not know about.
 */
static int
zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
	    curthread->td_ucred, NULL);
	if (error == 0) {
		*ap->a_retval = val;
		return (error);
	}
	/* Only EOPNOTSUPP falls through to the FreeBSD-side handling. */
	if (error != EOPNOTSUPP)
		return (error);

	switch (ap->a_name) {
	case _PC_NAME_MAX:
		*ap->a_retval = NAME_MAX;
		return (0);
#if __FreeBSD_version >= 1400032
	case _PC_DEALLOC_PRESENT:
		*ap->a_retval = 1;
		return (0);
#endif
	case _PC_PIPE_BUF:
		/* Only meaningful for directories and FIFOs. */
		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
			*ap->a_retval = PIPE_BUF;
			return (0);
		}
		return (EINVAL);
#if __FreeBSD_version >= 1500040
	case _PC_NAMEDATTR_ENABLED:
		/* Named attributes are enabled per mount (MNT_NAMEDATTR). */
		MNT_ILOCK(ap->a_vp->v_mount);
		if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
			*ap->a_retval = 1;
		else
			*ap->a_retval = 0;
		MNT_IUNLOCK(ap->a_vp->v_mount);
		return (0);
	case _PC_HAS_NAMEDATTR:
		if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
			*ap->a_retval = 1;
		else
			*ap->a_retval = 0;
		return (0);
#endif
#ifdef _PC_HAS_HIDDENSYSTEM
	case _PC_HAS_HIDDENSYSTEM:
		*ap->a_retval = 1;
		return (0);
#endif
	default:
		return (vop_stdpathconf(ap));
	}
}
5782
/*
 * Selects the naming scheme used for "user" namespace extended
 * attributes; see the namespace/prefix table above zfs_create_attrname().
 */
int zfs_xattr_compat = 1;
5784
5785 static int
zfs_check_attrname(const char * name)5786 zfs_check_attrname(const char *name)
5787 {
5788 /* We don't allow '/' character in attribute name. */
5789 if (strchr(name, '/') != NULL)
5790 return (SET_ERROR(EINVAL));
5791 /* We don't allow attribute names that start with a namespace prefix. */
5792 if (ZFS_XA_NS_PREFIX_FORBIDDEN(name))
5793 return (SET_ERROR(EINVAL));
5794 return (0);
5795 }
5796
5797 /*
5798 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5799 * extended attribute name:
5800 *
5801 * NAMESPACE XATTR_COMPAT PREFIX
5802 * system * freebsd:system:
5803 * user 1 (none, can be used to access ZFS
5804 * fsattr(5) attributes created on Solaris)
5805 * user 0 user.
5806 */
5807 static int
zfs_create_attrname(int attrnamespace,const char * name,char * attrname,size_t size,boolean_t compat)5808 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5809 size_t size, boolean_t compat)
5810 {
5811 const char *namespace, *prefix, *suffix;
5812
5813 memset(attrname, 0, size);
5814
5815 switch (attrnamespace) {
5816 case EXTATTR_NAMESPACE_USER:
5817 if (compat) {
5818 /*
5819 * This is the default namespace by which we can access
5820 * all attributes created on Solaris.
5821 */
5822 prefix = namespace = suffix = "";
5823 } else {
5824 /*
5825 * This is compatible with the user namespace encoding
5826 * on Linux prior to xattr_compat, but nothing
5827 * else.
5828 */
5829 prefix = "";
5830 namespace = "user";
5831 suffix = ".";
5832 }
5833 break;
5834 case EXTATTR_NAMESPACE_SYSTEM:
5835 prefix = "freebsd:";
5836 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5837 suffix = ":";
5838 break;
5839 case EXTATTR_NAMESPACE_EMPTY:
5840 default:
5841 return (SET_ERROR(EINVAL));
5842 }
5843 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5844 name) >= size) {
5845 return (SET_ERROR(ENAMETOOLONG));
5846 }
5847 return (0);
5848 }
5849
/*
 * Ensure zp->z_xattr_cached holds the SA xattr nvlist.  Must be called
 * with z_xattr_lock held (reader or writer); returns with the lock held
 * in the same mode it was entered with.
 */
static int
zfs_ensure_xattr_cached(znode_t *zp)
{
	int error = 0;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));

	/* Fast path: nvlist already loaded. */
	if (zp->z_xattr_cached != NULL)
		return (0);

	/* Caller already holds the lock as writer: load directly. */
	if (rw_write_held(&zp->z_xattr_lock))
		return (zfs_sa_get_xattr(zp));

	/*
	 * Upgrade reader -> writer.  If the upgrade fails the lock is
	 * dropped and re-acquired, so another thread may have populated
	 * the cache in the window; re-check before loading.
	 */
	if (!rw_tryupgrade(&zp->z_xattr_lock)) {
		rw_exit(&zp->z_xattr_lock);
		rw_enter(&zp->z_xattr_lock, RW_WRITER);
	}
	if (zp->z_xattr_cached == NULL)
		error = zfs_sa_get_xattr(zp);
	/* Hand the lock back as reader, as the caller expects. */
	rw_downgrade(&zp->z_xattr_lock);
	return (error);
}
5872
#ifndef _SYS_SYSPROTO_H_
struct vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
#endif

/*
 * Read an extended attribute stored as a file in the hidden xattr
 * directory of a_vp.
 */
static int
zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
{
	struct thread *td = ap->a_td;
	struct nameidata nd;
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	/* Find the hidden xattr directory attached to this vnode. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
	    LOOKUP_XATTR, B_FALSE);
	if (error != 0)
		return (error);

	/* Open the attribute file read-only, relative to xvp. */
	flags = FREAD;
#if __FreeBSD_version < 1400043
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
	    xvp, td);
#else
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
#endif
	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
	if (error != 0)
		return (SET_ERROR(error));
	vp = nd.ni_vp;
	NDFREE_PNBUF(&nd);

	/*
	 * With a_size set the caller only wants the attribute's length;
	 * otherwise copy the value out through the uio.
	 */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

	VOP_UNLOCK(vp);
	vn_close(vp, flags, ap->a_cred, td);
	return (error);
}
5923
/*
 * Read an extended attribute stored as a system attribute (SA) in the
 * znode's cached xattr nvlist.
 */
static int
zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
{
	znode_t *zp = VTOZ(ap->a_vp);
	uchar_t *nv_value;
	uint_t nv_size;
	int error;

	/* Make sure z_xattr_cached is populated before the lookup. */
	error = zfs_ensure_xattr_cached(zp);
	if (error != 0)
		return (error);

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
	ASSERT3P(zp->z_xattr_cached, !=, NULL);

	error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
	    &nv_value, &nv_size);
	if (error != 0)
		return (SET_ERROR(error));

	/* Report only the size, or copy the value out, per the caller. */
	if (ap->a_size != NULL)
		*ap->a_size = nv_size;
	else if (ap->a_uio != NULL)
		error = uiomove(nv_value, nv_size, ap->a_uio);
	if (error != 0)
		return (SET_ERROR(error));

	return (0);
}
5953
5954 static int
zfs_getextattr_impl(struct vop_getextattr_args * ap,boolean_t compat)5955 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
5956 {
5957 znode_t *zp = VTOZ(ap->a_vp);
5958 zfsvfs_t *zfsvfs = ZTOZSB(zp);
5959 char attrname[EXTATTR_MAXNAMELEN+1];
5960 int error;
5961
5962 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5963 sizeof (attrname), compat);
5964 if (error != 0)
5965 return (error);
5966
5967 error = ENOENT;
5968 if (zfsvfs->z_use_sa && zp->z_is_sa)
5969 error = zfs_getextattr_sa(ap, attrname);
5970 if (error == ENOENT)
5971 error = zfs_getextattr_dir(ap, attrname);
5972 return (error);
5973 }
5974
5975 /*
5976 * Vnode operation to retrieve a named extended attribute.
5977 */
5978 static int
zfs_getextattr(struct vop_getextattr_args * ap)5979 zfs_getextattr(struct vop_getextattr_args *ap)
5980 {
5981 znode_t *zp = VTOZ(ap->a_vp);
5982 zfsvfs_t *zfsvfs = ZTOZSB(zp);
5983 int error;
5984
5985 /*
5986 * If the xattr property is off, refuse the request.
5987 */
5988 if (!(zfsvfs->z_flags & ZSB_XATTR))
5989 return (SET_ERROR(EOPNOTSUPP));
5990
5991 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5992 ap->a_cred, ap->a_td, VREAD);
5993 if (error != 0)
5994 return (SET_ERROR(error));
5995
5996 error = zfs_check_attrname(ap->a_name);
5997 if (error != 0)
5998 return (error);
5999
6000 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6001 return (error);
6002 error = ENOENT;
6003 rw_enter(&zp->z_xattr_lock, RW_READER);
6004
6005 error = zfs_getextattr_impl(ap, zfs_xattr_compat);
6006 if ((error == ENOENT || error == ENOATTR) &&
6007 ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
6008 /*
6009 * Fall back to the alternate namespace format if we failed to
6010 * find a user xattr.
6011 */
6012 error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
6013 }
6014
6015 rw_exit(&zp->z_xattr_lock);
6016 zfs_exit(zfsvfs, FTAG);
6017 if (error == ENOENT)
6018 error = SET_ERROR(ENOATTR);
6019 return (error);
6020 }
6021
#ifndef _SYS_SYSPROTO_H_
struct vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
#endif

/*
 * Remove an extended attribute stored as a file in the hidden xattr
 * directory of a_vp.
 */
static int
zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
{
	struct nameidata nd;
	vnode_t *xvp = NULL, *vp;
	int error;

	/* Find the hidden xattr directory attached to this vnode. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
	    LOOKUP_XATTR, B_FALSE);
	if (error != 0)
		return (error);

	/* Look up the attribute file with intent to delete. */
#if __FreeBSD_version < 1400043
	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
	    UIO_SYSSPACE, attrname, xvp, ap->a_td);
#else
	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
	    UIO_SYSSPACE, attrname, xvp);
#endif
	error = namei(&nd);
	if (error != 0)
		return (SET_ERROR(error));

	vp = nd.ni_vp;
	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
	NDFREE_PNBUF(&nd);

	/* Drop the locked references taken by namei(). */
	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);

	return (error);
}
6067
/*
 * Remove an extended attribute stored as a system attribute (SA).
 * Requires z_xattr_lock held as writer.
 */
static int
zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
{
	znode_t *zp = VTOZ(ap->a_vp);
	nvlist_t *nvl;
	int error;

	error = zfs_ensure_xattr_cached(zp);
	if (error != 0)
		return (error);

	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
	ASSERT3P(zp->z_xattr_cached, !=, NULL);

	nvl = zp->z_xattr_cached;
	error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
	if (error != 0)
		error = SET_ERROR(error);
	else
		/* Persist the updated nvlist (NULL value == removal). */
		error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
	if (error != 0) {
		/*
		 * On failure the cached nvlist may no longer match the
		 * on-disk state; drop it so it is reloaded next time.
		 */
		zp->z_xattr_cached = NULL;
		nvlist_free(nvl);
	}
	return (error);
}
6094
6095 static int
zfs_deleteextattr_impl(struct vop_deleteextattr_args * ap,boolean_t compat)6096 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
6097 {
6098 znode_t *zp = VTOZ(ap->a_vp);
6099 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6100 char attrname[EXTATTR_MAXNAMELEN+1];
6101 int error;
6102
6103 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6104 sizeof (attrname), compat);
6105 if (error != 0)
6106 return (error);
6107
6108 error = ENOENT;
6109 if (zfsvfs->z_use_sa && zp->z_is_sa)
6110 error = zfs_deleteextattr_sa(ap, attrname);
6111 if (error == ENOENT)
6112 error = zfs_deleteextattr_dir(ap, attrname);
6113 return (error);
6114 }
6115
/*
 * Vnode operation to remove a named attribute.
 */
static int
zfs_deleteextattr(struct vop_deleteextattr_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	/*
	 * If the xattr property is off, refuse the request.
	 */
	if (!(zfsvfs->z_flags & ZSB_XATTR))
		return (SET_ERROR(EOPNOTSUPP));

	/* Verify the caller may write attributes in this namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error != 0)
		return (SET_ERROR(error));

	error = zfs_check_attrname(ap->a_name);
	if (error != 0)
		return (error);

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	rw_enter(&zp->z_xattr_lock, RW_WRITER);

	error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
	if ((error == ENOENT || error == ENOATTR) &&
	    ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
		/*
		 * Fall back to the alternate namespace format if we failed to
		 * find a user xattr.
		 */
		error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
	}

	rw_exit(&zp->z_xattr_lock);
	zfs_exit(zfsvfs, FTAG);
	/* FreeBSD reports missing attributes as ENOATTR. */
	if (error == ENOENT)
		error = SET_ERROR(ENOATTR);
	return (error);
}
6161
6162 #ifndef _SYS_SYSPROTO_H_
6163 struct vop_setextattr {
6164 IN struct vnode *a_vp;
6165 IN int a_attrnamespace;
6166 IN const char *a_name;
6167 INOUT struct uio *a_uio;
6168 IN struct ucred *a_cred;
6169 IN struct thread *a_td;
6170 };
6171 #endif
6172
6173 static int
zfs_setextattr_dir(struct vop_setextattr_args * ap,const char * attrname)6174 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
6175 {
6176 struct thread *td = ap->a_td;
6177 struct nameidata nd;
6178 struct vattr va;
6179 vnode_t *xvp = NULL, *vp;
6180 int error, flags;
6181
6182 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
6183 LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
6184 if (error != 0)
6185 return (error);
6186
6187 flags = FFLAGS(O_WRONLY | O_CREAT);
6188 #if __FreeBSD_version < 1400043
6189 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
6190 #else
6191 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
6192 #endif
6193 error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
6194 NULL);
6195 if (error != 0)
6196 return (SET_ERROR(error));
6197 vp = nd.ni_vp;
6198 NDFREE_PNBUF(&nd);
6199
6200 VATTR_NULL(&va);
6201 va.va_size = 0;
6202 error = VOP_SETATTR(vp, &va, ap->a_cred);
6203 if (error == 0)
6204 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6205
6206 VOP_UNLOCK(vp);
6207 vn_close(vp, flags, ap->a_cred, td);
6208 return (error);
6209 }
6210
/*
 * Store an extended attribute as a system attribute (SA) in the znode's
 * xattr nvlist.  Requires z_xattr_lock held as writer.
 */
static int
zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
{
	znode_t *zp = VTOZ(ap->a_vp);
	nvlist_t *nvl;
	size_t sa_size;
	int error;

	error = zfs_ensure_xattr_cached(zp);
	if (error != 0)
		return (error);

	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
	ASSERT3P(zp->z_xattr_cached, !=, NULL);

	nvl = zp->z_xattr_cached;
	size_t entry_size = ap->a_uio->uio_resid;
	/* Refuse values too large to live in a system attribute. */
	if (entry_size > DXATTR_MAX_ENTRY_SIZE)
		return (SET_ERROR(EFBIG));
	error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
	if (error != 0)
		return (SET_ERROR(error));
	if (sa_size > DXATTR_MAX_SA_SIZE)
		return (SET_ERROR(EFBIG));
	/* Copy the value in from the caller and add it to the nvlist. */
	uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
	error = uiomove(buf, entry_size, ap->a_uio);
	if (error != 0) {
		error = SET_ERROR(error);
	} else {
		error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
		if (error != 0)
			error = SET_ERROR(error);
	}
	if (error == 0)
		error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
	kmem_free(buf, entry_size);
	if (error != 0) {
		/*
		 * On failure the cached nvlist may no longer match the
		 * on-disk state; drop it so it is reloaded next time.
		 */
		zp->z_xattr_cached = NULL;
		nvlist_free(nvl);
	}
	return (error);
}
6253
/*
 * Store an extended attribute using the given naming scheme, preferring
 * the SA store and falling back to the xattr directory.  After a
 * successful store the other location (and the alternate compat naming)
 * is purged so only one copy of the attribute remains.
 */
static int
zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	char attrname[EXTATTR_MAXNAMELEN+1];
	int error;

	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof (attrname), compat);
	if (error != 0)
		return (error);

	/* Delete-args mirror of ap, used to purge stale copies below. */
	struct vop_deleteextattr_args vda = {
		.a_vp = ap->a_vp,
		.a_attrnamespace = ap->a_attrnamespace,
		.a_name = ap->a_name,
		.a_cred = ap->a_cred,
		.a_td = ap->a_td,
	};
	error = ENOENT;
	if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
		error = zfs_setextattr_sa(ap, attrname);
		if (error == 0) {
			/*
			 * Successfully put into SA, we need to clear the one
			 * in dir if present.
			 */
			zfs_deleteextattr_dir(&vda, attrname);
		}
	}
	if (error != 0) {
		error = zfs_setextattr_dir(ap, attrname);
		if (error == 0 && zp->z_is_sa) {
			/*
			 * Successfully put into dir, we need to clear the one
			 * in SA if present.
			 */
			zfs_deleteextattr_sa(&vda, attrname);
		}
	}
	if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
		/*
		 * Also clear all versions of the alternate compat name.
		 */
		zfs_deleteextattr_impl(&vda, !compat);
	}
	return (error);
}
6303
/*
 * Vnode operation to set a named attribute.
 */
static int
zfs_setextattr(struct vop_setextattr_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	/*
	 * If the xattr property is off, refuse the request.
	 */
	if (!(zfsvfs->z_flags & ZSB_XATTR))
		return (SET_ERROR(EOPNOTSUPP));

	/* Verify the caller may write attributes in this namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error != 0)
		return (SET_ERROR(error));

	error = zfs_check_attrname(ap->a_name);
	if (error != 0)
		return (error);

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	rw_enter(&zp->z_xattr_lock, RW_WRITER);

	error = zfs_setextattr_impl(ap, zfs_xattr_compat);

	rw_exit(&zp->z_xattr_lock);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
6339
#ifndef _SYS_SYSPROTO_H_
struct vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
#endif

/*
 * List extended attributes stored as files in the hidden xattr
 * directory, restricted to names starting with attrprefix.
 */
static int
zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
{
	struct thread *td = ap->a_td;
	struct nameidata nd;
	uint8_t dirbuf[sizeof (struct dirent)];
	struct iovec aiov;
	struct uio auio;
	vnode_t *xvp = NULL, *vp;
	int error, eof;

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
	    LOOKUP_XATTR, B_FALSE);
	if (error != 0) {
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.
		 */
		if (error == ENOATTR)
			error = 0;
		return (error);
	}

	/* Lock the xattr directory so it can be read with VOP_READDIR. */
#if __FreeBSD_version < 1400043
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp, td);
#else
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp);
#endif
	error = namei(&nd);
	if (error != 0)
		return (SET_ERROR(error));
	vp = nd.ni_vp;
	NDFREE_PNBUF(&nd);

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	size_t plen = strlen(attrprefix);

	/* Read directory entries one buffer-full at a time until EOF. */
	do {
		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof (dirbuf);
		auio.uio_resid = sizeof (dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		if (error != 0)
			break;
		int done = sizeof (dirbuf) - auio.uio_resid;
		for (int pos = 0; pos < done; ) {
			struct dirent *dp = (struct dirent *)(dirbuf + pos);
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			else if (plen == 0 &&
			    ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			uint8_t nlen = dp->d_namlen - plen;
			if (ap->a_size != NULL) {
				/* Size query: one length byte plus name. */
				*ap->a_size += 1 + nlen;
			} else if (ap->a_uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, ap->a_uio);
				if (error == 0) {
					char *namep = dp->d_name + plen;
					error = uiomove(namep, nlen, ap->a_uio);
				}
				if (error != 0) {
					error = SET_ERROR(error);
					break;
				}
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	return (error);
}
6442
/*
 * List extended attributes stored as system attributes (SA), restricted
 * to names starting with attrprefix.
 */
static int
zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
{
	znode_t *zp = VTOZ(ap->a_vp);
	int error;

	error = zfs_ensure_xattr_cached(zp);
	if (error != 0)
		return (error);

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
	ASSERT3P(zp->z_xattr_cached, !=, NULL);

	/* Walk the cached SA xattr nvlist, filtering by prefix. */
	size_t plen = strlen(attrprefix);
	nvpair_t *nvp = NULL;
	while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
		ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);

		const char *name = nvpair_name(nvp);
		/* With an empty prefix, hide reserved-namespace names. */
		if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
			continue;
		else if (strncmp(name, attrprefix, plen) != 0)
			continue;
		uint8_t nlen = strlen(name) - plen;
		if (ap->a_size != NULL) {
			*ap->a_size += 1 + nlen;
		} else if (ap->a_uio != NULL) {
			/*
			 * Format of extattr name entry is one byte for
			 * length and the rest for name.
			 */
			error = uiomove(&nlen, 1, ap->a_uio);
			if (error == 0) {
				char *namep = __DECONST(char *, name) + plen;
				error = uiomove(namep, nlen, ap->a_uio);
			}
			if (error != 0) {
				error = SET_ERROR(error);
				break;
			}
		}
	}

	return (error);
}
6488
6489 static int
zfs_listextattr_impl(struct vop_listextattr_args * ap,boolean_t compat)6490 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
6491 {
6492 znode_t *zp = VTOZ(ap->a_vp);
6493 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6494 char attrprefix[16];
6495 int error;
6496
6497 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6498 sizeof (attrprefix), compat);
6499 if (error != 0)
6500 return (error);
6501
6502 if (zfsvfs->z_use_sa && zp->z_is_sa)
6503 error = zfs_listextattr_sa(ap, attrprefix);
6504 if (error == 0)
6505 error = zfs_listextattr_dir(ap, attrprefix);
6506 return (error);
6507 }
6508
/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
zfs_listextattr(struct vop_listextattr_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	/* The size accumulates across both listing passes below. */
	if (ap->a_size != NULL)
		*ap->a_size = 0;

	/*
	 * If the xattr property is off, refuse the request.
	 */
	if (!(zfsvfs->z_flags & ZSB_XATTR))
		return (SET_ERROR(EOPNOTSUPP));

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (SET_ERROR(error));

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	rw_enter(&zp->z_xattr_lock, RW_READER);

	error = zfs_listextattr_impl(ap, zfs_xattr_compat);
	if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
		/* Also list user xattrs with the alternate format. */
		error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
	}

	rw_exit(&zp->z_xattr_lock);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
6547
#ifndef _SYS_SYSPROTO_H_
struct vop_getacl_args {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
};
#endif

/*
 * VOP_GETACL: fetch the file's ACL and convert it to a FreeBSD
 * struct acl.  Only NFSv4-style ACLs are supported.
 */
static int
zfs_freebsd_getacl(struct vop_getacl_args *ap)
{
	int error;
	vsecattr_t vsecattr;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
	if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
	    &vsecattr, 0, ap->a_cred)))
		return (error);

	/* Convert the ZFS ACE array into the caller's struct acl. */
	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
	    vsecattr.vsa_aclcnt);
	if (vsecattr.vsa_aclentp != NULL)
		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);

	return (error);
}
6579
#ifndef _SYS_SYSPROTO_H_
struct vop_setacl_args {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
};
#endif

/*
 * VOP_SETACL: validate and apply an NFSv4 ACL to the file.
 */
static int
zfs_freebsd_setacl(struct vop_setacl_args *ap)
{
	int error;
	vsecattr_t vsecattr;
	int aclbsize;	/* size of acl list in bytes */
	aclent_t *aaclp;

	if (ap->a_type != ACL_TYPE_NFS4)
		return (EINVAL);

	if (ap->a_aclp == NULL)
		return (EINVAL);

	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
		return (EINVAL);

	/*
	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
	 * splitting every entry into two and appending "canonical six"
	 * entries at the end. Don't allow for setting an ACL that would
	 * cause chmod(2) to run out of ACL entries.
	 */
	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
		return (ENOSPC);

	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
	if (error != 0)
		return (error);

	/* Convert the FreeBSD ACL into a ZFS ACE array and apply it. */
	vsecattr.vsa_mask = VSA_ACE;
	aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
	aaclp = vsecattr.vsa_aclentp;
	vsecattr.vsa_aclentsz = aclbsize;

	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
	error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
	kmem_free(aaclp, aclbsize);

	return (error);
}
6632
6633 #ifndef _SYS_SYSPROTO_H_
6634 struct vop_aclcheck_args {
6635 struct vnode *vp;
6636 acl_type_t type;
6637 struct acl *aclp;
6638 struct ucred *cred;
6639 struct thread *td;
6640 };
6641 #endif
6642
6643 static int
zfs_freebsd_aclcheck(struct vop_aclcheck_args * ap)6644 zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
6645 {
6646
6647 return (EOPNOTSUPP);
6648 }
6649
#ifndef _SYS_SYSPROTO_H_
struct vop_advise_args {
	struct vnode *a_vp;
	off_t a_start;
	off_t a_end;
	int a_advice;
};
#endif

/*
 * VOP_ADVISE: apply posix_fadvise(2) hints.  Only POSIX_FADV_WILLNEED
 * is acted upon (as a DMU prefetch); the rest are accepted and ignored.
 */
static int
zfs_freebsd_advise(struct vop_advise_args *ap)
{
	vnode_t *vp = ap->a_vp;
	off_t start = ap->a_start;
	off_t end = ap->a_end;
	int advice = ap->a_advice;
	off_t len;
	znode_t *zp;
	zfsvfs_t *zfsvfs;
	objset_t *os;
	int error = 0;

	if (end < start)
		return (EINVAL);

	error = vn_lock(vp, LK_SHARED);
	if (error)
		return (error);

	zp = VTOZ(vp);
	zfsvfs = zp->z_zfsvfs;
	os = zp->z_zfsvfs->z_os;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		goto out_unlock;

	/* kern_posix_fadvise points to the last byte, we want one past */
	if (end != OFF_MAX)
		end += 1;
	len = end - start;

	switch (advice) {
	case POSIX_FADV_WILLNEED:
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it. If there really
		 * is a larger sequential access pattern, perhaps dmu_zfetch
		 * will detect it.
		 */
		dmu_prefetch(os, zp->z_id, 0, start, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

out_unlock:
	VOP_UNLOCK(vp);

	return (error);
}
6721
/*
 * VOP_VPTOCNP: translate a vnode into its component name within the
 * parent directory.  The name is prepended into ap->a_buf (filling
 * from the tail, decrementing *ap->a_buflen) and the parent vnode is
 * returned in *ap->a_vpp.
 */
static int
zfs_vptocnp(struct vop_vptocnp_args *ap)
{
	vnode_t *covered_vp;
	vnode_t *vp = ap->a_vp;
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	znode_t *zp = VTOZ(vp);
	int ltype;
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	/*
	 * If we are a snapshot mounted under .zfs, run the operation
	 * on the covered vnode.
	 */
	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
		/* Ordinary vnode: resolve parent and name via the SA layer. */
		char name[MAXNAMLEN + 1];
		znode_t *dzp;
		size_t len;

		error = zfs_znode_parent_and_name(zp, &dzp, name,
		    sizeof (name));
		if (error == 0) {
			len = strlen(name);
			/* Name is written into the tail of the caller's buffer. */
			if (*ap->a_buflen < len)
				error = SET_ERROR(ENOMEM);
		}
		if (error == 0) {
			*ap->a_buflen -= len;
			memcpy(ap->a_buf + *ap->a_buflen, name, len);
			*ap->a_vpp = ZTOV(dzp);
		}
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}
	zfs_exit(zfsvfs, FTAG);

	/*
	 * Snapshot root under .zfs: forward the request to the vnode
	 * covered by this mount.  Take a hold on the covered vnode first,
	 * drop our own lock, and restore the original lock state before
	 * returning.
	 */
	covered_vp = vp->v_mount->mnt_vnodecovered;
	enum vgetstate vs = vget_prep(covered_vp);
	ltype = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp);
	error = vget_finish(covered_vp, LK_SHARED, vs);
	if (error == 0) {
		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
		    ap->a_buflen);
		vput(covered_vp);
	}
	vn_lock(vp, ltype | LK_RETRY);
	/* The vnode may have been recycled while we had it unlocked. */
	if (VN_IS_DOOMED(vp))
		error = SET_ERROR(ENOENT);
	return (error);
}
6776
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE: release the byte range [*a_offset, *a_offset + *a_len)
 * via zfs_freesp().  On success *a_offset is advanced past the freed
 * range and *a_len is set to zero.  The ZIL is committed when the
 * dataset is sync=always or the caller requested synchronous I/O.
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *log;
	off_t start, length, size;
	int rc;

	if ((rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (rc);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	log = zfsvfs->z_log;
	start = *ap->a_offset;
	length = *ap->a_len;
	size = zp->z_size;

	/* Clamp the request to EOF. */
	if (start + length > size)
		length = size - start;

	/* Requests entirely beyond EOF are trivially complete. */
	if (length <= 0) {
		*ap->a_len = 0;
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	rc = zfs_freesp(zp, start, length, O_RDWR, TRUE);
	if (rc == 0 && (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
	    (ap->a_ioflag & IO_SYNC) != 0))
		rc = zil_commit(log, zp->z_id);
	if (rc == 0) {
		*ap->a_offset = start + length;
		*ap->a_len = 0;
	}

	zfs_exit(zfsvfs, FTAG);
	return (rc);
}
#endif
6827
#ifndef _SYS_SYSPROTO_H_
/*
 * Fallback declaration of the VOP_COPY_FILE_RANGE(9) argument
 * structure, used only when sys/sysproto.h has not already provided
 * it.  (The declaration previously lacked its terminating semicolon,
 * which would break any build taking this branch.)
 */
struct vop_copy_file_range_args {
	struct vnode *a_invp;
	off_t *a_inoffp;
	struct vnode *a_outvp;
	off_t *a_outoffp;
	size_t *a_lenp;
	unsigned int a_flags;
	struct ucred *a_incred;
	struct ucred *a_outcred;
	struct thread *a_fsizetd;
};
#endif
6841 /*
6842 * TODO: FreeBSD will only call file system-specific copy_file_range() if both
6843 * files resides under the same mountpoint. In case of ZFS we want to be called
6844 * even is files are in different datasets (but on the same pools, but we need
6845 * to check that ourselves).
6846 */
/*
 * VOP_COPY_FILE_RANGE: attempt a ZFS block clone from invp to outvp.
 * On any condition where cloning cannot proceed (feature disabled,
 * feature not enabled on the pool, lock failure, or zfs_clone_range()
 * reporting EXDEV/EAGAIN/EINVAL/EOPNOTSUPP), returns ENOSYS so the
 * kernel falls back to the generic byte-copy path.
 */
static int
zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
{
	zfsvfs_t *outzfsvfs;
	struct vnode *invp = ap->a_invp;
	struct vnode *outvp = ap->a_outvp;
	struct mount *mp;
	int error;
	uint64_t len = *ap->a_lenp;

	if (!zfs_bclone_enabled) {
		mp = NULL;
		goto bad_write_fallback;
	}

	/*
	 * TODO: If offset/length is not aligned to recordsize, use
	 * vn_generic_copy_file_range() on this fragment.
	 * It would be better to do this after we lock the vnodes, but then we
	 * need something else than vn_generic_copy_file_range().
	 */

	vn_start_write(outvp, &mp, V_WAIT);
	if (__predict_true(mp == outvp->v_mount)) {
		outzfsvfs = (zfsvfs_t *)mp->mnt_data;
		/* Block cloning requires the pool feature to be enabled. */
		if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
		    SPA_FEATURE_BLOCK_CLONING)) {
			goto bad_write_fallback;
		}
	}
	if (invp == outvp) {
		/* Same vnode: a single exclusive lock suffices. */
		if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
			goto bad_write_fallback;
		}
	} else {
		/* Lock both vnodes together to avoid deadlock. */
#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
	__FreeBSD_version >= 1400086
		vn_lock_pair(invp, false, LK_SHARED, outvp, false,
		    LK_EXCLUSIVE);
#else
		vn_lock_pair(invp, false, outvp, false);
#endif
		if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
			goto bad_locked_fallback;
		}
	}

#ifdef MAC
	error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
	    outvp);
	if (error != 0)
		goto out_locked;
#endif

	error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
	    ap->a_outoffp, &len, ap->a_outcred);
	/* These errors mean cloning is not possible; use the generic copy. */
	if (error == EXDEV || error == EAGAIN || error == EINVAL ||
	    error == EOPNOTSUPP)
		goto bad_locked_fallback;
	*ap->a_lenp = (size_t)len;
#ifdef MAC
out_locked:
#endif
	if (invp != outvp)
		VOP_UNLOCK(invp);
	VOP_UNLOCK(outvp);
	if (mp != NULL)
		vn_finished_write(mp);
	return (error);

bad_locked_fallback:
	if (invp != outvp)
		VOP_UNLOCK(invp);
	VOP_UNLOCK(outvp);
bad_write_fallback:
	if (mp != NULL)
		vn_finished_write(mp);
	error = ENOSYS;
	return (error);
}
6927
/* The three vnode operation vectors defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
6931
/*
 * Vnode operations for regular ZFS files and directories.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	/* Lifecycle. */
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_need_inactive =	zfs_freebsd_need_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	/* Lockless (fast-path) lookup support. */
	.vop_fplookup_vexec =	zfs_freebsd_fplookup_vexec,
	.vop_fplookup_symlink =	zfs_freebsd_fplookup_symlink,
	.vop_access =		zfs_freebsd_access,
	.vop_allocate =		VOP_EOPNOTSUPP,
#if __FreeBSD_version >= 1400032
	.vop_deallocate =	zfs_deallocate,
#endif
	/* Name lookup goes through the namecache first. */
	.vop_lookup =		zfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_cachedlookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		(vop_mknod_t *)zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_advise =		zfs_freebsd_advise,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	/* Extended attributes and ACLs. */
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
	/* ZFS-specific vnode locking entry points. */
	.vop_lock1 =		vop_lock,
	.vop_unlock =		vop_unlock,
	.vop_islocked =		vop_islocked,
#if __FreeBSD_version >= 1400043
	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
#endif
	.vop_copy_file_range =	zfs_freebsd_copy_file_range,
};
VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
6987
/*
 * Vnode operations for FIFOs (named pipes) stored on ZFS: data I/O is
 * handled by fifo_specops, while metadata operations go to ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_fplookup_vexec =	zfs_freebsd_fplookup_vexec,
	.vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	/* FIFO data transfer must never reach the file read/write paths. */
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
#if __FreeBSD_version >= 1400043
	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
#endif
};
VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
7010
7011 /*
7012 * special share hidden files vnode operations template
7013 */
7014 struct vop_vector zfs_shareops = {
7015 .vop_default = &default_vnodeops,
7016 .vop_fplookup_vexec = VOP_EAGAIN,
7017 .vop_fplookup_symlink = VOP_EAGAIN,
7018 .vop_access = zfs_freebsd_access,
7019 .vop_inactive = zfs_freebsd_inactive,
7020 .vop_reclaim = zfs_freebsd_reclaim,
7021 .vop_fid = zfs_freebsd_fid,
7022 .vop_pathconf = zfs_freebsd_pathconf,
7023 #if __FreeBSD_version >= 1400043
7024 .vop_add_writecount = vop_stdadd_writecount_nomsync,
7025 #endif
7026 };
7027 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
7028
/* Tunable: zfs.xattr_compat (read-write module parameter). */
ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
	"Use legacy ZFS xattr naming for writing new user namespace xattrs");
7031