1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2019 Joyent, Inc. 27 * Copyright 2017 Nexenta Systems, Inc. 28 */ 29 30 /* Portions Copyright 2007 Jeremy Teo */ 31 /* Portions Copyright 2010 Robert Milkowski */ 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/time.h> 36 #include <sys/systm.h> 37 #include <sys/sysmacros.h> 38 #include <sys/resource.h> 39 #include <sys/vfs.h> 40 #include <sys/vfs_opreg.h> 41 #include <sys/vnode.h> 42 #include <sys/file.h> 43 #include <sys/stat.h> 44 #include <sys/kmem.h> 45 #include <sys/taskq.h> 46 #include <sys/uio.h> 47 #include <sys/vmsystm.h> 48 #include <sys/atomic.h> 49 #include <sys/vm.h> 50 #include <vm/seg_vn.h> 51 #include <vm/pvn.h> 52 #include <vm/as.h> 53 #include <vm/kpm.h> 54 #include <vm/seg_kpm.h> 55 #include <sys/mman.h> 56 #include <sys/pathname.h> 57 #include <sys/cmn_err.h> 58 #include <sys/errno.h> 59 #include <sys/unistd.h> 60 #include <sys/zfs_dir.h> 61 #include <sys/zfs_acl.h> 62 #include <sys/zfs_ioctl.h> 63 #include <sys/fs/zfs.h> 64 #include <sys/dmu.h> 65 #include <sys/dmu_objset.h> 66 #include <sys/spa.h> 67 #include <sys/txg.h> 68 #include <sys/dbuf.h> 69 #include <sys/zap.h> 70 #include <sys/sa.h> 71 #include <sys/dirent.h> 72 #include <sys/policy.h> 73 #include <sys/sunddi.h> 74 #include <sys/filio.h> 75 #include <sys/sid.h> 76 #include "fs/fs_subr.h" 77 #include <sys/zfs_ctldir.h> 78 #include <sys/zfs_fuid.h> 79 #include <sys/zfs_sa.h> 80 #include <sys/dnlc.h> 81 #include <sys/zfs_rlock.h> 82 #include <sys/extdirent.h> 83 #include <sys/kidmap.h> 84 #include <sys/cred.h> 85 #include <sys/attr.h> 86 #include <sys/zil.h> 87 #include <sys/sa_impl.h> 88 #include <sys/zfs_project.h> 89 90 /* 91 * Programming rules. 92 * 93 * Each vnode op performs some logical unit of work. To do this, the ZPL must 94 * properly lock its in-core state, create a DMU transaction, do the work, 95 * record this work in the intent log (ZIL), commit the DMU transaction, 96 * and wait for the intent log to commit if it is a synchronous operation. 97 * Moreover, the vnode ops must work in both normal and log replay context. 98 * The ordering of events is important to avoid deadlocks and references 99 * to freed memory. The example below illustrates the following Big Rules: 100 * 101 * (1) A check must be made in each zfs thread for a mounted file system. 102 * This is done avoiding races using ZFS_ENTER(zfsvfs). 103 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 104 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 105 * can return EIO from the calling function. 106 * 107 * (2) VN_RELE() should always be the last thing except for zil_commit() 108 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 109 * First, if it's the last reference, the vnode/znode 110 * can be freed, so the zp may point to freed memory. Second, the last 111 * reference will call zfs_zinactive(), which may induce a lot of work -- 112 * pushing cached pages (which acquires range locks) and syncing out 113 * cached atime changes. Third, zfs_zinactive() may require a new tx, 114 * which could deadlock the system if you were already holding one. 115 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 116 * 117 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 118 * as they can span dmu_tx_assign() calls. 119 * 120 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 121 * dmu_tx_assign(). This is critical because we don't want to block 122 * while holding locks. 123 * 124 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 125 * reduces lock contention and CPU usage when we must wait (note that if 126 * throughput is constrained by the storage, nearly every transaction 127 * must wait). 128 * 129 * Note, in particular, that if a lock is sometimes acquired before 130 * the tx assigns, and sometimes after (e.g. z_lock), then failing 131 * to use a non-blocking assign can deadlock the system. The scenario: 132 * 133 * Thread A has grabbed a lock before calling dmu_tx_assign(). 134 * Thread B is in an already-assigned tx, and blocks for this lock. 135 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 136 * forever, because the previous txg can't quiesce until B's tx commits. 137 * 138 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 139 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 140 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, 141 * to indicate that this operation has already called dmu_tx_wait(). 142 * This will ensure that we don't retry forever, waiting a short bit 143 * each time. 144 * 145 * (5) If the operation succeeded, generate the intent log entry for it 146 * before dropping locks. This ensures that the ordering of events 147 * in the intent log matches the order in which they actually occurred. 148 * During ZIL replay the zfs_log_* functions will update the sequence 149 * number to indicate the zil transaction has replayed. 150 * 151 * (6) At the end of each vnode op, the DMU tx must always commit, 152 * regardless of whether there were any errors. 153 * 154 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 155 * to ensure that synchronous semantics are provided when necessary. 156 * 157 * In general, this is how things should be ordered in each vnode op: 158 * 159 * ZFS_ENTER(zfsvfs); // exit if unmounted 160 * top: 161 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 162 * rw_enter(...); // grab any other locks you need 163 * tx = dmu_tx_create(...); // get DMU tx 164 * dmu_tx_hold_*(); // hold each object you might modify 165 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 166 * if (error) { 167 * rw_exit(...); // drop locks 168 * zfs_dirent_unlock(dl); // unlock directory entry 169 * VN_RELE(...); // release held vnodes 170 * if (error == ERESTART) { 171 * waited = B_TRUE; 172 * dmu_tx_wait(tx); 173 * dmu_tx_abort(tx); 174 * goto top; 175 * } 176 * dmu_tx_abort(tx); // abort DMU tx 177 * ZFS_EXIT(zfsvfs); // finished in zfs 178 * return (error); // really out of space 179 * } 180 * error = do_real_work(); // do whatever this VOP does 181 * if (error == 0) 182 * zfs_log_*(...); // on success, make ZIL entry 183 * dmu_tx_commit(tx); // commit DMU tx -- error or not 184 * rw_exit(...); // drop locks 185 * zfs_dirent_unlock(dl); // unlock directory entry 186 * VN_RELE(...); // release held vnodes 187 * zil_commit(zilog, foid); // synchronous when necessary 188 * ZFS_EXIT(zfsvfs); // finished in zfs 189 * return (error); // done, report error 190 */ 191 192 /* ARGSUSED */ 193 static int 194 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 195 { 196 znode_t *zp = VTOZ(*vpp); 197 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 198 199 ZFS_ENTER(zfsvfs); 200 ZFS_VERIFY_ZP(zp); 201 202 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 203 ((flag & FAPPEND) == 0)) { 204 ZFS_EXIT(zfsvfs); 205 return (SET_ERROR(EPERM)); 206 } 207 208 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 209 ZTOV(zp)->v_type == VREG && 210 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { 211 if (fs_vscan(*vpp, cr, 0) != 0) { 212 ZFS_EXIT(zfsvfs); 213 return (SET_ERROR(EACCES)); 214 } 215 } 216 217 /* Keep a count of the synchronous opens in the znode */ 218 if (flag & (FSYNC | FDSYNC)) 219 atomic_inc_32(&zp->z_sync_cnt); 220 221 ZFS_EXIT(zfsvfs); 222 return (0); 223 } 224 225 /* ARGSUSED */ 226 static int 227 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 228 caller_context_t *ct) 229 { 230 znode_t *zp = VTOZ(vp); 231 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 232 233 /* 234 * Clean up any locks held by this process on the vp. 235 */ 236 cleanlocks(vp, ddi_get_pid(), 0); 237 cleanshares(vp, ddi_get_pid()); 238 239 ZFS_ENTER(zfsvfs); 240 ZFS_VERIFY_ZP(zp); 241 242 /* Decrement the synchronous opens in the znode */ 243 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 244 atomic_dec_32(&zp->z_sync_cnt); 245 246 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 247 ZTOV(zp)->v_type == VREG && 248 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) 249 VERIFY(fs_vscan(vp, cr, 1) == 0); 250 251 ZFS_EXIT(zfsvfs); 252 return (0); 253 } 254 255 /* 256 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 257 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 258 */ 259 static int 260 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 261 { 262 znode_t *zp = VTOZ(vp); 263 uint64_t noff = (uint64_t)*off; /* new offset */ 264 uint64_t file_sz; 265 int error; 266 boolean_t hole; 267 268 file_sz = zp->z_size; 269 if (noff >= file_sz) { 270 return (SET_ERROR(ENXIO)); 271 } 272 273 if (cmd == _FIO_SEEK_HOLE) 274 hole = B_TRUE; 275 else 276 hole = B_FALSE; 277 278 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 279 280 if (error == ESRCH) 281 return (SET_ERROR(ENXIO)); 282 283 /* 284 * We could find a hole that begins after the logical end-of-file, 285 * because dmu_offset_next() only works on whole blocks. If the 286 * EOF falls mid-block, then indicate that the "virtual hole" 287 * at the end of the file begins at the logical EOF, rather than 288 * at the end of the last block. 289 */ 290 if (noff > file_sz) { 291 ASSERT(hole); 292 noff = file_sz; 293 } 294 295 if (noff < *off) 296 return (error); 297 *off = noff; 298 return (error); 299 } 300 301 static int 302 zfs_ioctl_getxattr(vnode_t *vp, intptr_t data, int flag, cred_t *cr, 303 caller_context_t *ct) 304 { 305 zfsxattr_t fsx = { 0 }; 306 znode_t *zp = VTOZ(vp); 307 308 if (zp->z_pflags & ZFS_PROJINHERIT) 309 fsx.fsx_xflags = ZFS_PROJINHERIT_FL; 310 if (zp->z_pflags & ZFS_PROJID) 311 fsx.fsx_projid = zp->z_projid; 312 if (ddi_copyout(&fsx, (void *)data, sizeof (fsx), flag)) 313 return (SET_ERROR(EFAULT)); 314 315 return (0); 316 } 317 318 static int zfs_setattr(vnode_t *, vattr_t *, int, cred_t *, caller_context_t *); 319 320 static int 321 zfs_ioctl_setxattr(vnode_t *vp, intptr_t data, int flags, cred_t *cr, 322 caller_context_t *ct) 323 { 324 znode_t *zp = VTOZ(vp); 325 zfsxattr_t fsx; 326 xvattr_t xva; 327 xoptattr_t *xoap; 328 int err; 329 330 if (ddi_copyin((void *)data, &fsx, sizeof (fsx), flags)) 331 return (SET_ERROR(EFAULT)); 332 333 if (!zpl_is_valid_projid(fsx.fsx_projid)) 334 return (SET_ERROR(EINVAL)); 335 336 if (fsx.fsx_xflags & ~ZFS_PROJINHERIT_FL) 337 return (SET_ERROR(EOPNOTSUPP)); 338 339 xva_init(&xva); 340 xoap = xva_getxoptattr(&xva); 341 342 XVA_SET_REQ(&xva, XAT_PROJINHERIT); 343 if (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) 344 xoap->xoa_projinherit = B_TRUE; 345 346 XVA_SET_REQ(&xva, XAT_PROJID); 347 xoap->xoa_projid = fsx.fsx_projid; 348 349 return (zfs_setattr(vp, (vattr_t *)&xva, flags, cr, ct)); 350 } 351 352 /* ARGSUSED */ 353 static int 354 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 355 int *rvalp, caller_context_t *ct) 356 { 357 offset_t off; 358 offset_t ndata; 359 dmu_object_info_t doi; 360 int error; 361 zfsvfs_t *zfsvfs; 362 znode_t *zp; 363 364 switch (com) { 365 case _FIOFFS: 366 { 367 return (zfs_sync(vp->v_vfsp, 0, cred)); 368 369 /* 370 * The following two ioctls are used by bfu. Faking out, 371 * necessary to avoid bfu errors. 372 */ 373 } 374 case _FIOGDIO: 375 case _FIOSDIO: 376 { 377 return (0); 378 } 379 380 case _FIO_SEEK_DATA: 381 case _FIO_SEEK_HOLE: 382 { 383 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 384 return (SET_ERROR(EFAULT)); 385 386 zp = VTOZ(vp); 387 zfsvfs = zp->z_zfsvfs; 388 ZFS_ENTER(zfsvfs); 389 ZFS_VERIFY_ZP(zp); 390 391 /* offset parameter is in/out */ 392 error = zfs_holey(vp, com, &off); 393 ZFS_EXIT(zfsvfs); 394 if (error) 395 return (error); 396 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 397 return (SET_ERROR(EFAULT)); 398 return (0); 399 } 400 case _FIO_COUNT_FILLED: 401 { 402 /* 403 * _FIO_COUNT_FILLED adds a new ioctl command which 404 * exposes the number of filled blocks in a 405 * ZFS object. 406 */ 407 zp = VTOZ(vp); 408 zfsvfs = zp->z_zfsvfs; 409 ZFS_ENTER(zfsvfs); 410 ZFS_VERIFY_ZP(zp); 411 412 /* 413 * Wait for all dirty blocks for this object 414 * to get synced out to disk, and the DMU info 415 * updated. 416 */ 417 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); 418 if (error) { 419 ZFS_EXIT(zfsvfs); 420 return (error); 421 } 422 423 /* 424 * Retrieve fill count from DMU object. 425 */ 426 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); 427 if (error) { 428 ZFS_EXIT(zfsvfs); 429 return (error); 430 } 431 432 ndata = doi.doi_fill_count; 433 434 ZFS_EXIT(zfsvfs); 435 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) 436 return (SET_ERROR(EFAULT)); 437 return (0); 438 } 439 case ZFS_IOC_FSGETXATTR: 440 return (zfs_ioctl_getxattr(vp, data, flag, cred, ct)); 441 case ZFS_IOC_FSSETXATTR: 442 return (zfs_ioctl_setxattr(vp, data, flag, cred, ct)); 443 } 444 return (SET_ERROR(ENOTTY)); 445 } 446 447 /* 448 * Utility functions to map and unmap a single physical page. These 449 * are used to manage the mappable copies of ZFS file data, and therefore 450 * do not update ref/mod bits. 451 */ 452 caddr_t 453 zfs_map_page(page_t *pp, enum seg_rw rw) 454 { 455 if (kpm_enable) 456 return (hat_kpm_mapin(pp, 0)); 457 ASSERT(rw == S_READ || rw == S_WRITE); 458 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), 459 (caddr_t)-1)); 460 } 461 462 void 463 zfs_unmap_page(page_t *pp, caddr_t addr) 464 { 465 if (kpm_enable) { 466 hat_kpm_mapout(pp, 0, addr); 467 } else { 468 ppmapout(addr); 469 } 470 } 471 472 /* 473 * When a file is memory mapped, we must keep the IO data synchronized 474 * between the DMU cache and the memory mapped pages. What this means: 475 * 476 * On Write: If we find a memory mapped page, we write to *both* 477 * the page and the dmu buffer. 478 */ 479 static void 480 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) 481 { 482 int64_t off; 483 484 off = start & PAGEOFFSET; 485 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 486 page_t *pp; 487 uint64_t nbytes = MIN(PAGESIZE - off, len); 488 489 if (pp = page_lookup(vp, start, SE_SHARED)) { 490 caddr_t va; 491 492 va = zfs_map_page(pp, S_WRITE); 493 (void) dmu_read(os, oid, start+off, nbytes, va+off, 494 DMU_READ_PREFETCH); 495 zfs_unmap_page(pp, va); 496 page_unlock(pp); 497 } 498 len -= nbytes; 499 off = 0; 500 } 501 } 502 503 /* 504 * When a file is memory mapped, we must keep the IO data synchronized 505 * between the DMU cache and the memory mapped pages. What this means: 506 * 507 * On Read: We "read" preferentially from memory mapped pages, 508 * else we default from the dmu buffer. 509 * 510 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 511 * the file is memory mapped. 512 */ 513 static int 514 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 515 { 516 znode_t *zp = VTOZ(vp); 517 int64_t start, off; 518 int len = nbytes; 519 int error = 0; 520 521 start = uio->uio_loffset; 522 off = start & PAGEOFFSET; 523 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 524 page_t *pp; 525 uint64_t bytes = MIN(PAGESIZE - off, len); 526 527 if (pp = page_lookup(vp, start, SE_SHARED)) { 528 caddr_t va; 529 530 va = zfs_map_page(pp, S_READ); 531 error = uiomove(va + off, bytes, UIO_READ, uio); 532 zfs_unmap_page(pp, va); 533 page_unlock(pp); 534 } else { 535 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 536 uio, bytes); 537 } 538 len -= bytes; 539 off = 0; 540 if (error) 541 break; 542 } 543 return (error); 544 } 545 546 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 547 548 /* 549 * Read bytes from specified file into supplied buffer. 550 * 551 * IN: vp - vnode of file to be read from. 552 * uio - structure supplying read location, range info, 553 * and return buffer. 554 * ioflag - SYNC flags; used to provide FRSYNC semantics. 555 * cr - credentials of caller. 556 * ct - caller context 557 * 558 * OUT: uio - updated offset and range, buffer filled. 559 * 560 * RETURN: 0 on success, error code on failure. 561 * 562 * Side Effects: 563 * vp - atime updated if byte count > 0 564 */ 565 /* ARGSUSED */ 566 static int 567 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 568 { 569 znode_t *zp = VTOZ(vp); 570 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 571 ssize_t n, nbytes; 572 int error = 0; 573 boolean_t frsync = B_FALSE; 574 xuio_t *xuio = NULL; 575 576 ZFS_ENTER(zfsvfs); 577 ZFS_VERIFY_ZP(zp); 578 579 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 580 ZFS_EXIT(zfsvfs); 581 return (SET_ERROR(EACCES)); 582 } 583 584 /* 585 * Validate file offset 586 */ 587 if (uio->uio_loffset < (offset_t)0) { 588 ZFS_EXIT(zfsvfs); 589 return (SET_ERROR(EINVAL)); 590 } 591 592 /* 593 * Fasttrack empty reads 594 */ 595 if (uio->uio_resid == 0) { 596 ZFS_EXIT(zfsvfs); 597 return (0); 598 } 599 600 /* 601 * Check for mandatory locks 602 */ 603 if (MANDMODE(zp->z_mode)) { 604 if (error = chklock(vp, FREAD, 605 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 606 ZFS_EXIT(zfsvfs); 607 return (error); 608 } 609 } 610 611 #ifdef FRSYNC 612 /* 613 * If we're in FRSYNC mode, sync out this znode before reading it. 614 * Only do this for non-snapshots. 615 * 616 * Some platforms do not support FRSYNC and instead map it 617 * to FSYNC, which results in unnecessary calls to zil_commit. We 618 * only honor FRSYNC requests on platforms which support it. 619 */ 620 frsync = !!(ioflag & FRSYNC); 621 #endif 622 623 if (zfsvfs->z_log && 624 (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 625 zil_commit(zfsvfs->z_log, zp->z_id); 626 627 /* 628 * Lock the range against changes. 629 */ 630 locked_range_t *lr = rangelock_enter(&zp->z_rangelock, 631 uio->uio_loffset, uio->uio_resid, RL_READER); 632 633 /* 634 * If we are reading past end-of-file we can skip 635 * to the end; but we might still need to set atime. 636 */ 637 if (uio->uio_loffset >= zp->z_size) { 638 error = 0; 639 goto out; 640 } 641 642 ASSERT(uio->uio_loffset < zp->z_size); 643 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); 644 645 if ((uio->uio_extflg == UIO_XUIO) && 646 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { 647 int nblk; 648 int blksz = zp->z_blksz; 649 uint64_t offset = uio->uio_loffset; 650 651 xuio = (xuio_t *)uio; 652 if ((ISP2(blksz))) { 653 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, 654 blksz)) / blksz; 655 } else { 656 ASSERT(offset + n <= blksz); 657 nblk = 1; 658 } 659 (void) dmu_xuio_init(xuio, nblk); 660 661 if (vn_has_cached_data(vp)) { 662 /* 663 * For simplicity, we always allocate a full buffer 664 * even if we only expect to read a portion of a block. 665 */ 666 while (--nblk >= 0) { 667 (void) dmu_xuio_add(xuio, 668 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 669 blksz), 0, blksz); 670 } 671 } 672 } 673 674 while (n > 0) { 675 nbytes = MIN(n, zfs_read_chunk_size - 676 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 677 678 if (vn_has_cached_data(vp)) { 679 error = mappedread(vp, nbytes, uio); 680 } else { 681 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 682 uio, nbytes); 683 } 684 if (error) { 685 /* convert checksum errors into IO errors */ 686 if (error == ECKSUM) 687 error = SET_ERROR(EIO); 688 break; 689 } 690 691 n -= nbytes; 692 } 693 out: 694 rangelock_exit(lr); 695 696 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 697 ZFS_EXIT(zfsvfs); 698 return (error); 699 } 700 701 /* 702 * Write the bytes to a file. 703 * 704 * IN: vp - vnode of file to be written to. 705 * uio - structure supplying write location, range info, 706 * and data buffer. 707 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is 708 * set if in append mode. 709 * cr - credentials of caller. 710 * ct - caller context (NFS/CIFS fem monitor only) 711 * 712 * OUT: uio - updated offset and range. 713 * 714 * RETURN: 0 on success, error code on failure. 715 * 716 * Timestamps: 717 * vp - ctime|mtime updated if byte count > 0 718 */ 719 720 /* ARGSUSED */ 721 static int 722 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 723 { 724 znode_t *zp = VTOZ(vp); 725 rlim64_t limit = uio->uio_llimit; 726 ssize_t start_resid = uio->uio_resid; 727 ssize_t tx_bytes; 728 uint64_t end_size; 729 dmu_tx_t *tx; 730 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 731 zilog_t *zilog; 732 offset_t woff; 733 ssize_t n, nbytes; 734 int max_blksz = zfsvfs->z_max_blksz; 735 int error = 0; 736 int prev_error; 737 arc_buf_t *abuf; 738 iovec_t *aiov = NULL; 739 xuio_t *xuio = NULL; 740 int i_iov = 0; 741 int iovcnt = uio->uio_iovcnt; 742 iovec_t *iovp = uio->uio_iov; 743 int write_eof; 744 int count = 0; 745 sa_bulk_attr_t bulk[4]; 746 uint64_t mtime[2], ctime[2]; 747 748 /* 749 * Fasttrack empty write 750 */ 751 n = start_resid; 752 if (n == 0) 753 return (0); 754 755 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 756 limit = MAXOFFSET_T; 757 758 ZFS_ENTER(zfsvfs); 759 ZFS_VERIFY_ZP(zp); 760 761 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 762 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 763 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 764 &zp->z_size, 8); 765 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 766 &zp->z_pflags, 8); 767 768 /* 769 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 770 * callers might not be able to detect properly that we are read-only, 771 * so check it explicitly here. 772 */ 773 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 774 ZFS_EXIT(zfsvfs); 775 return (SET_ERROR(EROFS)); 776 } 777 778 /* 779 * If immutable or not appending then return EPERM. 780 * Intentionally allow ZFS_READONLY through here. 781 * See zfs_zaccess_common() 782 */ 783 if ((zp->z_pflags & ZFS_IMMUTABLE) || 784 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 785 (uio->uio_loffset < zp->z_size))) { 786 ZFS_EXIT(zfsvfs); 787 return (SET_ERROR(EPERM)); 788 } 789 790 zilog = zfsvfs->z_log; 791 792 /* 793 * Validate file offset 794 */ 795 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 796 if (woff < 0) { 797 ZFS_EXIT(zfsvfs); 798 return (SET_ERROR(EINVAL)); 799 } 800 801 /* 802 * Check for mandatory locks before calling rangelock_enter() 803 * in order to prevent a deadlock with locks set via fcntl(). 804 */ 805 if (MANDMODE((mode_t)zp->z_mode) && 806 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 807 ZFS_EXIT(zfsvfs); 808 return (error); 809 } 810 811 /* 812 * Pre-fault the pages to ensure slow (eg NFS) pages 813 * don't hold up txg. 814 * Skip this if uio contains loaned arc_buf. 815 */ 816 if ((uio->uio_extflg == UIO_XUIO) && 817 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 818 xuio = (xuio_t *)uio; 819 else 820 uio_prefaultpages(MIN(n, max_blksz), uio); 821 822 /* 823 * If in append mode, set the io offset pointer to eof. 824 */ 825 locked_range_t *lr; 826 if (ioflag & FAPPEND) { 827 /* 828 * Obtain an appending range lock to guarantee file append 829 * semantics. We reset the write offset once we have the lock. 830 */ 831 lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); 832 woff = lr->lr_offset; 833 if (lr->lr_length == UINT64_MAX) { 834 /* 835 * We overlocked the file because this write will cause 836 * the file block size to increase. 837 * Note that zp_size cannot change with this lock held. 838 */ 839 woff = zp->z_size; 840 } 841 uio->uio_loffset = woff; 842 } else { 843 /* 844 * Note that if the file block size will change as a result of 845 * this write, then this range lock will lock the entire file 846 * so that we can re-write the block safely. 847 */ 848 lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); 849 } 850 851 if (woff >= limit) { 852 rangelock_exit(lr); 853 ZFS_EXIT(zfsvfs); 854 return (SET_ERROR(EFBIG)); 855 } 856 857 if ((woff + n) > limit || woff > (limit - n)) 858 n = limit - woff; 859 860 /* Will this write extend the file length? */ 861 write_eof = (woff + n > zp->z_size); 862 863 end_size = MAX(zp->z_size, woff + n); 864 865 /* 866 * Write the file in reasonable size chunks. Each chunk is written 867 * in a separate transaction; this keeps the intent log records small 868 * and allows us to do more fine-grained space accounting. 869 */ 870 while (n > 0) { 871 woff = uio->uio_loffset; 872 873 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 874 zp->z_uid) || 875 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 876 zp->z_gid) || 877 (zp->z_projid != ZFS_DEFAULT_PROJID && 878 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 879 zp->z_projid))) { 880 error = SET_ERROR(EDQUOT); 881 break; 882 } 883 884 arc_buf_t *abuf = NULL; 885 if (xuio) { 886 ASSERT(i_iov < iovcnt); 887 aiov = &iovp[i_iov]; 888 abuf = dmu_xuio_arcbuf(xuio, i_iov); 889 dmu_xuio_clear(xuio, i_iov); 890 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 891 iovec_t *, aiov, arc_buf_t *, abuf); 892 ASSERT((aiov->iov_base == abuf->b_data) || 893 ((char *)aiov->iov_base - (char *)abuf->b_data + 894 aiov->iov_len == arc_buf_size(abuf))); 895 i_iov++; 896 } else if (n >= max_blksz && woff >= zp->z_size && 897 P2PHASE(woff, max_blksz) == 0 && 898 zp->z_blksz == max_blksz) { 899 /* 900 * This write covers a full block. "Borrow" a buffer 901 * from the dmu so that we can fill it before we enter 902 * a transaction. This avoids the possibility of 903 * holding up the transaction if the data copy hangs 904 * up on a pagefault (e.g., from an NFS server mapping). 905 */ 906 size_t cbytes; 907 908 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 909 max_blksz); 910 ASSERT(abuf != NULL); 911 ASSERT(arc_buf_size(abuf) == max_blksz); 912 if (error = uiocopy(abuf->b_data, max_blksz, 913 UIO_WRITE, uio, &cbytes)) { 914 dmu_return_arcbuf(abuf); 915 break; 916 } 917 ASSERT(cbytes == max_blksz); 918 } 919 920 /* 921 * Start a transaction. 922 */ 923 tx = dmu_tx_create(zfsvfs->z_os); 924 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 925 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 926 zfs_sa_upgrade_txholds(tx, zp); 927 error = dmu_tx_assign(tx, TXG_WAIT); 928 if (error) { 929 dmu_tx_abort(tx); 930 if (abuf != NULL) 931 dmu_return_arcbuf(abuf); 932 break; 933 } 934 935 /* 936 * If rangelock_enter() over-locked we grow the blocksize 937 * and then reduce the lock range. This will only happen 938 * on the first iteration since rangelock_reduce() will 939 * shrink down lr_length to the appropriate size. 940 */ 941 if (lr->lr_length == UINT64_MAX) { 942 uint64_t new_blksz; 943 944 if (zp->z_blksz > max_blksz) { 945 /* 946 * File's blocksize is already larger than the 947 * "recordsize" property. Only let it grow to 948 * the next power of 2. 949 */ 950 ASSERT(!ISP2(zp->z_blksz)); 951 new_blksz = MIN(end_size, 952 1 << highbit64(zp->z_blksz)); 953 } else { 954 new_blksz = MIN(end_size, max_blksz); 955 } 956 zfs_grow_blocksize(zp, new_blksz, tx); 957 rangelock_reduce(lr, woff, n); 958 } 959 960 /* 961 * XXX - should we really limit each write to z_max_blksz? 962 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 963 */ 964 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 965 966 if (abuf == NULL) { 967 tx_bytes = uio->uio_resid; 968 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 969 uio, nbytes, tx); 970 tx_bytes -= uio->uio_resid; 971 } else { 972 tx_bytes = nbytes; 973 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 974 /* 975 * If this is not a full block write, but we are 976 * extending the file past EOF and this data starts 977 * block-aligned, use assign_arcbuf(). Otherwise, 978 * write via dmu_write(). 979 */ 980 if (tx_bytes < max_blksz && (!write_eof || 981 aiov->iov_base != abuf->b_data)) { 982 ASSERT(xuio); 983 dmu_write(zfsvfs->z_os, zp->z_id, woff, 984 aiov->iov_len, aiov->iov_base, tx); 985 dmu_return_arcbuf(abuf); 986 xuio_stat_wbuf_copied(); 987 } else { 988 ASSERT(xuio || tx_bytes == max_blksz); 989 dmu_assign_arcbuf_by_dbuf( 990 sa_get_db(zp->z_sa_hdl), woff, abuf, tx); 991 } 992 ASSERT(tx_bytes <= uio->uio_resid); 993 uioskip(uio, tx_bytes); 994 } 995 if (tx_bytes && vn_has_cached_data(vp)) { 996 update_pages(vp, woff, 997 tx_bytes, zfsvfs->z_os, zp->z_id); 998 } 999 1000 /* 1001 * If we made no progress, we're done. If we made even 1002 * partial progress, update the znode and ZIL accordingly. 1003 */ 1004 if (tx_bytes == 0) { 1005 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1006 (void *)&zp->z_size, sizeof (uint64_t), tx); 1007 dmu_tx_commit(tx); 1008 ASSERT(error != 0); 1009 break; 1010 } 1011 1012 /* 1013 * Clear Set-UID/Set-GID bits on successful write if not 1014 * privileged and at least one of the excute bits is set. 1015 * 1016 * It would be nice to to this after all writes have 1017 * been done, but that would still expose the ISUID/ISGID 1018 * to another app after the partial write is committed. 1019 * 1020 * Note: we don't call zfs_fuid_map_id() here because 1021 * user 0 is not an ephemeral uid. 1022 */ 1023 mutex_enter(&zp->z_acl_lock); 1024 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1025 (S_IXUSR >> 6))) != 0 && 1026 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1027 secpolicy_vnode_setid_retain(cr, 1028 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1029 uint64_t newmode; 1030 zp->z_mode &= ~(S_ISUID | S_ISGID); 1031 newmode = zp->z_mode; 1032 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1033 (void *)&newmode, sizeof (uint64_t), tx); 1034 } 1035 mutex_exit(&zp->z_acl_lock); 1036 1037 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1038 B_TRUE); 1039 1040 /* 1041 * Update the file size (zp_size) if it has changed; 1042 * account for possible concurrent updates. 1043 */ 1044 while ((end_size = zp->z_size) < uio->uio_loffset) { 1045 (void) atomic_cas_64(&zp->z_size, end_size, 1046 uio->uio_loffset); 1047 } 1048 /* 1049 * If we are replaying and eof is non zero then force 1050 * the file size to the specified eof. Note, there's no 1051 * concurrency during replay. 1052 */ 1053 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1054 zp->z_size = zfsvfs->z_replay_eof; 1055 1056 /* 1057 * Keep track of a possible pre-existing error from a partial 1058 * write via dmu_write_uio_dbuf above. 1059 */ 1060 prev_error = error; 1061 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1062 1063 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1064 dmu_tx_commit(tx); 1065 1066 if (prev_error != 0 || error != 0) 1067 break; 1068 ASSERT(tx_bytes == nbytes); 1069 n -= nbytes; 1070 1071 if (!xuio && n > 0) 1072 uio_prefaultpages(MIN(n, max_blksz), uio); 1073 } 1074 1075 rangelock_exit(lr); 1076 1077 /* 1078 * If we're in replay mode, or we made no progress, return error. 1079 * Otherwise, it's at least a partial write, so it's successful. 1080 */ 1081 if (zfsvfs->z_replay || uio->uio_resid == start_resid) { 1082 ZFS_EXIT(zfsvfs); 1083 return (error); 1084 } 1085 1086 if (ioflag & (FSYNC | FDSYNC) || 1087 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1088 zil_commit(zilog, zp->z_id); 1089 1090 ZFS_EXIT(zfsvfs); 1091 return (0); 1092 } 1093 1094 /* ARGSUSED */ 1095 void 1096 zfs_get_done(zgd_t *zgd, int error) 1097 { 1098 znode_t *zp = zgd->zgd_private; 1099 objset_t *os = zp->z_zfsvfs->z_os; 1100 1101 if (zgd->zgd_db) 1102 dmu_buf_rele(zgd->zgd_db, zgd); 1103 1104 rangelock_exit(zgd->zgd_lr); 1105 1106 /* 1107 * Release the vnode asynchronously as we currently have the 1108 * txg stopped from syncing. 1109 */ 1110 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1111 1112 kmem_free(zgd, sizeof (zgd_t)); 1113 } 1114 1115 #ifdef DEBUG 1116 static int zil_fault_io = 0; 1117 #endif 1118 1119 /* 1120 * Get data to generate a TX_WRITE intent log record. 1121 */ 1122 int 1123 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) 1124 { 1125 zfsvfs_t *zfsvfs = arg; 1126 objset_t *os = zfsvfs->z_os; 1127 znode_t *zp; 1128 uint64_t object = lr->lr_foid; 1129 uint64_t offset = lr->lr_offset; 1130 uint64_t size = lr->lr_length; 1131 dmu_buf_t *db; 1132 zgd_t *zgd; 1133 int error = 0; 1134 1135 ASSERT3P(lwb, !=, NULL); 1136 ASSERT3P(zio, !=, NULL); 1137 ASSERT3U(size, !=, 0); 1138 1139 /* 1140 * Nothing to do if the file has been removed 1141 */ 1142 if (zfs_zget(zfsvfs, object, &zp) != 0) 1143 return (SET_ERROR(ENOENT)); 1144 if (zp->z_unlinked) { 1145 /* 1146 * Release the vnode asynchronously as we currently have the 1147 * txg stopped from syncing. 1148 */ 1149 VN_RELE_ASYNC(ZTOV(zp), 1150 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1151 return (SET_ERROR(ENOENT)); 1152 } 1153 1154 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1155 zgd->zgd_lwb = lwb; 1156 zgd->zgd_private = zp; 1157 1158 /* 1159 * Write records come in two flavors: immediate and indirect. 1160 * For small writes it's cheaper to store the data with the 1161 * log record (immediate); for large writes it's cheaper to 1162 * sync the data and get a pointer to it (indirect) so that 1163 * we don't have to write the data twice. 1164 */ 1165 if (buf != NULL) { /* immediate write */ 1166 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, 1167 offset, size, RL_READER); 1168 /* test for truncation needs to be done while range locked */ 1169 if (offset >= zp->z_size) { 1170 error = SET_ERROR(ENOENT); 1171 } else { 1172 error = dmu_read(os, object, offset, size, buf, 1173 DMU_READ_NO_PREFETCH); 1174 } 1175 ASSERT(error == 0 || error == ENOENT); 1176 } else { /* indirect write */ 1177 /* 1178 * Have to lock the whole block to ensure when it's 1179 * written out and its checksum is being calculated 1180 * that no one can change the data. We need to re-check 1181 * blocksize after we get the lock in case it's changed! 1182 */ 1183 for (;;) { 1184 uint64_t blkoff; 1185 size = zp->z_blksz; 1186 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; 1187 offset -= blkoff; 1188 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, 1189 offset, size, RL_READER); 1190 if (zp->z_blksz == size) 1191 break; 1192 offset += blkoff; 1193 rangelock_exit(zgd->zgd_lr); 1194 } 1195 /* test for truncation needs to be done while range locked */ 1196 if (lr->lr_offset >= zp->z_size) 1197 error = SET_ERROR(ENOENT); 1198 #ifdef DEBUG 1199 if (zil_fault_io) { 1200 error = SET_ERROR(EIO); 1201 zil_fault_io = 0; 1202 } 1203 #endif 1204 if (error == 0) 1205 error = dmu_buf_hold(os, object, offset, zgd, &db, 1206 DMU_READ_NO_PREFETCH); 1207 1208 if (error == 0) { 1209 blkptr_t *bp = &lr->lr_blkptr; 1210 1211 zgd->zgd_db = db; 1212 zgd->zgd_bp = bp; 1213 1214 ASSERT(db->db_offset == offset); 1215 ASSERT(db->db_size == size); 1216 1217 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1218 zfs_get_done, zgd); 1219 ASSERT(error || lr->lr_length <= size); 1220 1221 /* 1222 * On success, we need to wait for the write I/O 1223 * initiated by dmu_sync() to complete before we can 1224 * release this dbuf. We will finish everything up 1225 * in the zfs_get_done() callback. 1226 */ 1227 if (error == 0) 1228 return (0); 1229 1230 if (error == EALREADY) { 1231 lr->lr_common.lrc_txtype = TX_WRITE2; 1232 /* 1233 * TX_WRITE2 relies on the data previously 1234 * written by the TX_WRITE that caused 1235 * EALREADY. We zero out the BP because 1236 * it is the old, currently-on-disk BP. 1237 */ 1238 zgd->zgd_bp = NULL; 1239 BP_ZERO(bp); 1240 error = 0; 1241 } 1242 } 1243 } 1244 1245 zfs_get_done(zgd, error); 1246 1247 return (error); 1248 } 1249 1250 /*ARGSUSED*/ 1251 static int 1252 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1253 caller_context_t *ct) 1254 { 1255 znode_t *zp = VTOZ(vp); 1256 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1257 int error; 1258 1259 ZFS_ENTER(zfsvfs); 1260 ZFS_VERIFY_ZP(zp); 1261 1262 if (flag & V_ACE_MASK) 1263 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1264 else 1265 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1266 1267 ZFS_EXIT(zfsvfs); 1268 return (error); 1269 } 1270 1271 /* 1272 * If vnode is for a device return a specfs vnode instead. 1273 */ 1274 static int 1275 specvp_check(vnode_t **vpp, cred_t *cr) 1276 { 1277 int error = 0; 1278 1279 if (IS_DEVVP(*vpp)) { 1280 struct vnode *svp; 1281 1282 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1283 VN_RELE(*vpp); 1284 if (svp == NULL) 1285 error = SET_ERROR(ENOSYS); 1286 *vpp = svp; 1287 } 1288 return (error); 1289 } 1290 1291 1292 /* 1293 * Lookup an entry in a directory, or an extended attribute directory. 1294 * If it exists, return a held vnode reference for it. 1295 * 1296 * IN: dvp - vnode of directory to search. 1297 * nm - name of entry to lookup. 1298 * pnp - full pathname to lookup [UNUSED]. 1299 * flags - LOOKUP_XATTR set if looking for an attribute. 1300 * rdir - root directory vnode [UNUSED]. 1301 * cr - credentials of caller. 1302 * ct - caller context 1303 * direntflags - directory lookup flags 1304 * realpnp - returned pathname. 1305 * 1306 * OUT: vpp - vnode of located entry, NULL if not found. 1307 * 1308 * RETURN: 0 on success, error code on failure. 1309 * 1310 * Timestamps: 1311 * NA 1312 */ 1313 /* ARGSUSED */ 1314 static int 1315 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1316 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1317 int *direntflags, pathname_t *realpnp) 1318 { 1319 znode_t *zdp = VTOZ(dvp); 1320 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1321 int error = 0; 1322 1323 /* 1324 * Fast path lookup, however we must skip DNLC lookup 1325 * for case folding or normalizing lookups because the 1326 * DNLC code only stores the passed in name. This means 1327 * creating 'a' and removing 'A' on a case insensitive 1328 * file system would work, but DNLC still thinks 'a' 1329 * exists and won't let you create it again on the next 1330 * pass through fast path. 1331 */ 1332 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 1333 1334 if (dvp->v_type != VDIR) { 1335 return (SET_ERROR(ENOTDIR)); 1336 } else if (zdp->z_sa_hdl == NULL) { 1337 return (SET_ERROR(EIO)); 1338 } 1339 1340 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 1341 error = zfs_fastaccesschk_execute(zdp, cr); 1342 if (!error) { 1343 *vpp = dvp; 1344 VN_HOLD(*vpp); 1345 return (0); 1346 } 1347 return (error); 1348 } else if (!zdp->z_zfsvfs->z_norm && 1349 (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) { 1350 1351 vnode_t *tvp = dnlc_lookup(dvp, nm); 1352 1353 if (tvp) { 1354 error = zfs_fastaccesschk_execute(zdp, cr); 1355 if (error) { 1356 VN_RELE(tvp); 1357 return (error); 1358 } 1359 if (tvp == DNLC_NO_VNODE) { 1360 VN_RELE(tvp); 1361 return (SET_ERROR(ENOENT)); 1362 } else { 1363 *vpp = tvp; 1364 return (specvp_check(vpp, cr)); 1365 } 1366 } 1367 } 1368 } 1369 1370 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1371 1372 ZFS_ENTER(zfsvfs); 1373 ZFS_VERIFY_ZP(zdp); 1374 1375 *vpp = NULL; 1376 1377 if (flags & LOOKUP_XATTR) { 1378 /* 1379 * If the xattr property is off, refuse the lookup request. 1380 */ 1381 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1382 ZFS_EXIT(zfsvfs); 1383 return (SET_ERROR(EINVAL)); 1384 } 1385 1386 /* 1387 * We don't allow recursive attributes.. 1388 * Maybe someday we will. 1389 */ 1390 if (zdp->z_pflags & ZFS_XATTR) { 1391 ZFS_EXIT(zfsvfs); 1392 return (SET_ERROR(EINVAL)); 1393 } 1394 1395 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1396 ZFS_EXIT(zfsvfs); 1397 return (error); 1398 } 1399 1400 /* 1401 * Do we have permission to get into attribute directory? 1402 */ 1403 1404 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1405 B_FALSE, cr)) { 1406 VN_RELE(*vpp); 1407 *vpp = NULL; 1408 } 1409 1410 ZFS_EXIT(zfsvfs); 1411 return (error); 1412 } 1413 1414 if (dvp->v_type != VDIR) { 1415 ZFS_EXIT(zfsvfs); 1416 return (SET_ERROR(ENOTDIR)); 1417 } 1418 1419 /* 1420 * Check accessibility of directory. 1421 */ 1422 1423 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1424 ZFS_EXIT(zfsvfs); 1425 return (error); 1426 } 1427 1428 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1429 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1430 ZFS_EXIT(zfsvfs); 1431 return (SET_ERROR(EILSEQ)); 1432 } 1433 1434 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1435 if (error == 0) 1436 error = specvp_check(vpp, cr); 1437 1438 ZFS_EXIT(zfsvfs); 1439 return (error); 1440 } 1441 1442 /* 1443 * Attempt to create a new entry in a directory. If the entry 1444 * already exists, truncate the file if permissible, else return 1445 * an error. Return the vp of the created or trunc'd file. 1446 * 1447 * IN: dvp - vnode of directory to put new file entry in. 1448 * name - name of new file entry. 1449 * vap - attributes of new file. 1450 * excl - flag indicating exclusive or non-exclusive mode. 1451 * mode - mode to open file with. 1452 * cr - credentials of caller. 1453 * flag - large file flag [UNUSED]. 1454 * ct - caller context 1455 * vsecp - ACL to be set 1456 * 1457 * OUT: vpp - vnode of created or trunc'd entry. 1458 * 1459 * RETURN: 0 on success, error code on failure. 1460 * 1461 * Timestamps: 1462 * dvp - ctime|mtime updated if new entry created 1463 * vp - ctime|mtime always, atime if new 1464 */ 1465 1466 /* ARGSUSED */ 1467 static int 1468 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1469 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 1470 vsecattr_t *vsecp) 1471 { 1472 znode_t *zp, *dzp = VTOZ(dvp); 1473 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1474 zilog_t *zilog; 1475 objset_t *os; 1476 zfs_dirlock_t *dl; 1477 dmu_tx_t *tx; 1478 int error; 1479 ksid_t *ksid; 1480 uid_t uid; 1481 gid_t gid = crgetgid(cr); 1482 zfs_acl_ids_t acl_ids; 1483 boolean_t fuid_dirtied; 1484 boolean_t have_acl = B_FALSE; 1485 boolean_t waited = B_FALSE; 1486 1487 /* 1488 * If we have an ephemeral id, ACL, or XVATTR then 1489 * make sure file system is at proper version 1490 */ 1491 1492 ksid = crgetsid(cr, KSID_OWNER); 1493 if (ksid) 1494 uid = ksid_getid(ksid); 1495 else 1496 uid = crgetuid(cr); 1497 1498 if (zfsvfs->z_use_fuids == B_FALSE && 1499 (vsecp || (vap->va_mask & AT_XVATTR) || 1500 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1501 return (SET_ERROR(EINVAL)); 1502 1503 ZFS_ENTER(zfsvfs); 1504 ZFS_VERIFY_ZP(dzp); 1505 os = zfsvfs->z_os; 1506 zilog = zfsvfs->z_log; 1507 1508 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1509 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1510 ZFS_EXIT(zfsvfs); 1511 return (SET_ERROR(EILSEQ)); 1512 } 1513 1514 if (vap->va_mask & AT_XVATTR) { 1515 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1516 crgetuid(cr), cr, vap->va_type)) != 0) { 1517 ZFS_EXIT(zfsvfs); 1518 return (error); 1519 } 1520 } 1521 top: 1522 *vpp = NULL; 1523 1524 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) 1525 vap->va_mode &= ~VSVTX; 1526 1527 if (*name == '\0') { 1528 /* 1529 * Null component name refers to the directory itself. 1530 */ 1531 VN_HOLD(dvp); 1532 zp = dzp; 1533 dl = NULL; 1534 error = 0; 1535 } else { 1536 /* possible VN_HOLD(zp) */ 1537 int zflg = 0; 1538 1539 if (flag & FIGNORECASE) 1540 zflg |= ZCILOOK; 1541 1542 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1543 NULL, NULL); 1544 if (error) { 1545 if (have_acl) 1546 zfs_acl_ids_free(&acl_ids); 1547 if (strcmp(name, "..") == 0) 1548 error = SET_ERROR(EISDIR); 1549 ZFS_EXIT(zfsvfs); 1550 return (error); 1551 } 1552 } 1553 1554 if (zp == NULL) { 1555 uint64_t txtype; 1556 uint64_t projid = ZFS_DEFAULT_PROJID; 1557 1558 /* 1559 * Create a new file object and update the directory 1560 * to reference it. 1561 */ 1562 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1563 if (have_acl) 1564 zfs_acl_ids_free(&acl_ids); 1565 goto out; 1566 } 1567 1568 /* 1569 * We only support the creation of regular files in 1570 * extended attribute directories. 1571 */ 1572 1573 if ((dzp->z_pflags & ZFS_XATTR) && 1574 (vap->va_type != VREG)) { 1575 if (have_acl) 1576 zfs_acl_ids_free(&acl_ids); 1577 error = SET_ERROR(EINVAL); 1578 goto out; 1579 } 1580 1581 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 1582 cr, vsecp, &acl_ids)) != 0) 1583 goto out; 1584 have_acl = B_TRUE; 1585 1586 if (vap->va_type == VREG || vap->va_type == VDIR) 1587 projid = zfs_inherit_projid(dzp); 1588 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 1589 zfs_acl_ids_free(&acl_ids); 1590 error = SET_ERROR(EDQUOT); 1591 goto out; 1592 } 1593 1594 tx = dmu_tx_create(os); 1595 1596 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1597 ZFS_SA_BASE_ATTR_SIZE); 1598 1599 fuid_dirtied = zfsvfs->z_fuid_dirty; 1600 if (fuid_dirtied) 1601 zfs_fuid_txhold(zfsvfs, tx); 1602 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1603 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 1604 if (!zfsvfs->z_use_sa && 1605 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1606 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1607 0, acl_ids.z_aclp->z_acl_bytes); 1608 } 1609 error = dmu_tx_assign(tx, 1610 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1611 if (error) { 1612 zfs_dirent_unlock(dl); 1613 if (error == ERESTART) { 1614 waited = B_TRUE; 1615 dmu_tx_wait(tx); 1616 dmu_tx_abort(tx); 1617 goto top; 1618 } 1619 zfs_acl_ids_free(&acl_ids); 1620 dmu_tx_abort(tx); 1621 ZFS_EXIT(zfsvfs); 1622 return (error); 1623 } 1624 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1625 1626 if (fuid_dirtied) 1627 zfs_fuid_sync(zfsvfs, tx); 1628 1629 (void) zfs_link_create(dl, zp, tx, ZNEW); 1630 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1631 if (flag & FIGNORECASE) 1632 txtype |= TX_CI; 1633 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1634 vsecp, acl_ids.z_fuidp, vap); 1635 zfs_acl_ids_free(&acl_ids); 1636 dmu_tx_commit(tx); 1637 } else { 1638 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1639 1640 if (have_acl) 1641 zfs_acl_ids_free(&acl_ids); 1642 have_acl = B_FALSE; 1643 1644 /* 1645 * A directory entry already exists for this name. 1646 */ 1647 /* 1648 * Can't truncate an existing file if in exclusive mode. 1649 */ 1650 if (excl == EXCL) { 1651 error = SET_ERROR(EEXIST); 1652 goto out; 1653 } 1654 /* 1655 * Can't open a directory for writing. 1656 */ 1657 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1658 error = SET_ERROR(EISDIR); 1659 goto out; 1660 } 1661 /* 1662 * Verify requested access to file. 1663 */ 1664 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1665 goto out; 1666 } 1667 1668 mutex_enter(&dzp->z_lock); 1669 dzp->z_seq++; 1670 mutex_exit(&dzp->z_lock); 1671 1672 /* 1673 * Truncate regular files if requested. 1674 */ 1675 if ((ZTOV(zp)->v_type == VREG) && 1676 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1677 /* we can't hold any locks when calling zfs_freesp() */ 1678 zfs_dirent_unlock(dl); 1679 dl = NULL; 1680 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1681 if (error == 0) { 1682 vnevent_create(ZTOV(zp), ct); 1683 } 1684 } 1685 } 1686 out: 1687 1688 if (dl) 1689 zfs_dirent_unlock(dl); 1690 1691 if (error) { 1692 if (zp) 1693 VN_RELE(ZTOV(zp)); 1694 } else { 1695 *vpp = ZTOV(zp); 1696 error = specvp_check(vpp, cr); 1697 } 1698 1699 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1700 zil_commit(zilog, 0); 1701 1702 ZFS_EXIT(zfsvfs); 1703 return (error); 1704 } 1705 1706 /* 1707 * Remove an entry from a directory. 1708 * 1709 * IN: dvp - vnode of directory to remove entry from. 1710 * name - name of entry to remove. 1711 * cr - credentials of caller. 1712 * ct - caller context 1713 * flags - case flags 1714 * 1715 * RETURN: 0 on success, error code on failure. 1716 * 1717 * Timestamps: 1718 * dvp - ctime|mtime 1719 * vp - ctime (if nlink > 0) 1720 */ 1721 1722 uint64_t null_xattr = 0; 1723 1724 /*ARGSUSED*/ 1725 static int 1726 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1727 int flags) 1728 { 1729 znode_t *zp, *dzp = VTOZ(dvp); 1730 znode_t *xzp; 1731 vnode_t *vp; 1732 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1733 zilog_t *zilog; 1734 uint64_t acl_obj, xattr_obj; 1735 uint64_t xattr_obj_unlinked = 0; 1736 uint64_t obj = 0; 1737 zfs_dirlock_t *dl; 1738 dmu_tx_t *tx; 1739 boolean_t may_delete_now, delete_now = FALSE; 1740 boolean_t unlinked, toobig = FALSE; 1741 uint64_t txtype; 1742 pathname_t *realnmp = NULL; 1743 pathname_t realnm; 1744 int error; 1745 int zflg = ZEXISTS; 1746 boolean_t waited = B_FALSE; 1747 1748 ZFS_ENTER(zfsvfs); 1749 ZFS_VERIFY_ZP(dzp); 1750 zilog = zfsvfs->z_log; 1751 1752 if (flags & FIGNORECASE) { 1753 zflg |= ZCILOOK; 1754 pn_alloc(&realnm); 1755 realnmp = &realnm; 1756 } 1757 1758 top: 1759 xattr_obj = 0; 1760 xzp = NULL; 1761 /* 1762 * Attempt to lock directory; fail if entry doesn't exist. 1763 */ 1764 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1765 NULL, realnmp)) { 1766 if (realnmp) 1767 pn_free(realnmp); 1768 ZFS_EXIT(zfsvfs); 1769 return (error); 1770 } 1771 1772 vp = ZTOV(zp); 1773 1774 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1775 goto out; 1776 } 1777 1778 /* 1779 * Need to use rmdir for removing directories. 1780 */ 1781 if (vp->v_type == VDIR) { 1782 error = SET_ERROR(EPERM); 1783 goto out; 1784 } 1785 1786 vnevent_remove(vp, dvp, name, ct); 1787 1788 if (realnmp) 1789 dnlc_remove(dvp, realnmp->pn_buf); 1790 else 1791 dnlc_remove(dvp, name); 1792 1793 mutex_enter(&vp->v_lock); 1794 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); 1795 mutex_exit(&vp->v_lock); 1796 1797 /* 1798 * We may delete the znode now, or we may put it in the unlinked set; 1799 * it depends on whether we're the last link, and on whether there are 1800 * other holds on the vnode. So we dmu_tx_hold() the right things to 1801 * allow for either case. 1802 */ 1803 obj = zp->z_id; 1804 tx = dmu_tx_create(zfsvfs->z_os); 1805 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1806 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1807 zfs_sa_upgrade_txholds(tx, zp); 1808 zfs_sa_upgrade_txholds(tx, dzp); 1809 if (may_delete_now) { 1810 toobig = 1811 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 1812 /* if the file is too big, only hold_free a token amount */ 1813 dmu_tx_hold_free(tx, zp->z_id, 0, 1814 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1815 } 1816 1817 /* are there any extended attributes? */ 1818 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1819 &xattr_obj, sizeof (xattr_obj)); 1820 if (error == 0 && xattr_obj) { 1821 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1822 ASSERT0(error); 1823 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1824 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1825 } 1826 1827 mutex_enter(&zp->z_lock); 1828 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1829 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1830 mutex_exit(&zp->z_lock); 1831 1832 /* charge as an update -- would be nice not to charge at all */ 1833 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1834 1835 /* 1836 * Mark this transaction as typically resulting in a net free of space 1837 */ 1838 dmu_tx_mark_netfree(tx); 1839 1840 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 1841 if (error) { 1842 zfs_dirent_unlock(dl); 1843 VN_RELE(vp); 1844 if (xzp) 1845 VN_RELE(ZTOV(xzp)); 1846 if (error == ERESTART) { 1847 waited = B_TRUE; 1848 dmu_tx_wait(tx); 1849 dmu_tx_abort(tx); 1850 goto top; 1851 } 1852 if (realnmp) 1853 pn_free(realnmp); 1854 dmu_tx_abort(tx); 1855 ZFS_EXIT(zfsvfs); 1856 return (error); 1857 } 1858 1859 /* 1860 * Remove the directory entry. 1861 */ 1862 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1863 1864 if (error) { 1865 dmu_tx_commit(tx); 1866 goto out; 1867 } 1868 1869 if (unlinked) { 1870 /* 1871 * Hold z_lock so that we can make sure that the ACL obj 1872 * hasn't changed. Could have been deleted due to 1873 * zfs_sa_upgrade(). 1874 */ 1875 mutex_enter(&zp->z_lock); 1876 mutex_enter(&vp->v_lock); 1877 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1878 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1879 delete_now = may_delete_now && !toobig && 1880 vp->v_count == 1 && !vn_has_cached_data(vp) && 1881 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == 1882 acl_obj; 1883 mutex_exit(&vp->v_lock); 1884 } 1885 1886 if (delete_now) { 1887 if (xattr_obj_unlinked) { 1888 ASSERT3U(xzp->z_links, ==, 2); 1889 mutex_enter(&xzp->z_lock); 1890 xzp->z_unlinked = 1; 1891 xzp->z_links = 0; 1892 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1893 &xzp->z_links, sizeof (xzp->z_links), tx); 1894 ASSERT3U(error, ==, 0); 1895 mutex_exit(&xzp->z_lock); 1896 zfs_unlinked_add(xzp, tx); 1897 1898 if (zp->z_is_sa) 1899 error = sa_remove(zp->z_sa_hdl, 1900 SA_ZPL_XATTR(zfsvfs), tx); 1901 else 1902 error = sa_update(zp->z_sa_hdl, 1903 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1904 sizeof (uint64_t), tx); 1905 ASSERT0(error); 1906 } 1907 mutex_enter(&vp->v_lock); 1908 VN_RELE_LOCKED(vp); 1909 ASSERT0(vp->v_count); 1910 mutex_exit(&vp->v_lock); 1911 mutex_exit(&zp->z_lock); 1912 zfs_znode_delete(zp, tx); 1913 } else if (unlinked) { 1914 mutex_exit(&zp->z_lock); 1915 zfs_unlinked_add(zp, tx); 1916 } 1917 1918 txtype = TX_REMOVE; 1919 if (flags & FIGNORECASE) 1920 txtype |= TX_CI; 1921 zfs_log_remove(zilog, tx, txtype, dzp, name, obj); 1922 1923 dmu_tx_commit(tx); 1924 out: 1925 if (realnmp) 1926 pn_free(realnmp); 1927 1928 zfs_dirent_unlock(dl); 1929 1930 if (!delete_now) 1931 VN_RELE(vp); 1932 if (xzp) 1933 VN_RELE(ZTOV(xzp)); 1934 1935 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1936 zil_commit(zilog, 0); 1937 1938 ZFS_EXIT(zfsvfs); 1939 return (error); 1940 } 1941 1942 /* 1943 * Create a new directory and insert it into dvp using the name 1944 * provided. Return a pointer to the inserted directory. 1945 * 1946 * IN: dvp - vnode of directory to add subdir to. 1947 * dirname - name of new directory. 1948 * vap - attributes of new directory. 1949 * cr - credentials of caller. 1950 * ct - caller context 1951 * flags - case flags 1952 * vsecp - ACL to be set 1953 * 1954 * OUT: vpp - vnode of created directory. 1955 * 1956 * RETURN: 0 on success, error code on failure. 1957 * 1958 * Timestamps: 1959 * dvp - ctime|mtime updated 1960 * vp - ctime|mtime|atime updated 1961 */ 1962 /*ARGSUSED*/ 1963 static int 1964 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1965 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1966 { 1967 znode_t *zp, *dzp = VTOZ(dvp); 1968 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1969 zilog_t *zilog; 1970 zfs_dirlock_t *dl; 1971 uint64_t txtype; 1972 dmu_tx_t *tx; 1973 int error; 1974 int zf = ZNEW; 1975 ksid_t *ksid; 1976 uid_t uid; 1977 gid_t gid = crgetgid(cr); 1978 zfs_acl_ids_t acl_ids; 1979 boolean_t fuid_dirtied; 1980 boolean_t waited = B_FALSE; 1981 1982 ASSERT(vap->va_type == VDIR); 1983 1984 /* 1985 * If we have an ephemeral id, ACL, or XVATTR then 1986 * make sure file system is at proper version 1987 */ 1988 1989 ksid = crgetsid(cr, KSID_OWNER); 1990 if (ksid) 1991 uid = ksid_getid(ksid); 1992 else 1993 uid = crgetuid(cr); 1994 if (zfsvfs->z_use_fuids == B_FALSE && 1995 (vsecp || (vap->va_mask & AT_XVATTR) || 1996 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1997 return (SET_ERROR(EINVAL)); 1998 1999 ZFS_ENTER(zfsvfs); 2000 ZFS_VERIFY_ZP(dzp); 2001 zilog = zfsvfs->z_log; 2002 2003 if (dzp->z_pflags & ZFS_XATTR) { 2004 ZFS_EXIT(zfsvfs); 2005 return (SET_ERROR(EINVAL)); 2006 } 2007 2008 if (zfsvfs->z_utf8 && u8_validate(dirname, 2009 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2010 ZFS_EXIT(zfsvfs); 2011 return (SET_ERROR(EILSEQ)); 2012 } 2013 if (flags & FIGNORECASE) 2014 zf |= ZCILOOK; 2015 2016 if (vap->va_mask & AT_XVATTR) { 2017 if ((error = secpolicy_xvattr((xvattr_t *)vap, 2018 crgetuid(cr), cr, vap->va_type)) != 0) { 2019 ZFS_EXIT(zfsvfs); 2020 return (error); 2021 } 2022 } 2023 2024 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 2025 vsecp, &acl_ids)) != 0) { 2026 ZFS_EXIT(zfsvfs); 2027 return (error); 2028 } 2029 /* 2030 * First make sure the new directory doesn't exist. 2031 * 2032 * Existence is checked first to make sure we don't return 2033 * EACCES instead of EEXIST which can cause some applications 2034 * to fail. 2035 */ 2036 top: 2037 *vpp = NULL; 2038 2039 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 2040 NULL, NULL)) { 2041 zfs_acl_ids_free(&acl_ids); 2042 ZFS_EXIT(zfsvfs); 2043 return (error); 2044 } 2045 2046 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 2047 zfs_acl_ids_free(&acl_ids); 2048 zfs_dirent_unlock(dl); 2049 ZFS_EXIT(zfsvfs); 2050 return (error); 2051 } 2052 2053 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 2054 zfs_acl_ids_free(&acl_ids); 2055 zfs_dirent_unlock(dl); 2056 ZFS_EXIT(zfsvfs); 2057 return (SET_ERROR(EDQUOT)); 2058 } 2059 2060 /* 2061 * Add a new entry to the directory. 2062 */ 2063 tx = dmu_tx_create(zfsvfs->z_os); 2064 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 2065 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2066 fuid_dirtied = zfsvfs->z_fuid_dirty; 2067 if (fuid_dirtied) 2068 zfs_fuid_txhold(zfsvfs, tx); 2069 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2070 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2071 acl_ids.z_aclp->z_acl_bytes); 2072 } 2073 2074 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2075 ZFS_SA_BASE_ATTR_SIZE); 2076 2077 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2078 if (error) { 2079 zfs_dirent_unlock(dl); 2080 if (error == ERESTART) { 2081 waited = B_TRUE; 2082 dmu_tx_wait(tx); 2083 dmu_tx_abort(tx); 2084 goto top; 2085 } 2086 zfs_acl_ids_free(&acl_ids); 2087 dmu_tx_abort(tx); 2088 ZFS_EXIT(zfsvfs); 2089 return (error); 2090 } 2091 2092 /* 2093 * Create new node. 2094 */ 2095 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2096 2097 if (fuid_dirtied) 2098 zfs_fuid_sync(zfsvfs, tx); 2099 2100 /* 2101 * Now put new name in parent dir. 2102 */ 2103 (void) zfs_link_create(dl, zp, tx, ZNEW); 2104 2105 *vpp = ZTOV(zp); 2106 2107 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 2108 if (flags & FIGNORECASE) 2109 txtype |= TX_CI; 2110 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 2111 acl_ids.z_fuidp, vap); 2112 2113 zfs_acl_ids_free(&acl_ids); 2114 2115 dmu_tx_commit(tx); 2116 2117 zfs_dirent_unlock(dl); 2118 2119 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2120 zil_commit(zilog, 0); 2121 2122 ZFS_EXIT(zfsvfs); 2123 return (0); 2124 } 2125 2126 /* 2127 * Remove a directory subdir entry. If the current working 2128 * directory is the same as the subdir to be removed, the 2129 * remove will fail. 2130 * 2131 * IN: dvp - vnode of directory to remove from. 2132 * name - name of directory to be removed. 2133 * cwd - vnode of current working directory. 2134 * cr - credentials of caller. 2135 * ct - caller context 2136 * flags - case flags 2137 * 2138 * RETURN: 0 on success, error code on failure. 2139 * 2140 * Timestamps: 2141 * dvp - ctime|mtime updated 2142 */ 2143 /*ARGSUSED*/ 2144 static int 2145 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 2146 caller_context_t *ct, int flags) 2147 { 2148 znode_t *dzp = VTOZ(dvp); 2149 znode_t *zp; 2150 vnode_t *vp; 2151 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2152 zilog_t *zilog; 2153 zfs_dirlock_t *dl; 2154 dmu_tx_t *tx; 2155 int error; 2156 int zflg = ZEXISTS; 2157 boolean_t waited = B_FALSE; 2158 2159 ZFS_ENTER(zfsvfs); 2160 ZFS_VERIFY_ZP(dzp); 2161 zilog = zfsvfs->z_log; 2162 2163 if (flags & FIGNORECASE) 2164 zflg |= ZCILOOK; 2165 top: 2166 zp = NULL; 2167 2168 /* 2169 * Attempt to lock directory; fail if entry doesn't exist. 2170 */ 2171 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 2172 NULL, NULL)) { 2173 ZFS_EXIT(zfsvfs); 2174 return (error); 2175 } 2176 2177 vp = ZTOV(zp); 2178 2179 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2180 goto out; 2181 } 2182 2183 if (vp->v_type != VDIR) { 2184 error = SET_ERROR(ENOTDIR); 2185 goto out; 2186 } 2187 2188 if (vp == cwd) { 2189 error = SET_ERROR(EINVAL); 2190 goto out; 2191 } 2192 2193 vnevent_rmdir(vp, dvp, name, ct); 2194 2195 /* 2196 * Grab a lock on the directory to make sure that noone is 2197 * trying to add (or lookup) entries while we are removing it. 2198 */ 2199 rw_enter(&zp->z_name_lock, RW_WRITER); 2200 2201 /* 2202 * Grab a lock on the parent pointer to make sure we play well 2203 * with the treewalk and directory rename code. 2204 */ 2205 rw_enter(&zp->z_parent_lock, RW_WRITER); 2206 2207 tx = dmu_tx_create(zfsvfs->z_os); 2208 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2209 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2210 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2211 zfs_sa_upgrade_txholds(tx, zp); 2212 zfs_sa_upgrade_txholds(tx, dzp); 2213 dmu_tx_mark_netfree(tx); 2214 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 2215 if (error) { 2216 rw_exit(&zp->z_parent_lock); 2217 rw_exit(&zp->z_name_lock); 2218 zfs_dirent_unlock(dl); 2219 VN_RELE(vp); 2220 if (error == ERESTART) { 2221 waited = B_TRUE; 2222 dmu_tx_wait(tx); 2223 dmu_tx_abort(tx); 2224 goto top; 2225 } 2226 dmu_tx_abort(tx); 2227 ZFS_EXIT(zfsvfs); 2228 return (error); 2229 } 2230 2231 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 2232 2233 if (error == 0) { 2234 uint64_t txtype = TX_RMDIR; 2235 if (flags & FIGNORECASE) 2236 txtype |= TX_CI; 2237 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); 2238 } 2239 2240 dmu_tx_commit(tx); 2241 2242 rw_exit(&zp->z_parent_lock); 2243 rw_exit(&zp->z_name_lock); 2244 out: 2245 zfs_dirent_unlock(dl); 2246 2247 VN_RELE(vp); 2248 2249 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2250 zil_commit(zilog, 0); 2251 2252 ZFS_EXIT(zfsvfs); 2253 return (error); 2254 } 2255 2256 /* 2257 * Read as many directory entries as will fit into the provided 2258 * buffer from the given directory cursor position (specified in 2259 * the uio structure). 2260 * 2261 * IN: vp - vnode of directory to read. 2262 * uio - structure supplying read location, range info, 2263 * and return buffer. 2264 * cr - credentials of caller. 2265 * ct - caller context 2266 * flags - case flags 2267 * 2268 * OUT: uio - updated offset and range, buffer filled. 2269 * eofp - set to true if end-of-file detected. 2270 * 2271 * RETURN: 0 on success, error code on failure. 2272 * 2273 * Timestamps: 2274 * vp - atime updated 2275 * 2276 * Note that the low 4 bits of the cookie returned by zap is always zero. 2277 * This allows us to use the low range for "special" directory entries: 2278 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2279 * we use the offset 2 for the '.zfs' directory. 2280 */ 2281 /* ARGSUSED */ 2282 static int 2283 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, 2284 caller_context_t *ct, int flags) 2285 { 2286 znode_t *zp = VTOZ(vp); 2287 iovec_t *iovp; 2288 edirent_t *eodp; 2289 dirent64_t *odp; 2290 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2291 objset_t *os; 2292 caddr_t outbuf; 2293 size_t bufsize; 2294 zap_cursor_t zc; 2295 zap_attribute_t zap; 2296 uint_t bytes_wanted; 2297 uint64_t offset; /* must be unsigned; checks for < 1 */ 2298 uint64_t parent; 2299 int local_eof; 2300 int outcount; 2301 int error; 2302 uint8_t prefetch; 2303 boolean_t check_sysattrs; 2304 2305 ZFS_ENTER(zfsvfs); 2306 ZFS_VERIFY_ZP(zp); 2307 2308 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2309 &parent, sizeof (parent))) != 0) { 2310 ZFS_EXIT(zfsvfs); 2311 return (error); 2312 } 2313 2314 /* 2315 * If we are not given an eof variable, 2316 * use a local one. 2317 */ 2318 if (eofp == NULL) 2319 eofp = &local_eof; 2320 2321 /* 2322 * Check for valid iov_len. 2323 */ 2324 if (uio->uio_iov->iov_len <= 0) { 2325 ZFS_EXIT(zfsvfs); 2326 return (SET_ERROR(EINVAL)); 2327 } 2328 2329 /* 2330 * Quit if directory has been removed (posix) 2331 */ 2332 if ((*eofp = zp->z_unlinked) != 0) { 2333 ZFS_EXIT(zfsvfs); 2334 return (0); 2335 } 2336 2337 error = 0; 2338 os = zfsvfs->z_os; 2339 offset = uio->uio_loffset; 2340 prefetch = zp->z_zn_prefetch; 2341 2342 /* 2343 * Initialize the iterator cursor. 2344 */ 2345 if (offset <= 3) { 2346 /* 2347 * Start iteration from the beginning of the directory. 2348 */ 2349 zap_cursor_init(&zc, os, zp->z_id); 2350 } else { 2351 /* 2352 * The offset is a serialized cursor. 2353 */ 2354 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2355 } 2356 2357 /* 2358 * Get space to change directory entries into fs independent format. 2359 */ 2360 iovp = uio->uio_iov; 2361 bytes_wanted = iovp->iov_len; 2362 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2363 bufsize = bytes_wanted; 2364 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2365 odp = (struct dirent64 *)outbuf; 2366 } else { 2367 bufsize = bytes_wanted; 2368 outbuf = NULL; 2369 odp = (struct dirent64 *)iovp->iov_base; 2370 } 2371 eodp = (struct edirent *)odp; 2372 2373 /* 2374 * If this VFS supports the system attribute view interface; and 2375 * we're looking at an extended attribute directory; and we care 2376 * about normalization conflicts on this vfs; then we must check 2377 * for normalization conflicts with the sysattr name space. 2378 */ 2379 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2380 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2381 (flags & V_RDDIR_ENTFLAGS); 2382 2383 /* 2384 * Transform to file-system independent format 2385 */ 2386 outcount = 0; 2387 while (outcount < bytes_wanted) { 2388 ino64_t objnum; 2389 ushort_t reclen; 2390 off64_t *next = NULL; 2391 2392 /* 2393 * Special case `.', `..', and `.zfs'. 2394 */ 2395 if (offset == 0) { 2396 (void) strcpy(zap.za_name, "."); 2397 zap.za_normalization_conflict = 0; 2398 objnum = zp->z_id; 2399 } else if (offset == 1) { 2400 (void) strcpy(zap.za_name, ".."); 2401 zap.za_normalization_conflict = 0; 2402 objnum = parent; 2403 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2404 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2405 zap.za_normalization_conflict = 0; 2406 objnum = ZFSCTL_INO_ROOT; 2407 } else { 2408 /* 2409 * Grab next entry. 2410 */ 2411 if (error = zap_cursor_retrieve(&zc, &zap)) { 2412 if ((*eofp = (error == ENOENT)) != 0) 2413 break; 2414 else 2415 goto update; 2416 } 2417 2418 if (zap.za_integer_length != 8 || 2419 zap.za_num_integers != 1) { 2420 cmn_err(CE_WARN, "zap_readdir: bad directory " 2421 "entry, obj = %lld, offset = %lld\n", 2422 (u_longlong_t)zp->z_id, 2423 (u_longlong_t)offset); 2424 error = SET_ERROR(ENXIO); 2425 goto update; 2426 } 2427 2428 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2429 /* 2430 * MacOS X can extract the object type here such as: 2431 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2432 */ 2433 2434 if (check_sysattrs && !zap.za_normalization_conflict) { 2435 zap.za_normalization_conflict = 2436 xattr_sysattr_casechk(zap.za_name); 2437 } 2438 } 2439 2440 if (flags & V_RDDIR_ACCFILTER) { 2441 /* 2442 * If we have no access at all, don't include 2443 * this entry in the returned information 2444 */ 2445 znode_t *ezp; 2446 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2447 goto skip_entry; 2448 if (!zfs_has_access(ezp, cr)) { 2449 VN_RELE(ZTOV(ezp)); 2450 goto skip_entry; 2451 } 2452 VN_RELE(ZTOV(ezp)); 2453 } 2454 2455 if (flags & V_RDDIR_ENTFLAGS) 2456 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2457 else 2458 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2459 2460 /* 2461 * Will this entry fit in the buffer? 2462 */ 2463 if (outcount + reclen > bufsize) { 2464 /* 2465 * Did we manage to fit anything in the buffer? 2466 */ 2467 if (!outcount) { 2468 error = SET_ERROR(EINVAL); 2469 goto update; 2470 } 2471 break; 2472 } 2473 if (flags & V_RDDIR_ENTFLAGS) { 2474 /* 2475 * Add extended flag entry: 2476 */ 2477 eodp->ed_ino = objnum; 2478 eodp->ed_reclen = reclen; 2479 /* NOTE: ed_off is the offset for the *next* entry */ 2480 next = &(eodp->ed_off); 2481 eodp->ed_eflags = zap.za_normalization_conflict ? 2482 ED_CASE_CONFLICT : 0; 2483 (void) strncpy(eodp->ed_name, zap.za_name, 2484 EDIRENT_NAMELEN(reclen)); 2485 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2486 } else { 2487 /* 2488 * Add normal entry: 2489 */ 2490 odp->d_ino = objnum; 2491 odp->d_reclen = reclen; 2492 /* NOTE: d_off is the offset for the *next* entry */ 2493 next = &(odp->d_off); 2494 (void) strncpy(odp->d_name, zap.za_name, 2495 DIRENT64_NAMELEN(reclen)); 2496 odp = (dirent64_t *)((intptr_t)odp + reclen); 2497 } 2498 outcount += reclen; 2499 2500 ASSERT(outcount <= bufsize); 2501 2502 /* Prefetch znode */ 2503 if (prefetch) 2504 dmu_prefetch(os, objnum, 0, 0, 0, 2505 ZIO_PRIORITY_SYNC_READ); 2506 2507 skip_entry: 2508 /* 2509 * Move to the next entry, fill in the previous offset. 2510 */ 2511 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2512 zap_cursor_advance(&zc); 2513 offset = zap_cursor_serialize(&zc); 2514 } else { 2515 offset += 1; 2516 } 2517 if (next) 2518 *next = offset; 2519 } 2520 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2521 2522 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2523 iovp->iov_base += outcount; 2524 iovp->iov_len -= outcount; 2525 uio->uio_resid -= outcount; 2526 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2527 /* 2528 * Reset the pointer. 2529 */ 2530 offset = uio->uio_loffset; 2531 } 2532 2533 update: 2534 zap_cursor_fini(&zc); 2535 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2536 kmem_free(outbuf, bufsize); 2537 2538 if (error == ENOENT) 2539 error = 0; 2540 2541 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2542 2543 uio->uio_loffset = offset; 2544 ZFS_EXIT(zfsvfs); 2545 return (error); 2546 } 2547 2548 ulong_t zfs_fsync_sync_cnt = 4; 2549 2550 static int 2551 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2552 { 2553 znode_t *zp = VTOZ(vp); 2554 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2555 2556 /* 2557 * Regardless of whether this is required for standards conformance, 2558 * this is the logical behavior when fsync() is called on a file with 2559 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2560 * going to be pushed out as part of the zil_commit(). 2561 */ 2562 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2563 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2564 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2565 2566 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2567 2568 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2569 ZFS_ENTER(zfsvfs); 2570 ZFS_VERIFY_ZP(zp); 2571 zil_commit(zfsvfs->z_log, zp->z_id); 2572 ZFS_EXIT(zfsvfs); 2573 } 2574 return (0); 2575 } 2576 2577 2578 /* 2579 * Get the requested file attributes and place them in the provided 2580 * vattr structure. 2581 * 2582 * IN: vp - vnode of file. 2583 * vap - va_mask identifies requested attributes. 2584 * If AT_XVATTR set, then optional attrs are requested 2585 * flags - ATTR_NOACLCHECK (CIFS server context) 2586 * cr - credentials of caller. 2587 * ct - caller context 2588 * 2589 * OUT: vap - attribute values. 2590 * 2591 * RETURN: 0 (always succeeds). 2592 */ 2593 /* ARGSUSED */ 2594 static int 2595 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2596 caller_context_t *ct) 2597 { 2598 znode_t *zp = VTOZ(vp); 2599 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2600 int error = 0; 2601 uint64_t links; 2602 uint64_t mtime[2], ctime[2]; 2603 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2604 xoptattr_t *xoap = NULL; 2605 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2606 sa_bulk_attr_t bulk[2]; 2607 int count = 0; 2608 2609 ZFS_ENTER(zfsvfs); 2610 ZFS_VERIFY_ZP(zp); 2611 2612 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2613 2614 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2615 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2616 2617 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2618 ZFS_EXIT(zfsvfs); 2619 return (error); 2620 } 2621 2622 /* 2623 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2624 * Also, if we are the owner don't bother, since owner should 2625 * always be allowed to read basic attributes of file. 2626 */ 2627 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2628 (vap->va_uid != crgetuid(cr))) { 2629 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2630 skipaclchk, cr)) { 2631 ZFS_EXIT(zfsvfs); 2632 return (error); 2633 } 2634 } 2635 2636 /* 2637 * Return all attributes. It's cheaper to provide the answer 2638 * than to determine whether we were asked the question. 2639 */ 2640 2641 mutex_enter(&zp->z_lock); 2642 vap->va_type = vp->v_type; 2643 vap->va_mode = zp->z_mode & MODEMASK; 2644 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2645 vap->va_nodeid = zp->z_id; 2646 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2647 links = zp->z_links + 1; 2648 else 2649 links = zp->z_links; 2650 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2651 vap->va_size = zp->z_size; 2652 vap->va_rdev = vp->v_rdev; 2653 vap->va_seq = zp->z_seq; 2654 2655 /* 2656 * Add in any requested optional attributes and the create time. 2657 * Also set the corresponding bits in the returned attribute bitmap. 2658 */ 2659 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2660 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2661 xoap->xoa_archive = 2662 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2663 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2664 } 2665 2666 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2667 xoap->xoa_readonly = 2668 ((zp->z_pflags & ZFS_READONLY) != 0); 2669 XVA_SET_RTN(xvap, XAT_READONLY); 2670 } 2671 2672 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2673 xoap->xoa_system = 2674 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2675 XVA_SET_RTN(xvap, XAT_SYSTEM); 2676 } 2677 2678 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2679 xoap->xoa_hidden = 2680 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2681 XVA_SET_RTN(xvap, XAT_HIDDEN); 2682 } 2683 2684 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2685 xoap->xoa_nounlink = 2686 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2687 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2688 } 2689 2690 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2691 xoap->xoa_immutable = 2692 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2693 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2694 } 2695 2696 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2697 xoap->xoa_appendonly = 2698 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2699 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2700 } 2701 2702 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2703 xoap->xoa_nodump = 2704 ((zp->z_pflags & ZFS_NODUMP) != 0); 2705 XVA_SET_RTN(xvap, XAT_NODUMP); 2706 } 2707 2708 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2709 xoap->xoa_opaque = 2710 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2711 XVA_SET_RTN(xvap, XAT_OPAQUE); 2712 } 2713 2714 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2715 xoap->xoa_av_quarantined = 2716 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2717 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2718 } 2719 2720 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2721 xoap->xoa_av_modified = 2722 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2723 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2724 } 2725 2726 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2727 vp->v_type == VREG) { 2728 zfs_sa_get_scanstamp(zp, xvap); 2729 } 2730 2731 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2732 uint64_t times[2]; 2733 2734 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 2735 times, sizeof (times)); 2736 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 2737 XVA_SET_RTN(xvap, XAT_CREATETIME); 2738 } 2739 2740 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2741 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2742 XVA_SET_RTN(xvap, XAT_REPARSE); 2743 } 2744 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2745 xoap->xoa_generation = zp->z_gen; 2746 XVA_SET_RTN(xvap, XAT_GEN); 2747 } 2748 2749 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2750 xoap->xoa_offline = 2751 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2752 XVA_SET_RTN(xvap, XAT_OFFLINE); 2753 } 2754 2755 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2756 xoap->xoa_sparse = 2757 ((zp->z_pflags & ZFS_SPARSE) != 0); 2758 XVA_SET_RTN(xvap, XAT_SPARSE); 2759 } 2760 2761 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2762 xoap->xoa_projinherit = 2763 ((zp->z_pflags & ZFS_PROJINHERIT) != 0); 2764 XVA_SET_RTN(xvap, XAT_PROJINHERIT); 2765 } 2766 2767 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 2768 xoap->xoa_projid = zp->z_projid; 2769 XVA_SET_RTN(xvap, XAT_PROJID); 2770 } 2771 } 2772 2773 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2774 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2775 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2776 2777 mutex_exit(&zp->z_lock); 2778 2779 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); 2780 2781 if (zp->z_blksz == 0) { 2782 /* 2783 * Block size hasn't been set; suggest maximal I/O transfers. 2784 */ 2785 vap->va_blksize = zfsvfs->z_max_blksz; 2786 } 2787 2788 ZFS_EXIT(zfsvfs); 2789 return (0); 2790 } 2791 2792 /* 2793 * For the operation of changing file's user/group/project, we need to 2794 * handle not only the main object that is assigned to the file directly, 2795 * but also the ones that are used by the file via hidden xattr directory. 2796 * 2797 * Because the xattr directory may contain many EA entries, it may be 2798 * impossible to change all of them in the same transaction as changing the 2799 * main object's user/group/project attributes. If so, we have to change them 2800 * via other multiple independent transactions one by one. It may be not a good 2801 * solution, but we have no better idea yet. 2802 */ 2803 static int 2804 zfs_setattr_dir(znode_t *dzp) 2805 { 2806 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2807 objset_t *os = zfsvfs->z_os; 2808 zap_cursor_t zc; 2809 zap_attribute_t zap; 2810 zfs_dirlock_t *dl; 2811 znode_t *zp = NULL; 2812 dmu_tx_t *tx = NULL; 2813 sa_bulk_attr_t bulk[4]; 2814 int count; 2815 int err; 2816 2817 zap_cursor_init(&zc, os, dzp->z_id); 2818 while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { 2819 count = 0; 2820 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { 2821 err = ENXIO; 2822 break; 2823 } 2824 2825 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, 2826 ZEXISTS, NULL, NULL); 2827 if (err == ENOENT) 2828 goto next; 2829 if (err) 2830 break; 2831 2832 if (zp->z_uid == dzp->z_uid && 2833 zp->z_gid == dzp->z_gid && 2834 zp->z_projid == dzp->z_projid) 2835 goto next; 2836 2837 tx = dmu_tx_create(os); 2838 if (!(zp->z_pflags & ZFS_PROJID)) 2839 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2840 else 2841 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2842 2843 err = dmu_tx_assign(tx, TXG_WAIT); 2844 if (err) 2845 break; 2846 2847 mutex_enter(&dzp->z_lock); 2848 2849 if (zp->z_uid != dzp->z_uid) { 2850 zp->z_uid = dzp->z_uid; 2851 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2852 &dzp->z_uid, sizeof (dzp->z_uid)); 2853 } 2854 2855 if (zp->z_gid != dzp->z_gid) { 2856 zp->z_gid = dzp->z_gid; 2857 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 2858 &dzp->z_gid, sizeof (dzp->z_gid)); 2859 } 2860 2861 if (zp->z_projid != dzp->z_projid) { 2862 if (!(zp->z_pflags & ZFS_PROJID)) { 2863 zp->z_pflags |= ZFS_PROJID; 2864 SA_ADD_BULK_ATTR(bulk, count, 2865 SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 2866 sizeof (zp->z_pflags)); 2867 } 2868 2869 zp->z_projid = dzp->z_projid; 2870 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), 2871 NULL, &zp->z_projid, sizeof (zp->z_projid)); 2872 } 2873 2874 mutex_exit(&dzp->z_lock); 2875 2876 if (likely(count > 0)) { 2877 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2878 dmu_tx_commit(tx); 2879 } else { 2880 dmu_tx_abort(tx); 2881 } 2882 tx = NULL; 2883 if (err != 0 && err != ENOENT) 2884 break; 2885 2886 next: 2887 if (zp) { 2888 VN_RELE(ZTOV(zp)); 2889 zp = NULL; 2890 zfs_dirent_unlock(dl); 2891 } 2892 zap_cursor_advance(&zc); 2893 } 2894 2895 if (tx) 2896 dmu_tx_abort(tx); 2897 if (zp) { 2898 VN_RELE(ZTOV(zp)); 2899 zfs_dirent_unlock(dl); 2900 } 2901 zap_cursor_fini(&zc); 2902 2903 return (err == ENOENT ? 0 : err); 2904 } 2905 2906 /* 2907 * Set the file attributes to the values contained in the 2908 * vattr structure. 2909 * 2910 * IN: vp - vnode of file to be modified. 2911 * vap - new attribute values. 2912 * If AT_XVATTR set, then optional attrs are being set 2913 * flags - ATTR_UTIME set if non-default time values provided. 2914 * - ATTR_NOACLCHECK (CIFS context only). 2915 * cr - credentials of caller. 2916 * ct - caller context 2917 * 2918 * RETURN: 0 on success, error code on failure. 2919 * 2920 * Timestamps: 2921 * vp - ctime updated, mtime updated if size changed. 2922 */ 2923 /* ARGSUSED */ 2924 static int 2925 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2926 caller_context_t *ct) 2927 { 2928 znode_t *zp = VTOZ(vp); 2929 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2930 objset_t *os = zfsvfs->z_os; 2931 zilog_t *zilog; 2932 dmu_tx_t *tx; 2933 vattr_t oldva; 2934 xvattr_t tmpxvattr; 2935 uint_t mask = vap->va_mask; 2936 uint_t saved_mask = 0; 2937 int trim_mask = 0; 2938 uint64_t new_mode; 2939 uint64_t new_uid, new_gid; 2940 uint64_t xattr_obj; 2941 uint64_t mtime[2], ctime[2]; 2942 uint64_t projid = ZFS_INVALID_PROJID; 2943 znode_t *attrzp; 2944 int need_policy = FALSE; 2945 int err, err2 = 0; 2946 zfs_fuid_info_t *fuidp = NULL; 2947 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2948 xoptattr_t *xoap; 2949 zfs_acl_t *aclp; 2950 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2951 boolean_t fuid_dirtied = B_FALSE; 2952 boolean_t handle_eadir = B_FALSE; 2953 sa_bulk_attr_t bulk[8], xattr_bulk[8]; 2954 int count = 0, xattr_count = 0; 2955 2956 if (mask == 0) 2957 return (0); 2958 2959 if (mask & AT_NOSET) 2960 return (SET_ERROR(EINVAL)); 2961 2962 ZFS_ENTER(zfsvfs); 2963 ZFS_VERIFY_ZP(zp); 2964 2965 /* 2966 * If this is a xvattr_t, then get a pointer to the structure of 2967 * optional attributes. If this is NULL, then we have a vattr_t. 2968 */ 2969 xoap = xva_getxoptattr(xvap); 2970 if (xoap != NULL && (mask & AT_XVATTR)) { 2971 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 2972 if (!dmu_objset_projectquota_enabled(os) || 2973 (vp->v_type != VREG && vp->v_type != VDIR)) { 2974 ZFS_EXIT(zfsvfs); 2975 return (SET_ERROR(ENOTSUP)); 2976 } 2977 2978 projid = xoap->xoa_projid; 2979 if (unlikely(projid == ZFS_INVALID_PROJID)) { 2980 ZFS_EXIT(zfsvfs); 2981 return (SET_ERROR(EINVAL)); 2982 } 2983 2984 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 2985 projid = ZFS_INVALID_PROJID; 2986 else 2987 need_policy = TRUE; 2988 } 2989 2990 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 2991 (!dmu_objset_projectquota_enabled(os) || 2992 (vp->v_type != VREG && vp->v_type != VDIR))) { 2993 ZFS_EXIT(zfsvfs); 2994 return (SET_ERROR(ENOTSUP)); 2995 } 2996 } 2997 2998 zilog = zfsvfs->z_log; 2999 3000 /* 3001 * Make sure that if we have ephemeral uid/gid or xvattr specified 3002 * that file system is at proper version level 3003 */ 3004 3005 if (zfsvfs->z_use_fuids == B_FALSE && 3006 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 3007 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 3008 (mask & AT_XVATTR))) { 3009 ZFS_EXIT(zfsvfs); 3010 return (SET_ERROR(EINVAL)); 3011 } 3012 3013 if (mask & AT_SIZE && vp->v_type == VDIR) { 3014 ZFS_EXIT(zfsvfs); 3015 return (SET_ERROR(EISDIR)); 3016 } 3017 3018 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 3019 ZFS_EXIT(zfsvfs); 3020 return (SET_ERROR(EINVAL)); 3021 } 3022 3023 xva_init(&tmpxvattr); 3024 3025 /* 3026 * Immutable files can only alter immutable bit and atime 3027 */ 3028 if ((zp->z_pflags & ZFS_IMMUTABLE) && 3029 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 3030 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 3031 ZFS_EXIT(zfsvfs); 3032 return (SET_ERROR(EPERM)); 3033 } 3034 3035 /* 3036 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 3037 */ 3038 3039 /* 3040 * Verify timestamps doesn't overflow 32 bits. 3041 * ZFS can handle large timestamps, but 32bit syscalls can't 3042 * handle times greater than 2039. This check should be removed 3043 * once large timestamps are fully supported. 3044 */ 3045 if (mask & (AT_ATIME | AT_MTIME)) { 3046 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 3047 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 3048 ZFS_EXIT(zfsvfs); 3049 return (SET_ERROR(EOVERFLOW)); 3050 } 3051 } 3052 3053 top: 3054 attrzp = NULL; 3055 aclp = NULL; 3056 3057 /* Can this be moved to before the top label? */ 3058 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 3059 ZFS_EXIT(zfsvfs); 3060 return (SET_ERROR(EROFS)); 3061 } 3062 3063 /* 3064 * First validate permissions 3065 */ 3066 3067 if (mask & AT_SIZE) { 3068 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 3069 if (err) { 3070 ZFS_EXIT(zfsvfs); 3071 return (err); 3072 } 3073 /* 3074 * XXX - Note, we are not providing any open 3075 * mode flags here (like FNDELAY), so we may 3076 * block if there are locks present... this 3077 * should be addressed in openat(). 3078 */ 3079 /* XXX - would it be OK to generate a log record here? */ 3080 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 3081 if (err) { 3082 ZFS_EXIT(zfsvfs); 3083 return (err); 3084 } 3085 3086 if (vap->va_size == 0) 3087 vnevent_truncate(ZTOV(zp), ct); 3088 } 3089 3090 if (mask & (AT_ATIME|AT_MTIME) || 3091 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 3092 XVA_ISSET_REQ(xvap, XAT_READONLY) || 3093 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3094 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3095 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3096 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3097 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3098 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3099 skipaclchk, cr); 3100 } 3101 3102 if (mask & (AT_UID|AT_GID)) { 3103 int idmask = (mask & (AT_UID|AT_GID)); 3104 int take_owner; 3105 int take_group; 3106 3107 /* 3108 * NOTE: even if a new mode is being set, 3109 * we may clear S_ISUID/S_ISGID bits. 3110 */ 3111 3112 if (!(mask & AT_MODE)) 3113 vap->va_mode = zp->z_mode; 3114 3115 /* 3116 * Take ownership or chgrp to group we are a member of 3117 */ 3118 3119 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3120 take_group = (mask & AT_GID) && 3121 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3122 3123 /* 3124 * If both AT_UID and AT_GID are set then take_owner and 3125 * take_group must both be set in order to allow taking 3126 * ownership. 3127 * 3128 * Otherwise, send the check through secpolicy_vnode_setattr() 3129 * 3130 */ 3131 3132 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3133 ((idmask == AT_UID) && take_owner) || 3134 ((idmask == AT_GID) && take_group)) { 3135 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3136 skipaclchk, cr) == 0) { 3137 /* 3138 * Remove setuid/setgid for non-privileged users 3139 */ 3140 secpolicy_setid_clear(vap, cr); 3141 trim_mask = (mask & (AT_UID|AT_GID)); 3142 } else { 3143 need_policy = TRUE; 3144 } 3145 } else { 3146 need_policy = TRUE; 3147 } 3148 } 3149 3150 mutex_enter(&zp->z_lock); 3151 oldva.va_mode = zp->z_mode; 3152 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3153 if (mask & AT_XVATTR) { 3154 /* 3155 * Update xvattr mask to include only those attributes 3156 * that are actually changing. 3157 * 3158 * the bits will be restored prior to actually setting 3159 * the attributes so the caller thinks they were set. 3160 */ 3161 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3162 if (xoap->xoa_appendonly != 3163 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3164 need_policy = TRUE; 3165 } else { 3166 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3167 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3168 } 3169 } 3170 3171 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 3172 if (xoap->xoa_projinherit != 3173 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 3174 need_policy = TRUE; 3175 } else { 3176 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 3177 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); 3178 } 3179 } 3180 3181 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3182 if (xoap->xoa_nounlink != 3183 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3184 need_policy = TRUE; 3185 } else { 3186 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3187 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3188 } 3189 } 3190 3191 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3192 if (xoap->xoa_immutable != 3193 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3194 need_policy = TRUE; 3195 } else { 3196 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3197 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3198 } 3199 } 3200 3201 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3202 if (xoap->xoa_nodump != 3203 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3204 need_policy = TRUE; 3205 } else { 3206 XVA_CLR_REQ(xvap, XAT_NODUMP); 3207 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3208 } 3209 } 3210 3211 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3212 if (xoap->xoa_av_modified != 3213 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3214 need_policy = TRUE; 3215 } else { 3216 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3217 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3218 } 3219 } 3220 3221 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3222 if ((vp->v_type != VREG && 3223 xoap->xoa_av_quarantined) || 3224 xoap->xoa_av_quarantined != 3225 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3226 need_policy = TRUE; 3227 } else { 3228 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3229 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3230 } 3231 } 3232 3233 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3234 mutex_exit(&zp->z_lock); 3235 ZFS_EXIT(zfsvfs); 3236 return (SET_ERROR(EPERM)); 3237 } 3238 3239 if (need_policy == FALSE && 3240 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3241 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3242 need_policy = TRUE; 3243 } 3244 } 3245 3246 mutex_exit(&zp->z_lock); 3247 3248 if (mask & AT_MODE) { 3249 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3250 err = secpolicy_setid_setsticky_clear(vp, vap, 3251 &oldva, cr); 3252 if (err) { 3253 ZFS_EXIT(zfsvfs); 3254 return (err); 3255 } 3256 trim_mask |= AT_MODE; 3257 } else { 3258 need_policy = TRUE; 3259 } 3260 } 3261 3262 if (need_policy) { 3263 /* 3264 * If trim_mask is set then take ownership 3265 * has been granted or write_acl is present and user 3266 * has the ability to modify mode. In that case remove 3267 * UID|GID and or MODE from mask so that 3268 * secpolicy_vnode_setattr() doesn't revoke it. 3269 */ 3270 3271 if (trim_mask) { 3272 saved_mask = vap->va_mask; 3273 vap->va_mask &= ~trim_mask; 3274 } 3275 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3276 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3277 if (err) { 3278 ZFS_EXIT(zfsvfs); 3279 return (err); 3280 } 3281 3282 if (trim_mask) 3283 vap->va_mask |= saved_mask; 3284 } 3285 3286 /* 3287 * secpolicy_vnode_setattr, or take ownership may have 3288 * changed va_mask 3289 */ 3290 mask = vap->va_mask; 3291 3292 if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { 3293 handle_eadir = B_TRUE; 3294 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3295 &xattr_obj, sizeof (xattr_obj)); 3296 3297 if (err == 0 && xattr_obj) { 3298 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3299 if (err) 3300 goto out2; 3301 } 3302 if (mask & AT_UID) { 3303 new_uid = zfs_fuid_create(zfsvfs, 3304 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3305 if (new_uid != zp->z_uid && 3306 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 3307 new_uid)) { 3308 if (attrzp) 3309 VN_RELE(ZTOV(attrzp)); 3310 err = SET_ERROR(EDQUOT); 3311 goto out2; 3312 } 3313 } 3314 3315 if (mask & AT_GID) { 3316 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3317 cr, ZFS_GROUP, &fuidp); 3318 if (new_gid != zp->z_gid && 3319 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3320 new_gid)) { 3321 if (attrzp) 3322 VN_RELE(ZTOV(attrzp)); 3323 err = SET_ERROR(EDQUOT); 3324 goto out2; 3325 } 3326 } 3327 3328 if (projid != ZFS_INVALID_PROJID && 3329 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 3330 if (attrzp) 3331 VN_RELE(ZTOV(attrzp)); 3332 err = EDQUOT; 3333 goto out2; 3334 } 3335 } 3336 tx = dmu_tx_create(os); 3337 3338 if (mask & AT_MODE) { 3339 uint64_t pmode = zp->z_mode; 3340 uint64_t acl_obj; 3341 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3342 3343 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3344 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3345 err = SET_ERROR(EPERM); 3346 goto out; 3347 } 3348 3349 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3350 goto out; 3351 3352 mutex_enter(&zp->z_lock); 3353 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3354 /* 3355 * Are we upgrading ACL from old V0 format 3356 * to V1 format? 3357 */ 3358 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3359 zfs_znode_acl_version(zp) == 3360 ZFS_ACL_VERSION_INITIAL) { 3361 dmu_tx_hold_free(tx, acl_obj, 0, 3362 DMU_OBJECT_END); 3363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3364 0, aclp->z_acl_bytes); 3365 } else { 3366 dmu_tx_hold_write(tx, acl_obj, 0, 3367 aclp->z_acl_bytes); 3368 } 3369 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3371 0, aclp->z_acl_bytes); 3372 } 3373 mutex_exit(&zp->z_lock); 3374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3375 } else { 3376 if (((mask & AT_XVATTR) && 3377 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 3378 (projid != ZFS_INVALID_PROJID && 3379 !(zp->z_pflags & ZFS_PROJID))) 3380 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3381 else 3382 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3383 } 3384 3385 if (attrzp) { 3386 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3387 } 3388 3389 fuid_dirtied = zfsvfs->z_fuid_dirty; 3390 if (fuid_dirtied) 3391 zfs_fuid_txhold(zfsvfs, tx); 3392 3393 zfs_sa_upgrade_txholds(tx, zp); 3394 3395 err = dmu_tx_assign(tx, TXG_WAIT); 3396 if (err) 3397 goto out; 3398 3399 count = 0; 3400 /* 3401 * Set each attribute requested. 3402 * We group settings according to the locks they need to acquire. 3403 * 3404 * Note: you cannot set ctime directly, although it will be 3405 * updated as a side-effect of calling this function. 3406 */ 3407 3408 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 3409 /* 3410 * For the existing object that is upgraded from old system, 3411 * its on-disk layout has no slot for the project ID attribute. 3412 * But quota accounting logic needs to access related slots by 3413 * offset directly. So we need to adjust old objects' layout 3414 * to make the project ID to some unified and fixed offset. 3415 */ 3416 if (attrzp) 3417 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 3418 if (err == 0) 3419 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 3420 3421 if (unlikely(err == EEXIST)) 3422 err = 0; 3423 else if (err != 0) 3424 goto out; 3425 else 3426 projid = ZFS_INVALID_PROJID; 3427 } 3428 3429 if (mask & (AT_UID|AT_GID|AT_MODE)) 3430 mutex_enter(&zp->z_acl_lock); 3431 mutex_enter(&zp->z_lock); 3432 3433 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3434 &zp->z_pflags, sizeof (zp->z_pflags)); 3435 3436 if (attrzp) { 3437 if (mask & (AT_UID|AT_GID|AT_MODE)) 3438 mutex_enter(&attrzp->z_acl_lock); 3439 mutex_enter(&attrzp->z_lock); 3440 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3441 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3442 sizeof (attrzp->z_pflags)); 3443 if (projid != ZFS_INVALID_PROJID) { 3444 attrzp->z_projid = projid; 3445 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3446 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 3447 sizeof (attrzp->z_projid)); 3448 } 3449 } 3450 3451 if (mask & (AT_UID|AT_GID)) { 3452 3453 if (mask & AT_UID) { 3454 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3455 &new_uid, sizeof (new_uid)); 3456 zp->z_uid = new_uid; 3457 if (attrzp) { 3458 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3459 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3460 sizeof (new_uid)); 3461 attrzp->z_uid = new_uid; 3462 } 3463 } 3464 3465 if (mask & AT_GID) { 3466 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3467 NULL, &new_gid, sizeof (new_gid)); 3468 zp->z_gid = new_gid; 3469 if (attrzp) { 3470 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3471 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3472 sizeof (new_gid)); 3473 attrzp->z_gid = new_gid; 3474 } 3475 } 3476 if (!(mask & AT_MODE)) { 3477 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3478 NULL, &new_mode, sizeof (new_mode)); 3479 new_mode = zp->z_mode; 3480 } 3481 err = zfs_acl_chown_setattr(zp); 3482 ASSERT(err == 0); 3483 if (attrzp) { 3484 err = zfs_acl_chown_setattr(attrzp); 3485 ASSERT(err == 0); 3486 } 3487 } 3488 3489 if (mask & AT_MODE) { 3490 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3491 &new_mode, sizeof (new_mode)); 3492 zp->z_mode = new_mode; 3493 ASSERT3U((uintptr_t)aclp, !=, NULL); 3494 err = zfs_aclset_common(zp, aclp, cr, tx); 3495 ASSERT0(err); 3496 if (zp->z_acl_cached) 3497 zfs_acl_free(zp->z_acl_cached); 3498 zp->z_acl_cached = aclp; 3499 aclp = NULL; 3500 } 3501 3502 3503 if (mask & AT_ATIME) { 3504 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3505 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3506 &zp->z_atime, sizeof (zp->z_atime)); 3507 } 3508 3509 if (mask & AT_MTIME) { 3510 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3511 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3512 mtime, sizeof (mtime)); 3513 } 3514 3515 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3516 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3517 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3518 NULL, mtime, sizeof (mtime)); 3519 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3520 &ctime, sizeof (ctime)); 3521 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3522 B_TRUE); 3523 } else if (mask != 0) { 3524 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3525 &ctime, sizeof (ctime)); 3526 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3527 B_TRUE); 3528 if (attrzp) { 3529 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3530 SA_ZPL_CTIME(zfsvfs), NULL, 3531 &ctime, sizeof (ctime)); 3532 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3533 mtime, ctime, B_TRUE); 3534 } 3535 } 3536 3537 if (projid != ZFS_INVALID_PROJID) { 3538 zp->z_projid = projid; 3539 SA_ADD_BULK_ATTR(bulk, count, 3540 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 3541 sizeof (zp->z_projid)); 3542 } 3543 3544 /* 3545 * Do this after setting timestamps to prevent timestamp 3546 * update from toggling bit 3547 */ 3548 3549 if (xoap && (mask & AT_XVATTR)) { 3550 3551 /* 3552 * restore trimmed off masks 3553 * so that return masks can be set for caller. 3554 */ 3555 3556 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3557 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3558 } 3559 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3560 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3561 } 3562 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3563 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3564 } 3565 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3566 XVA_SET_REQ(xvap, XAT_NODUMP); 3567 } 3568 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3569 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3570 } 3571 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3572 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3573 } 3574 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { 3575 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 3576 } 3577 3578 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3579 ASSERT(vp->v_type == VREG); 3580 3581 zfs_xvattr_set(zp, xvap, tx); 3582 } 3583 3584 if (fuid_dirtied) 3585 zfs_fuid_sync(zfsvfs, tx); 3586 3587 if (mask != 0) 3588 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3589 3590 mutex_exit(&zp->z_lock); 3591 if (mask & (AT_UID|AT_GID|AT_MODE)) 3592 mutex_exit(&zp->z_acl_lock); 3593 3594 if (attrzp) { 3595 if (mask & (AT_UID|AT_GID|AT_MODE)) 3596 mutex_exit(&attrzp->z_acl_lock); 3597 mutex_exit(&attrzp->z_lock); 3598 } 3599 out: 3600 if (err == 0 && xattr_count > 0) { 3601 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3602 xattr_count, tx); 3603 ASSERT(err2 == 0); 3604 } 3605 3606 if (aclp) 3607 zfs_acl_free(aclp); 3608 3609 if (fuidp) { 3610 zfs_fuid_info_free(fuidp); 3611 fuidp = NULL; 3612 } 3613 3614 if (err) { 3615 dmu_tx_abort(tx); 3616 if (attrzp) 3617 VN_RELE(ZTOV(attrzp)); 3618 if (err == ERESTART) 3619 goto top; 3620 } else { 3621 if (count > 0) 3622 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3623 dmu_tx_commit(tx); 3624 if (attrzp) { 3625 if (err2 == 0 && handle_eadir) 3626 err2 = zfs_setattr_dir(attrzp); 3627 VN_RELE(ZTOV(attrzp)); 3628 } 3629 } 3630 3631 out2: 3632 if (os->os_sync == ZFS_SYNC_ALWAYS) 3633 zil_commit(zilog, 0); 3634 3635 ZFS_EXIT(zfsvfs); 3636 return (err); 3637 } 3638 3639 typedef struct zfs_zlock { 3640 krwlock_t *zl_rwlock; /* lock we acquired */ 3641 znode_t *zl_znode; /* znode we held */ 3642 struct zfs_zlock *zl_next; /* next in list */ 3643 } zfs_zlock_t; 3644 3645 /* 3646 * Drop locks and release vnodes that were held by zfs_rename_lock(). 3647 */ 3648 static void 3649 zfs_rename_unlock(zfs_zlock_t **zlpp) 3650 { 3651 zfs_zlock_t *zl; 3652 3653 while ((zl = *zlpp) != NULL) { 3654 if (zl->zl_znode != NULL) 3655 VN_RELE(ZTOV(zl->zl_znode)); 3656 rw_exit(zl->zl_rwlock); 3657 *zlpp = zl->zl_next; 3658 kmem_free(zl, sizeof (*zl)); 3659 } 3660 } 3661 3662 /* 3663 * Search back through the directory tree, using the ".." entries. 3664 * Lock each directory in the chain to prevent concurrent renames. 3665 * Fail any attempt to move a directory into one of its own descendants. 3666 * XXX - z_parent_lock can overlap with map or grow locks 3667 */ 3668 static int 3669 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3670 { 3671 zfs_zlock_t *zl; 3672 znode_t *zp = tdzp; 3673 uint64_t rootid = zp->z_zfsvfs->z_root; 3674 uint64_t oidp = zp->z_id; 3675 krwlock_t *rwlp = &szp->z_parent_lock; 3676 krw_t rw = RW_WRITER; 3677 3678 /* 3679 * First pass write-locks szp and compares to zp->z_id. 3680 * Later passes read-lock zp and compare to zp->z_parent. 3681 */ 3682 do { 3683 if (!rw_tryenter(rwlp, rw)) { 3684 /* 3685 * Another thread is renaming in this path. 3686 * Note that if we are a WRITER, we don't have any 3687 * parent_locks held yet. 3688 */ 3689 if (rw == RW_READER && zp->z_id > szp->z_id) { 3690 /* 3691 * Drop our locks and restart 3692 */ 3693 zfs_rename_unlock(&zl); 3694 *zlpp = NULL; 3695 zp = tdzp; 3696 oidp = zp->z_id; 3697 rwlp = &szp->z_parent_lock; 3698 rw = RW_WRITER; 3699 continue; 3700 } else { 3701 /* 3702 * Wait for other thread to drop its locks 3703 */ 3704 rw_enter(rwlp, rw); 3705 } 3706 } 3707 3708 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3709 zl->zl_rwlock = rwlp; 3710 zl->zl_znode = NULL; 3711 zl->zl_next = *zlpp; 3712 *zlpp = zl; 3713 3714 if (oidp == szp->z_id) /* We're a descendant of szp */ 3715 return (SET_ERROR(EINVAL)); 3716 3717 if (oidp == rootid) /* We've hit the top */ 3718 return (0); 3719 3720 if (rw == RW_READER) { /* i.e. not the first pass */ 3721 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3722 if (error) 3723 return (error); 3724 zl->zl_znode = zp; 3725 } 3726 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3727 &oidp, sizeof (oidp)); 3728 rwlp = &zp->z_parent_lock; 3729 rw = RW_READER; 3730 3731 } while (zp->z_id != sdzp->z_id); 3732 3733 return (0); 3734 } 3735 3736 /* 3737 * Move an entry from the provided source directory to the target 3738 * directory. Change the entry name as indicated. 3739 * 3740 * IN: sdvp - Source directory containing the "old entry". 3741 * snm - Old entry name. 3742 * tdvp - Target directory to contain the "new entry". 3743 * tnm - New entry name. 3744 * cr - credentials of caller. 3745 * ct - caller context 3746 * flags - case flags 3747 * 3748 * RETURN: 0 on success, error code on failure. 3749 * 3750 * Timestamps: 3751 * sdvp,tdvp - ctime|mtime updated 3752 */ 3753 /*ARGSUSED*/ 3754 static int 3755 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3756 caller_context_t *ct, int flags) 3757 { 3758 znode_t *tdzp, *szp, *tzp; 3759 znode_t *sdzp = VTOZ(sdvp); 3760 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3761 zilog_t *zilog; 3762 vnode_t *realvp; 3763 zfs_dirlock_t *sdl, *tdl; 3764 dmu_tx_t *tx; 3765 zfs_zlock_t *zl; 3766 int cmp, serr, terr; 3767 int error = 0, rm_err = 0; 3768 int zflg = 0; 3769 boolean_t waited = B_FALSE; 3770 3771 ZFS_ENTER(zfsvfs); 3772 ZFS_VERIFY_ZP(sdzp); 3773 zilog = zfsvfs->z_log; 3774 3775 /* 3776 * Make sure we have the real vp for the target directory. 3777 */ 3778 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3779 tdvp = realvp; 3780 3781 tdzp = VTOZ(tdvp); 3782 ZFS_VERIFY_ZP(tdzp); 3783 3784 /* 3785 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3786 * ctldir appear to have the same v_vfsp. 3787 */ 3788 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3789 ZFS_EXIT(zfsvfs); 3790 return (SET_ERROR(EXDEV)); 3791 } 3792 3793 if (zfsvfs->z_utf8 && u8_validate(tnm, 3794 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3795 ZFS_EXIT(zfsvfs); 3796 return (SET_ERROR(EILSEQ)); 3797 } 3798 3799 if (flags & FIGNORECASE) 3800 zflg |= ZCILOOK; 3801 3802 top: 3803 szp = NULL; 3804 tzp = NULL; 3805 zl = NULL; 3806 3807 /* 3808 * This is to prevent the creation of links into attribute space 3809 * by renaming a linked file into/outof an attribute directory. 3810 * See the comment in zfs_link() for why this is considered bad. 3811 */ 3812 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3813 ZFS_EXIT(zfsvfs); 3814 return (SET_ERROR(EINVAL)); 3815 } 3816 3817 /* 3818 * Lock source and target directory entries. To prevent deadlock, 3819 * a lock ordering must be defined. We lock the directory with 3820 * the smallest object id first, or if it's a tie, the one with 3821 * the lexically first name. 3822 */ 3823 if (sdzp->z_id < tdzp->z_id) { 3824 cmp = -1; 3825 } else if (sdzp->z_id > tdzp->z_id) { 3826 cmp = 1; 3827 } else { 3828 /* 3829 * First compare the two name arguments without 3830 * considering any case folding. 3831 */ 3832 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3833 3834 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3835 ASSERT(error == 0 || !zfsvfs->z_utf8); 3836 if (cmp == 0) { 3837 /* 3838 * POSIX: "If the old argument and the new argument 3839 * both refer to links to the same existing file, 3840 * the rename() function shall return successfully 3841 * and perform no other action." 3842 */ 3843 ZFS_EXIT(zfsvfs); 3844 return (0); 3845 } 3846 /* 3847 * If the file system is case-folding, then we may 3848 * have some more checking to do. A case-folding file 3849 * system is either supporting mixed case sensitivity 3850 * access or is completely case-insensitive. Note 3851 * that the file system is always case preserving. 3852 * 3853 * In mixed sensitivity mode case sensitive behavior 3854 * is the default. FIGNORECASE must be used to 3855 * explicitly request case insensitive behavior. 3856 * 3857 * If the source and target names provided differ only 3858 * by case (e.g., a request to rename 'tim' to 'Tim'), 3859 * we will treat this as a special case in the 3860 * case-insensitive mode: as long as the source name 3861 * is an exact match, we will allow this to proceed as 3862 * a name-change request. 3863 */ 3864 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3865 (zfsvfs->z_case == ZFS_CASE_MIXED && 3866 flags & FIGNORECASE)) && 3867 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3868 &error) == 0) { 3869 /* 3870 * case preserving rename request, require exact 3871 * name matches 3872 */ 3873 zflg |= ZCIEXACT; 3874 zflg &= ~ZCILOOK; 3875 } 3876 } 3877 3878 /* 3879 * If the source and destination directories are the same, we should 3880 * grab the z_name_lock of that directory only once. 3881 */ 3882 if (sdzp == tdzp) { 3883 zflg |= ZHAVELOCK; 3884 rw_enter(&sdzp->z_name_lock, RW_READER); 3885 } 3886 3887 if (cmp < 0) { 3888 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3889 ZEXISTS | zflg, NULL, NULL); 3890 terr = zfs_dirent_lock(&tdl, 3891 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3892 } else { 3893 terr = zfs_dirent_lock(&tdl, 3894 tdzp, tnm, &tzp, zflg, NULL, NULL); 3895 serr = zfs_dirent_lock(&sdl, 3896 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3897 NULL, NULL); 3898 } 3899 3900 if (serr) { 3901 /* 3902 * Source entry invalid or not there. 3903 */ 3904 if (!terr) { 3905 zfs_dirent_unlock(tdl); 3906 if (tzp) 3907 VN_RELE(ZTOV(tzp)); 3908 } 3909 3910 if (sdzp == tdzp) 3911 rw_exit(&sdzp->z_name_lock); 3912 3913 if (strcmp(snm, "..") == 0) 3914 serr = SET_ERROR(EINVAL); 3915 ZFS_EXIT(zfsvfs); 3916 return (serr); 3917 } 3918 if (terr) { 3919 zfs_dirent_unlock(sdl); 3920 VN_RELE(ZTOV(szp)); 3921 3922 if (sdzp == tdzp) 3923 rw_exit(&sdzp->z_name_lock); 3924 3925 if (strcmp(tnm, "..") == 0) 3926 terr = SET_ERROR(EINVAL); 3927 ZFS_EXIT(zfsvfs); 3928 return (terr); 3929 } 3930 3931 /* 3932 * If we are using project inheritance, it means if the directory has 3933 * ZFS_PROJINHERIT set, then its descendant directories will inherit 3934 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 3935 * such case, we only allow renames into our tree when the project 3936 * IDs are the same. 3937 */ 3938 if (tdzp->z_pflags & ZFS_PROJINHERIT && 3939 tdzp->z_projid != szp->z_projid) { 3940 error = SET_ERROR(EXDEV); 3941 goto out; 3942 } 3943 3944 /* 3945 * Must have write access at the source to remove the old entry 3946 * and write access at the target to create the new entry. 3947 * Note that if target and source are the same, this can be 3948 * done in a single check. 3949 */ 3950 3951 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3952 goto out; 3953 3954 if (ZTOV(szp)->v_type == VDIR) { 3955 /* 3956 * Check to make sure rename is valid. 3957 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3958 */ 3959 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3960 goto out; 3961 } 3962 3963 /* 3964 * Does target exist? 3965 */ 3966 if (tzp) { 3967 /* 3968 * Source and target must be the same type. 3969 */ 3970 if (ZTOV(szp)->v_type == VDIR) { 3971 if (ZTOV(tzp)->v_type != VDIR) { 3972 error = SET_ERROR(ENOTDIR); 3973 goto out; 3974 } 3975 } else { 3976 if (ZTOV(tzp)->v_type == VDIR) { 3977 error = SET_ERROR(EISDIR); 3978 goto out; 3979 } 3980 } 3981 /* 3982 * POSIX dictates that when the source and target 3983 * entries refer to the same file object, rename 3984 * must do nothing and exit without error. 3985 */ 3986 if (szp->z_id == tzp->z_id) { 3987 error = 0; 3988 goto out; 3989 } 3990 } 3991 3992 vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct); 3993 if (tzp) 3994 vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3995 3996 /* 3997 * notify the target directory if it is not the same 3998 * as source directory. 3999 */ 4000 if (tdvp != sdvp) { 4001 vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); 4002 } 4003 4004 tx = dmu_tx_create(zfsvfs->z_os); 4005 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4006 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 4007 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 4008 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 4009 if (sdzp != tdzp) { 4010 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 4011 zfs_sa_upgrade_txholds(tx, tdzp); 4012 } 4013 if (tzp) { 4014 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 4015 zfs_sa_upgrade_txholds(tx, tzp); 4016 } 4017 4018 zfs_sa_upgrade_txholds(tx, szp); 4019 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 4020 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4021 if (error) { 4022 if (zl != NULL) 4023 zfs_rename_unlock(&zl); 4024 zfs_dirent_unlock(sdl); 4025 zfs_dirent_unlock(tdl); 4026 4027 if (sdzp == tdzp) 4028 rw_exit(&sdzp->z_name_lock); 4029 4030 VN_RELE(ZTOV(szp)); 4031 if (tzp) 4032 VN_RELE(ZTOV(tzp)); 4033 if (error == ERESTART) { 4034 waited = B_TRUE; 4035 dmu_tx_wait(tx); 4036 dmu_tx_abort(tx); 4037 goto top; 4038 } 4039 dmu_tx_abort(tx); 4040 ZFS_EXIT(zfsvfs); 4041 return (error); 4042 } 4043 4044 if (tzp) /* Attempt to remove the existing target */ 4045 error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 4046 4047 if (error == 0) { 4048 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 4049 if (error == 0) { 4050 szp->z_pflags |= ZFS_AV_MODIFIED; 4051 if (tdzp->z_pflags & ZFS_PROJINHERIT) 4052 szp->z_pflags |= ZFS_PROJINHERIT; 4053 4054 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 4055 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 4056 ASSERT0(error); 4057 4058 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 4059 if (error == 0) { 4060 zfs_log_rename(zilog, tx, TX_RENAME | 4061 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 4062 sdl->dl_name, tdzp, tdl->dl_name, szp); 4063 4064 /* 4065 * Update path information for the target vnode 4066 */ 4067 vn_renamepath(tdvp, ZTOV(szp), tnm, 4068 strlen(tnm)); 4069 } else { 4070 /* 4071 * At this point, we have successfully created 4072 * the target name, but have failed to remove 4073 * the source name. Since the create was done 4074 * with the ZRENAMING flag, there are 4075 * complications; for one, the link count is 4076 * wrong. The easiest way to deal with this 4077 * is to remove the newly created target, and 4078 * return the original error. This must 4079 * succeed; fortunately, it is very unlikely to 4080 * fail, since we just created it. 4081 */ 4082 VERIFY3U(zfs_link_destroy(tdl, szp, tx, 4083 ZRENAMING, NULL), ==, 0); 4084 } 4085 } 4086 } 4087 4088 dmu_tx_commit(tx); 4089 4090 if (tzp && rm_err == 0) 4091 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 4092 4093 if (error == 0) { 4094 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 4095 /* notify the target dir if it is not the same as source dir */ 4096 if (tdvp != sdvp) 4097 vnevent_rename_dest_dir(tdvp, ct); 4098 } 4099 out: 4100 if (zl != NULL) 4101 zfs_rename_unlock(&zl); 4102 4103 zfs_dirent_unlock(sdl); 4104 zfs_dirent_unlock(tdl); 4105 4106 if (sdzp == tdzp) 4107 rw_exit(&sdzp->z_name_lock); 4108 4109 4110 VN_RELE(ZTOV(szp)); 4111 if (tzp) 4112 VN_RELE(ZTOV(tzp)); 4113 4114 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4115 zil_commit(zilog, 0); 4116 4117 ZFS_EXIT(zfsvfs); 4118 return (error); 4119 } 4120 4121 /* 4122 * Insert the indicated symbolic reference entry into the directory. 4123 * 4124 * IN: dvp - Directory to contain new symbolic link. 4125 * link - Name for new symlink entry. 4126 * vap - Attributes of new entry. 4127 * cr - credentials of caller. 4128 * ct - caller context 4129 * flags - case flags 4130 * 4131 * RETURN: 0 on success, error code on failure. 4132 * 4133 * Timestamps: 4134 * dvp - ctime|mtime updated 4135 */ 4136 /*ARGSUSED*/ 4137 static int 4138 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, 4139 caller_context_t *ct, int flags) 4140 { 4141 znode_t *zp, *dzp = VTOZ(dvp); 4142 zfs_dirlock_t *dl; 4143 dmu_tx_t *tx; 4144 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4145 zilog_t *zilog; 4146 uint64_t len = strlen(link); 4147 int error; 4148 int zflg = ZNEW; 4149 zfs_acl_ids_t acl_ids; 4150 boolean_t fuid_dirtied; 4151 uint64_t txtype = TX_SYMLINK; 4152 boolean_t waited = B_FALSE; 4153 4154 ASSERT(vap->va_type == VLNK); 4155 4156 ZFS_ENTER(zfsvfs); 4157 ZFS_VERIFY_ZP(dzp); 4158 zilog = zfsvfs->z_log; 4159 4160 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4161 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4162 ZFS_EXIT(zfsvfs); 4163 return (SET_ERROR(EILSEQ)); 4164 } 4165 if (flags & FIGNORECASE) 4166 zflg |= ZCILOOK; 4167 4168 if (len > MAXPATHLEN) { 4169 ZFS_EXIT(zfsvfs); 4170 return (SET_ERROR(ENAMETOOLONG)); 4171 } 4172 4173 if ((error = zfs_acl_ids_create(dzp, 0, 4174 vap, cr, NULL, &acl_ids)) != 0) { 4175 ZFS_EXIT(zfsvfs); 4176 return (error); 4177 } 4178 top: 4179 /* 4180 * Attempt to lock directory; fail if entry already exists. 4181 */ 4182 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 4183 if (error) { 4184 zfs_acl_ids_free(&acl_ids); 4185 ZFS_EXIT(zfsvfs); 4186 return (error); 4187 } 4188 4189 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4190 zfs_acl_ids_free(&acl_ids); 4191 zfs_dirent_unlock(dl); 4192 ZFS_EXIT(zfsvfs); 4193 return (error); 4194 } 4195 4196 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 4197 zfs_acl_ids_free(&acl_ids); 4198 zfs_dirent_unlock(dl); 4199 ZFS_EXIT(zfsvfs); 4200 return (SET_ERROR(EDQUOT)); 4201 } 4202 tx = dmu_tx_create(zfsvfs->z_os); 4203 fuid_dirtied = zfsvfs->z_fuid_dirty; 4204 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4205 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4206 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4207 ZFS_SA_BASE_ATTR_SIZE + len); 4208 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4209 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4210 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4211 acl_ids.z_aclp->z_acl_bytes); 4212 } 4213 if (fuid_dirtied) 4214 zfs_fuid_txhold(zfsvfs, tx); 4215 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4216 if (error) { 4217 zfs_dirent_unlock(dl); 4218 if (error == ERESTART) { 4219 waited = B_TRUE; 4220 dmu_tx_wait(tx); 4221 dmu_tx_abort(tx); 4222 goto top; 4223 } 4224 zfs_acl_ids_free(&acl_ids); 4225 dmu_tx_abort(tx); 4226 ZFS_EXIT(zfsvfs); 4227 return (error); 4228 } 4229 4230 /* 4231 * Create a new object for the symlink. 4232 * for version 4 ZPL datsets the symlink will be an SA attribute 4233 */ 4234 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4235 4236 if (fuid_dirtied) 4237 zfs_fuid_sync(zfsvfs, tx); 4238 4239 mutex_enter(&zp->z_lock); 4240 if (zp->z_is_sa) 4241 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4242 link, len, tx); 4243 else 4244 zfs_sa_symlink(zp, link, len, tx); 4245 mutex_exit(&zp->z_lock); 4246 4247 zp->z_size = len; 4248 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4249 &zp->z_size, sizeof (zp->z_size), tx); 4250 /* 4251 * Insert the new object into the directory. 4252 */ 4253 (void) zfs_link_create(dl, zp, tx, ZNEW); 4254 4255 if (flags & FIGNORECASE) 4256 txtype |= TX_CI; 4257 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4258 4259 zfs_acl_ids_free(&acl_ids); 4260 4261 dmu_tx_commit(tx); 4262 4263 zfs_dirent_unlock(dl); 4264 4265 VN_RELE(ZTOV(zp)); 4266 4267 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4268 zil_commit(zilog, 0); 4269 4270 ZFS_EXIT(zfsvfs); 4271 return (error); 4272 } 4273 4274 /* 4275 * Return, in the buffer contained in the provided uio structure, 4276 * the symbolic path referred to by vp. 4277 * 4278 * IN: vp - vnode of symbolic link. 4279 * uio - structure to contain the link path. 4280 * cr - credentials of caller. 4281 * ct - caller context 4282 * 4283 * OUT: uio - structure containing the link path. 4284 * 4285 * RETURN: 0 on success, error code on failure. 4286 * 4287 * Timestamps: 4288 * vp - atime updated 4289 */ 4290 /* ARGSUSED */ 4291 static int 4292 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4293 { 4294 znode_t *zp = VTOZ(vp); 4295 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4296 int error; 4297 4298 ZFS_ENTER(zfsvfs); 4299 ZFS_VERIFY_ZP(zp); 4300 4301 mutex_enter(&zp->z_lock); 4302 if (zp->z_is_sa) 4303 error = sa_lookup_uio(zp->z_sa_hdl, 4304 SA_ZPL_SYMLINK(zfsvfs), uio); 4305 else 4306 error = zfs_sa_readlink(zp, uio); 4307 mutex_exit(&zp->z_lock); 4308 4309 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4310 4311 ZFS_EXIT(zfsvfs); 4312 return (error); 4313 } 4314 4315 /* 4316 * Insert a new entry into directory tdvp referencing svp. 4317 * 4318 * IN: tdvp - Directory to contain new entry. 4319 * svp - vnode of new entry. 4320 * name - name of new entry. 4321 * cr - credentials of caller. 4322 * ct - caller context 4323 * 4324 * RETURN: 0 on success, error code on failure. 4325 * 4326 * Timestamps: 4327 * tdvp - ctime|mtime updated 4328 * svp - ctime updated 4329 */ 4330 /* ARGSUSED */ 4331 static int 4332 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4333 caller_context_t *ct, int flags) 4334 { 4335 znode_t *dzp = VTOZ(tdvp); 4336 znode_t *tzp, *szp; 4337 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4338 zilog_t *zilog; 4339 zfs_dirlock_t *dl; 4340 dmu_tx_t *tx; 4341 vnode_t *realvp; 4342 int error; 4343 int zf = ZNEW; 4344 uint64_t parent; 4345 uid_t owner; 4346 boolean_t waited = B_FALSE; 4347 4348 ASSERT(tdvp->v_type == VDIR); 4349 4350 ZFS_ENTER(zfsvfs); 4351 ZFS_VERIFY_ZP(dzp); 4352 zilog = zfsvfs->z_log; 4353 4354 if (VOP_REALVP(svp, &realvp, ct) == 0) 4355 svp = realvp; 4356 4357 /* 4358 * POSIX dictates that we return EPERM here. 4359 * Better choices include ENOTSUP or EISDIR. 4360 */ 4361 if (svp->v_type == VDIR) { 4362 ZFS_EXIT(zfsvfs); 4363 return (SET_ERROR(EPERM)); 4364 } 4365 4366 szp = VTOZ(svp); 4367 ZFS_VERIFY_ZP(szp); 4368 4369 /* 4370 * If we are using project inheritance, it means if the directory has 4371 * ZFS_PROJINHERIT set, then its descendant directories will inherit 4372 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 4373 * such case, we only allow hard link creation in our tree when the 4374 * project IDs are the same. 4375 */ 4376 if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { 4377 ZFS_EXIT(zfsvfs); 4378 return (SET_ERROR(EXDEV)); 4379 } 4380 4381 /* 4382 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 4383 * ctldir appear to have the same v_vfsp. 4384 */ 4385 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 4386 ZFS_EXIT(zfsvfs); 4387 return (SET_ERROR(EXDEV)); 4388 } 4389 4390 /* Prevent links to .zfs/shares files */ 4391 4392 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4393 &parent, sizeof (uint64_t))) != 0) { 4394 ZFS_EXIT(zfsvfs); 4395 return (error); 4396 } 4397 if (parent == zfsvfs->z_shares_dir) { 4398 ZFS_EXIT(zfsvfs); 4399 return (SET_ERROR(EPERM)); 4400 } 4401 4402 if (zfsvfs->z_utf8 && u8_validate(name, 4403 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4404 ZFS_EXIT(zfsvfs); 4405 return (SET_ERROR(EILSEQ)); 4406 } 4407 if (flags & FIGNORECASE) 4408 zf |= ZCILOOK; 4409 4410 /* 4411 * We do not support links between attributes and non-attributes 4412 * because of the potential security risk of creating links 4413 * into "normal" file space in order to circumvent restrictions 4414 * imposed in attribute space. 4415 */ 4416 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4417 ZFS_EXIT(zfsvfs); 4418 return (SET_ERROR(EINVAL)); 4419 } 4420 4421 4422 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4423 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 4424 ZFS_EXIT(zfsvfs); 4425 return (SET_ERROR(EPERM)); 4426 } 4427 4428 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4429 ZFS_EXIT(zfsvfs); 4430 return (error); 4431 } 4432 4433 top: 4434 /* 4435 * Attempt to lock directory; fail if entry already exists. 4436 */ 4437 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4438 if (error) { 4439 ZFS_EXIT(zfsvfs); 4440 return (error); 4441 } 4442 4443 tx = dmu_tx_create(zfsvfs->z_os); 4444 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4445 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4446 zfs_sa_upgrade_txholds(tx, szp); 4447 zfs_sa_upgrade_txholds(tx, dzp); 4448 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); 4449 if (error) { 4450 zfs_dirent_unlock(dl); 4451 if (error == ERESTART) { 4452 waited = B_TRUE; 4453 dmu_tx_wait(tx); 4454 dmu_tx_abort(tx); 4455 goto top; 4456 } 4457 dmu_tx_abort(tx); 4458 ZFS_EXIT(zfsvfs); 4459 return (error); 4460 } 4461 4462 error = zfs_link_create(dl, szp, tx, 0); 4463 4464 if (error == 0) { 4465 uint64_t txtype = TX_LINK; 4466 if (flags & FIGNORECASE) 4467 txtype |= TX_CI; 4468 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4469 } 4470 4471 dmu_tx_commit(tx); 4472 4473 zfs_dirent_unlock(dl); 4474 4475 if (error == 0) { 4476 vnevent_link(svp, ct); 4477 } 4478 4479 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4480 zil_commit(zilog, 0); 4481 4482 ZFS_EXIT(zfsvfs); 4483 return (error); 4484 } 4485 4486 /* 4487 * zfs_null_putapage() is used when the file system has been force 4488 * unmounted. It just drops the pages. 4489 */ 4490 /* ARGSUSED */ 4491 static int 4492 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4493 size_t *lenp, int flags, cred_t *cr) 4494 { 4495 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4496 return (0); 4497 } 4498 4499 /* 4500 * Push a page out to disk, klustering if possible. 4501 * 4502 * IN: vp - file to push page to. 4503 * pp - page to push. 4504 * flags - additional flags. 4505 * cr - credentials of caller. 4506 * 4507 * OUT: offp - start of range pushed. 4508 * lenp - len of range pushed. 4509 * 4510 * RETURN: 0 on success, error code on failure. 4511 * 4512 * NOTE: callers must have locked the page to be pushed. On 4513 * exit, the page (and all other pages in the kluster) must be 4514 * unlocked. 4515 */ 4516 /* ARGSUSED */ 4517 static int 4518 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4519 size_t *lenp, int flags, cred_t *cr) 4520 { 4521 znode_t *zp = VTOZ(vp); 4522 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4523 dmu_tx_t *tx; 4524 u_offset_t off, koff; 4525 size_t len, klen; 4526 int err; 4527 4528 off = pp->p_offset; 4529 len = PAGESIZE; 4530 /* 4531 * If our blocksize is bigger than the page size, try to kluster 4532 * multiple pages so that we write a full block (thus avoiding 4533 * a read-modify-write). 4534 */ 4535 if (off < zp->z_size && zp->z_blksz > PAGESIZE) { 4536 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 4537 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 4538 ASSERT(koff <= zp->z_size); 4539 if (koff + klen > zp->z_size) 4540 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); 4541 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 4542 } 4543 ASSERT3U(btop(len), ==, btopr(len)); 4544 4545 /* 4546 * Can't push pages past end-of-file. 4547 */ 4548 if (off >= zp->z_size) { 4549 /* ignore all pages */ 4550 err = 0; 4551 goto out; 4552 } else if (off + len > zp->z_size) { 4553 int npages = btopr(zp->z_size - off); 4554 page_t *trunc; 4555 4556 page_list_break(&pp, &trunc, npages); 4557 /* ignore pages past end of file */ 4558 if (trunc) 4559 pvn_write_done(trunc, flags); 4560 len = zp->z_size - off; 4561 } 4562 4563 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || 4564 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid)) { 4565 err = SET_ERROR(EDQUOT); 4566 goto out; 4567 } 4568 tx = dmu_tx_create(zfsvfs->z_os); 4569 dmu_tx_hold_write(tx, zp->z_id, off, len); 4570 4571 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4572 zfs_sa_upgrade_txholds(tx, zp); 4573 err = dmu_tx_assign(tx, TXG_WAIT); 4574 if (err != 0) { 4575 dmu_tx_abort(tx); 4576 goto out; 4577 } 4578 4579 if (zp->z_blksz <= PAGESIZE) { 4580 caddr_t va = zfs_map_page(pp, S_READ); 4581 ASSERT3U(len, <=, PAGESIZE); 4582 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 4583 zfs_unmap_page(pp, va); 4584 } else { 4585 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 4586 } 4587 4588 if (err == 0) { 4589 uint64_t mtime[2], ctime[2]; 4590 sa_bulk_attr_t bulk[3]; 4591 int count = 0; 4592 4593 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4594 &mtime, 16); 4595 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4596 &ctime, 16); 4597 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4598 &zp->z_pflags, 8); 4599 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4600 B_TRUE); 4601 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 4602 ASSERT0(err); 4603 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4604 } 4605 dmu_tx_commit(tx); 4606 4607 out: 4608 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 4609 if (offp) 4610 *offp = off; 4611 if (lenp) 4612 *lenp = len; 4613 4614 return (err); 4615 } 4616 4617 /* 4618 * Copy the portion of the file indicated from pages into the file. 4619 * The pages are stored in a page list attached to the files vnode. 4620 * 4621 * IN: vp - vnode of file to push page data to. 4622 * off - position in file to put data. 4623 * len - amount of data to write. 4624 * flags - flags to control the operation. 4625 * cr - credentials of caller. 4626 * ct - caller context. 4627 * 4628 * RETURN: 0 on success, error code on failure. 4629 * 4630 * Timestamps: 4631 * vp - ctime|mtime updated 4632 */ 4633 /*ARGSUSED*/ 4634 static int 4635 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4636 caller_context_t *ct) 4637 { 4638 znode_t *zp = VTOZ(vp); 4639 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4640 page_t *pp; 4641 size_t io_len; 4642 u_offset_t io_off; 4643 uint_t blksz; 4644 locked_range_t *lr; 4645 int error = 0; 4646 4647 ZFS_ENTER(zfsvfs); 4648 ZFS_VERIFY_ZP(zp); 4649 4650 /* 4651 * There's nothing to do if no data is cached. 4652 */ 4653 if (!vn_has_cached_data(vp)) { 4654 ZFS_EXIT(zfsvfs); 4655 return (0); 4656 } 4657 4658 /* 4659 * Align this request to the file block size in case we kluster. 4660 * XXX - this can result in pretty aggresive locking, which can 4661 * impact simultanious read/write access. One option might be 4662 * to break up long requests (len == 0) into block-by-block 4663 * operations to get narrower locking. 4664 */ 4665 blksz = zp->z_blksz; 4666 if (ISP2(blksz)) 4667 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); 4668 else 4669 io_off = 0; 4670 if (len > 0 && ISP2(blksz)) 4671 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); 4672 else 4673 io_len = 0; 4674 4675 if (io_len == 0) { 4676 /* 4677 * Search the entire vp list for pages >= io_off. 4678 */ 4679 lr = rangelock_enter(&zp->z_rangelock, 4680 io_off, UINT64_MAX, RL_WRITER); 4681 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); 4682 goto out; 4683 } 4684 lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER); 4685 4686 if (off > zp->z_size) { 4687 /* past end of file */ 4688 rangelock_exit(lr); 4689 ZFS_EXIT(zfsvfs); 4690 return (0); 4691 } 4692 4693 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); 4694 4695 for (off = io_off; io_off < off + len; io_off += io_len) { 4696 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 4697 pp = page_lookup(vp, io_off, 4698 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 4699 } else { 4700 pp = page_lookup_nowait(vp, io_off, 4701 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 4702 } 4703 4704 if (pp != NULL && pvn_getdirty(pp, flags)) { 4705 int err; 4706 4707 /* 4708 * Found a dirty page to push 4709 */ 4710 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 4711 if (err) 4712 error = err; 4713 } else { 4714 io_len = PAGESIZE; 4715 } 4716 } 4717 out: 4718 rangelock_exit(lr); 4719 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4720 zil_commit(zfsvfs->z_log, zp->z_id); 4721 ZFS_EXIT(zfsvfs); 4722 return (error); 4723 } 4724 4725 /*ARGSUSED*/ 4726 void 4727 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4728 { 4729 znode_t *zp = VTOZ(vp); 4730 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4731 int error; 4732 4733 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4734 if (zp->z_sa_hdl == NULL) { 4735 /* 4736 * The fs has been unmounted, or we did a 4737 * suspend/resume and this file no longer exists. 4738 */ 4739 if (vn_has_cached_data(vp)) { 4740 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 4741 B_INVAL, cr); 4742 } 4743 4744 mutex_enter(&zp->z_lock); 4745 mutex_enter(&vp->v_lock); 4746 ASSERT(vp->v_count == 1); 4747 VN_RELE_LOCKED(vp); 4748 mutex_exit(&vp->v_lock); 4749 mutex_exit(&zp->z_lock); 4750 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4751 zfs_znode_free(zp); 4752 return; 4753 } 4754 4755 /* 4756 * Attempt to push any data in the page cache. If this fails 4757 * we will get kicked out later in zfs_zinactive(). 4758 */ 4759 if (vn_has_cached_data(vp)) { 4760 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 4761 cr); 4762 } 4763 4764 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4765 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4766 4767 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4768 zfs_sa_upgrade_txholds(tx, zp); 4769 error = dmu_tx_assign(tx, TXG_WAIT); 4770 if (error) { 4771 dmu_tx_abort(tx); 4772 } else { 4773 mutex_enter(&zp->z_lock); 4774 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4775 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4776 zp->z_atime_dirty = 0; 4777 mutex_exit(&zp->z_lock); 4778 dmu_tx_commit(tx); 4779 } 4780 } 4781 4782 zfs_zinactive(zp); 4783 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4784 } 4785 4786 /* 4787 * Bounds-check the seek operation. 4788 * 4789 * IN: vp - vnode seeking within 4790 * ooff - old file offset 4791 * noffp - pointer to new file offset 4792 * ct - caller context 4793 * 4794 * RETURN: 0 on success, EINVAL if new offset invalid. 4795 */ 4796 /* ARGSUSED */ 4797 static int 4798 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 4799 caller_context_t *ct) 4800 { 4801 if (vp->v_type == VDIR) 4802 return (0); 4803 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4804 } 4805 4806 /* 4807 * Pre-filter the generic locking function to trap attempts to place 4808 * a mandatory lock on a memory mapped file. 4809 */ 4810 static int 4811 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 4812 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 4813 { 4814 znode_t *zp = VTOZ(vp); 4815 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4816 4817 ZFS_ENTER(zfsvfs); 4818 ZFS_VERIFY_ZP(zp); 4819 4820 /* 4821 * We are following the UFS semantics with respect to mapcnt 4822 * here: If we see that the file is mapped already, then we will 4823 * return an error, but we don't worry about races between this 4824 * function and zfs_map(). 4825 */ 4826 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { 4827 ZFS_EXIT(zfsvfs); 4828 return (SET_ERROR(EAGAIN)); 4829 } 4830 ZFS_EXIT(zfsvfs); 4831 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4832 } 4833 4834 /* 4835 * If we can't find a page in the cache, we will create a new page 4836 * and fill it with file data. For efficiency, we may try to fill 4837 * multiple pages at once (klustering) to fill up the supplied page 4838 * list. Note that the pages to be filled are held with an exclusive 4839 * lock to prevent access by other threads while they are being filled. 4840 */ 4841 static int 4842 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 4843 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 4844 { 4845 znode_t *zp = VTOZ(vp); 4846 page_t *pp, *cur_pp; 4847 objset_t *os = zp->z_zfsvfs->z_os; 4848 u_offset_t io_off, total; 4849 size_t io_len; 4850 int err; 4851 4852 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 4853 /* 4854 * We only have a single page, don't bother klustering 4855 */ 4856 io_off = off; 4857 io_len = PAGESIZE; 4858 pp = page_create_va(vp, io_off, io_len, 4859 PG_EXCL | PG_WAIT, seg, addr); 4860 } else { 4861 /* 4862 * Try to find enough pages to fill the page list 4863 */ 4864 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4865 &io_len, off, plsz, 0); 4866 } 4867 if (pp == NULL) { 4868 /* 4869 * The page already exists, nothing to do here. 4870 */ 4871 *pl = NULL; 4872 return (0); 4873 } 4874 4875 /* 4876 * Fill the pages in the kluster. 4877 */ 4878 cur_pp = pp; 4879 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4880 caddr_t va; 4881 4882 ASSERT3U(io_off, ==, cur_pp->p_offset); 4883 va = zfs_map_page(cur_pp, S_WRITE); 4884 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4885 DMU_READ_PREFETCH); 4886 zfs_unmap_page(cur_pp, va); 4887 if (err) { 4888 /* On error, toss the entire kluster */ 4889 pvn_read_done(pp, B_ERROR); 4890 /* convert checksum errors into IO errors */ 4891 if (err == ECKSUM) 4892 err = SET_ERROR(EIO); 4893 return (err); 4894 } 4895 cur_pp = cur_pp->p_next; 4896 } 4897 4898 /* 4899 * Fill in the page list array from the kluster starting 4900 * from the desired offset `off'. 4901 * NOTE: the page list will always be null terminated. 4902 */ 4903 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4904 ASSERT(pl == NULL || (*pl)->p_offset == off); 4905 4906 return (0); 4907 } 4908 4909 /* 4910 * Return pointers to the pages for the file region [off, off + len] 4911 * in the pl array. If plsz is greater than len, this function may 4912 * also return page pointers from after the specified region 4913 * (i.e. the region [off, off + plsz]). These additional pages are 4914 * only returned if they are already in the cache, or were created as 4915 * part of a klustered read. 4916 * 4917 * IN: vp - vnode of file to get data from. 4918 * off - position in file to get data from. 4919 * len - amount of data to retrieve. 4920 * plsz - length of provided page list. 4921 * seg - segment to obtain pages for. 4922 * addr - virtual address of fault. 4923 * rw - mode of created pages. 4924 * cr - credentials of caller. 4925 * ct - caller context. 4926 * 4927 * OUT: protp - protection mode of created pages. 4928 * pl - list of pages created. 4929 * 4930 * RETURN: 0 on success, error code on failure. 4931 * 4932 * Timestamps: 4933 * vp - atime updated 4934 */ 4935 /* ARGSUSED */ 4936 static int 4937 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4938 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4939 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4940 { 4941 znode_t *zp = VTOZ(vp); 4942 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4943 page_t **pl0 = pl; 4944 int err = 0; 4945 4946 /* we do our own caching, faultahead is unnecessary */ 4947 if (pl == NULL) 4948 return (0); 4949 else if (len > plsz) 4950 len = plsz; 4951 else 4952 len = P2ROUNDUP(len, PAGESIZE); 4953 ASSERT(plsz >= len); 4954 4955 ZFS_ENTER(zfsvfs); 4956 ZFS_VERIFY_ZP(zp); 4957 4958 if (protp) 4959 *protp = PROT_ALL; 4960 4961 /* 4962 * Loop through the requested range [off, off + len) looking 4963 * for pages. If we don't find a page, we will need to create 4964 * a new page and fill it with data from the file. 4965 */ 4966 while (len > 0) { 4967 if (*pl = page_lookup(vp, off, SE_SHARED)) 4968 *(pl+1) = NULL; 4969 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 4970 goto out; 4971 while (*pl) { 4972 ASSERT3U((*pl)->p_offset, ==, off); 4973 off += PAGESIZE; 4974 addr += PAGESIZE; 4975 if (len > 0) { 4976 ASSERT3U(len, >=, PAGESIZE); 4977 len -= PAGESIZE; 4978 } 4979 ASSERT3U(plsz, >=, PAGESIZE); 4980 plsz -= PAGESIZE; 4981 pl++; 4982 } 4983 } 4984 4985 /* 4986 * Fill out the page array with any pages already in the cache. 4987 */ 4988 while (plsz > 0 && 4989 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 4990 off += PAGESIZE; 4991 plsz -= PAGESIZE; 4992 } 4993 out: 4994 if (err) { 4995 /* 4996 * Release any pages we have previously locked. 4997 */ 4998 while (pl > pl0) 4999 page_unlock(*--pl); 5000 } else { 5001 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 5002 } 5003 5004 *pl = NULL; 5005 5006 ZFS_EXIT(zfsvfs); 5007 return (err); 5008 } 5009 5010 /* 5011 * Request a memory map for a section of a file. This code interacts 5012 * with common code and the VM system as follows: 5013 * 5014 * - common code calls mmap(), which ends up in smmap_common() 5015 * - this calls VOP_MAP(), which takes you into (say) zfs 5016 * - zfs_map() calls as_map(), passing segvn_create() as the callback 5017 * - segvn_create() creates the new segment and calls VOP_ADDMAP() 5018 * - zfs_addmap() updates z_mapcnt 5019 */ 5020 /*ARGSUSED*/ 5021 static int 5022 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 5023 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5024 caller_context_t *ct) 5025 { 5026 znode_t *zp = VTOZ(vp); 5027 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5028 segvn_crargs_t vn_a; 5029 int error; 5030 5031 ZFS_ENTER(zfsvfs); 5032 ZFS_VERIFY_ZP(zp); 5033 5034 /* 5035 * Note: ZFS_READONLY is handled in zfs_zaccess_common. 5036 */ 5037 5038 if ((prot & PROT_WRITE) && (zp->z_pflags & 5039 (ZFS_IMMUTABLE | ZFS_APPENDONLY))) { 5040 ZFS_EXIT(zfsvfs); 5041 return (SET_ERROR(EPERM)); 5042 } 5043 5044 if ((prot & (PROT_READ | PROT_EXEC)) && 5045 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 5046 ZFS_EXIT(zfsvfs); 5047 return (SET_ERROR(EACCES)); 5048 } 5049 5050 if (vp->v_flag & VNOMAP) { 5051 ZFS_EXIT(zfsvfs); 5052 return (SET_ERROR(ENOSYS)); 5053 } 5054 5055 if (off < 0 || len > MAXOFFSET_T - off) { 5056 ZFS_EXIT(zfsvfs); 5057 return (SET_ERROR(ENXIO)); 5058 } 5059 5060 if (vp->v_type != VREG) { 5061 ZFS_EXIT(zfsvfs); 5062 return (SET_ERROR(ENODEV)); 5063 } 5064 5065 /* 5066 * If file is locked, disallow mapping. 5067 */ 5068 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { 5069 ZFS_EXIT(zfsvfs); 5070 return (SET_ERROR(EAGAIN)); 5071 } 5072 5073 as_rangelock(as); 5074 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 5075 if (error != 0) { 5076 as_rangeunlock(as); 5077 ZFS_EXIT(zfsvfs); 5078 return (error); 5079 } 5080 5081 vn_a.vp = vp; 5082 vn_a.offset = (u_offset_t)off; 5083 vn_a.type = flags & MAP_TYPE; 5084 vn_a.prot = prot; 5085 vn_a.maxprot = maxprot; 5086 vn_a.cred = cr; 5087 vn_a.amp = NULL; 5088 vn_a.flags = flags & ~MAP_TYPE; 5089 vn_a.szc = 0; 5090 vn_a.lgrp_mem_policy_flags = 0; 5091 5092 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5093 5094 as_rangeunlock(as); 5095 ZFS_EXIT(zfsvfs); 5096 return (error); 5097 } 5098 5099 /* ARGSUSED */ 5100 static int 5101 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5102 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5103 caller_context_t *ct) 5104 { 5105 uint64_t pages = btopr(len); 5106 5107 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 5108 return (0); 5109 } 5110 5111 /* 5112 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 5113 * more accurate mtime for the associated file. Since we don't have a way of 5114 * detecting when the data was actually modified, we have to resort to 5115 * heuristics. If an explicit msync() is done, then we mark the mtime when the 5116 * last page is pushed. The problem occurs when the msync() call is omitted, 5117 * which by far the most common case: 5118 * 5119 * open() 5120 * mmap() 5121 * <modify memory> 5122 * munmap() 5123 * close() 5124 * <time lapse> 5125 * putpage() via fsflush 5126 * 5127 * If we wait until fsflush to come along, we can have a modification time that 5128 * is some arbitrary point in the future. In order to prevent this in the 5129 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 5130 * torn down. 5131 */ 5132 /* ARGSUSED */ 5133 static int 5134 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5135 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 5136 caller_context_t *ct) 5137 { 5138 uint64_t pages = btopr(len); 5139 5140 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 5141 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 5142 5143 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 5144 vn_has_cached_data(vp)) 5145 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 5146 5147 return (0); 5148 } 5149 5150 /* 5151 * Free or allocate space in a file. Currently, this function only 5152 * supports the `F_FREESP' command. However, this command is somewhat 5153 * misnamed, as its functionality includes the ability to allocate as 5154 * well as free space. 5155 * 5156 * IN: vp - vnode of file to free data in. 5157 * cmd - action to take (only F_FREESP supported). 5158 * bfp - section of file to free/alloc. 5159 * flag - current file open mode flags. 5160 * offset - current file offset. 5161 * cr - credentials of caller [UNUSED]. 5162 * ct - caller context. 5163 * 5164 * RETURN: 0 on success, error code on failure. 5165 * 5166 * Timestamps: 5167 * vp - ctime|mtime updated 5168 */ 5169 /* ARGSUSED */ 5170 static int 5171 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 5172 offset_t offset, cred_t *cr, caller_context_t *ct) 5173 { 5174 znode_t *zp = VTOZ(vp); 5175 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5176 uint64_t off, len; 5177 int error; 5178 5179 ZFS_ENTER(zfsvfs); 5180 ZFS_VERIFY_ZP(zp); 5181 5182 if (cmd != F_FREESP) { 5183 ZFS_EXIT(zfsvfs); 5184 return (SET_ERROR(EINVAL)); 5185 } 5186 5187 /* 5188 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 5189 * callers might not be able to detect properly that we are read-only, 5190 * so check it explicitly here. 5191 */ 5192 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 5193 ZFS_EXIT(zfsvfs); 5194 return (SET_ERROR(EROFS)); 5195 } 5196 5197 if (error = convoff(vp, bfp, 0, offset)) { 5198 ZFS_EXIT(zfsvfs); 5199 return (error); 5200 } 5201 5202 if (bfp->l_len < 0) { 5203 ZFS_EXIT(zfsvfs); 5204 return (SET_ERROR(EINVAL)); 5205 } 5206 5207 off = bfp->l_start; 5208 len = bfp->l_len; /* 0 means from off to end of file */ 5209 5210 error = zfs_freesp(zp, off, len, flag, TRUE); 5211 5212 if (error == 0 && off == 0 && len == 0) 5213 vnevent_truncate(ZTOV(zp), ct); 5214 5215 ZFS_EXIT(zfsvfs); 5216 return (error); 5217 } 5218 5219 /*ARGSUSED*/ 5220 static int 5221 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 5222 { 5223 znode_t *zp = VTOZ(vp); 5224 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5225 uint32_t gen; 5226 uint64_t gen64; 5227 uint64_t object = zp->z_id; 5228 zfid_short_t *zfid; 5229 int size, i, error; 5230 5231 ZFS_ENTER(zfsvfs); 5232 ZFS_VERIFY_ZP(zp); 5233 5234 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 5235 &gen64, sizeof (uint64_t))) != 0) { 5236 ZFS_EXIT(zfsvfs); 5237 return (error); 5238 } 5239 5240 gen = (uint32_t)gen64; 5241 5242 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 5243 if (fidp->fid_len < size) { 5244 fidp->fid_len = size; 5245 ZFS_EXIT(zfsvfs); 5246 return (SET_ERROR(ENOSPC)); 5247 } 5248 5249 zfid = (zfid_short_t *)fidp; 5250 5251 zfid->zf_len = size; 5252 5253 for (i = 0; i < sizeof (zfid->zf_object); i++) 5254 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 5255 5256 /* Must have a non-zero generation number to distinguish from .zfs */ 5257 if (gen == 0) 5258 gen = 1; 5259 for (i = 0; i < sizeof (zfid->zf_gen); i++) 5260 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 5261 5262 if (size == LONG_FID_LEN) { 5263 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 5264 zfid_long_t *zlfid; 5265 5266 zlfid = (zfid_long_t *)fidp; 5267 5268 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 5269 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 5270 5271 /* XXX - this should be the generation number for the objset */ 5272 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 5273 zlfid->zf_setgen[i] = 0; 5274 } 5275 5276 ZFS_EXIT(zfsvfs); 5277 return (0); 5278 } 5279 5280 static int 5281 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 5282 caller_context_t *ct) 5283 { 5284 znode_t *zp, *xzp; 5285 zfsvfs_t *zfsvfs; 5286 zfs_dirlock_t *dl; 5287 int error; 5288 5289 switch (cmd) { 5290 case _PC_LINK_MAX: 5291 *valp = ULONG_MAX; 5292 return (0); 5293 5294 case _PC_FILESIZEBITS: 5295 *valp = 64; 5296 return (0); 5297 5298 case _PC_XATTR_EXISTS: 5299 zp = VTOZ(vp); 5300 zfsvfs = zp->z_zfsvfs; 5301 ZFS_ENTER(zfsvfs); 5302 ZFS_VERIFY_ZP(zp); 5303 *valp = 0; 5304 error = zfs_dirent_lock(&dl, zp, "", &xzp, 5305 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 5306 if (error == 0) { 5307 zfs_dirent_unlock(dl); 5308 if (!zfs_dirempty(xzp)) 5309 *valp = 1; 5310 VN_RELE(ZTOV(xzp)); 5311 } else if (error == ENOENT) { 5312 /* 5313 * If there aren't extended attributes, it's the 5314 * same as having zero of them. 5315 */ 5316 error = 0; 5317 } 5318 ZFS_EXIT(zfsvfs); 5319 return (error); 5320 5321 case _PC_SATTR_ENABLED: 5322 case _PC_SATTR_EXISTS: 5323 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 5324 (vp->v_type == VREG || vp->v_type == VDIR); 5325 return (0); 5326 5327 case _PC_ACCESS_FILTERING: 5328 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 5329 vp->v_type == VDIR; 5330 return (0); 5331 5332 case _PC_ACL_ENABLED: 5333 *valp = _ACL_ACE_ENABLED; 5334 return (0); 5335 5336 case _PC_MIN_HOLE_SIZE: 5337 *valp = (ulong_t)SPA_MINBLOCKSIZE; 5338 return (0); 5339 5340 case _PC_TIMESTAMP_RESOLUTION: 5341 /* nanosecond timestamp resolution */ 5342 *valp = 1L; 5343 return (0); 5344 5345 default: 5346 return (fs_pathconf(vp, cmd, valp, cr, ct)); 5347 } 5348 } 5349 5350 /*ARGSUSED*/ 5351 static int 5352 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5353 caller_context_t *ct) 5354 { 5355 znode_t *zp = VTOZ(vp); 5356 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5357 int error; 5358 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5359 5360 ZFS_ENTER(zfsvfs); 5361 ZFS_VERIFY_ZP(zp); 5362 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 5363 ZFS_EXIT(zfsvfs); 5364 5365 return (error); 5366 } 5367 5368 /*ARGSUSED*/ 5369 static int 5370 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5371 caller_context_t *ct) 5372 { 5373 znode_t *zp = VTOZ(vp); 5374 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5375 int error; 5376 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5377 zilog_t *zilog = zfsvfs->z_log; 5378 5379 ZFS_ENTER(zfsvfs); 5380 ZFS_VERIFY_ZP(zp); 5381 5382 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 5383 5384 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 5385 zil_commit(zilog, 0); 5386 5387 ZFS_EXIT(zfsvfs); 5388 return (error); 5389 } 5390 5391 /* 5392 * The smallest read we may consider to loan out an arcbuf. 5393 * This must be a power of 2. 5394 */ 5395 int zcr_blksz_min = (1 << 10); /* 1K */ 5396 /* 5397 * If set to less than the file block size, allow loaning out of an 5398 * arcbuf for a partial block read. This must be a power of 2. 5399 */ 5400 int zcr_blksz_max = (1 << 17); /* 128K */ 5401 5402 /*ARGSUSED*/ 5403 static int 5404 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, 5405 caller_context_t *ct) 5406 { 5407 znode_t *zp = VTOZ(vp); 5408 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5409 int max_blksz = zfsvfs->z_max_blksz; 5410 uio_t *uio = &xuio->xu_uio; 5411 ssize_t size = uio->uio_resid; 5412 offset_t offset = uio->uio_loffset; 5413 int blksz; 5414 int fullblk, i; 5415 arc_buf_t *abuf; 5416 ssize_t maxsize; 5417 int preamble, postamble; 5418 5419 if (xuio->xu_type != UIOTYPE_ZEROCOPY) 5420 return (SET_ERROR(EINVAL)); 5421 5422 ZFS_ENTER(zfsvfs); 5423 ZFS_VERIFY_ZP(zp); 5424 switch (ioflag) { 5425 case UIO_WRITE: 5426 /* 5427 * Loan out an arc_buf for write if write size is bigger than 5428 * max_blksz, and the file's block size is also max_blksz. 5429 */ 5430 blksz = max_blksz; 5431 if (size < blksz || zp->z_blksz != blksz) { 5432 ZFS_EXIT(zfsvfs); 5433 return (SET_ERROR(EINVAL)); 5434 } 5435 /* 5436 * Caller requests buffers for write before knowing where the 5437 * write offset might be (e.g. NFS TCP write). 5438 */ 5439 if (offset == -1) { 5440 preamble = 0; 5441 } else { 5442 preamble = P2PHASE(offset, blksz); 5443 if (preamble) { 5444 preamble = blksz - preamble; 5445 size -= preamble; 5446 } 5447 } 5448 5449 postamble = P2PHASE(size, blksz); 5450 size -= postamble; 5451 5452 fullblk = size / blksz; 5453 (void) dmu_xuio_init(xuio, 5454 (preamble != 0) + fullblk + (postamble != 0)); 5455 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5456 int, postamble, int, 5457 (preamble != 0) + fullblk + (postamble != 0)); 5458 5459 /* 5460 * Have to fix iov base/len for partial buffers. They 5461 * currently represent full arc_buf's. 5462 */ 5463 if (preamble) { 5464 /* data begins in the middle of the arc_buf */ 5465 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5466 blksz); 5467 ASSERT(abuf); 5468 (void) dmu_xuio_add(xuio, abuf, 5469 blksz - preamble, preamble); 5470 } 5471 5472 for (i = 0; i < fullblk; i++) { 5473 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5474 blksz); 5475 ASSERT(abuf); 5476 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5477 } 5478 5479 if (postamble) { 5480 /* data ends in the middle of the arc_buf */ 5481 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5482 blksz); 5483 ASSERT(abuf); 5484 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5485 } 5486 break; 5487 case UIO_READ: 5488 /* 5489 * Loan out an arc_buf for read if the read size is larger than 5490 * the current file block size. Block alignment is not 5491 * considered. Partial arc_buf will be loaned out for read. 5492 */ 5493 blksz = zp->z_blksz; 5494 if (blksz < zcr_blksz_min) 5495 blksz = zcr_blksz_min; 5496 if (blksz > zcr_blksz_max) 5497 blksz = zcr_blksz_max; 5498 /* avoid potential complexity of dealing with it */ 5499 if (blksz > max_blksz) { 5500 ZFS_EXIT(zfsvfs); 5501 return (SET_ERROR(EINVAL)); 5502 } 5503 5504 maxsize = zp->z_size - uio->uio_loffset; 5505 if (size > maxsize) 5506 size = maxsize; 5507 5508 if (size < blksz || vn_has_cached_data(vp)) { 5509 ZFS_EXIT(zfsvfs); 5510 return (SET_ERROR(EINVAL)); 5511 } 5512 break; 5513 default: 5514 ZFS_EXIT(zfsvfs); 5515 return (SET_ERROR(EINVAL)); 5516 } 5517 5518 uio->uio_extflg = UIO_XUIO; 5519 XUIO_XUZC_RW(xuio) = ioflag; 5520 ZFS_EXIT(zfsvfs); 5521 return (0); 5522 } 5523 5524 /*ARGSUSED*/ 5525 static int 5526 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5527 { 5528 int i; 5529 arc_buf_t *abuf; 5530 int ioflag = XUIO_XUZC_RW(xuio); 5531 5532 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5533 5534 i = dmu_xuio_cnt(xuio); 5535 while (i-- > 0) { 5536 abuf = dmu_xuio_arcbuf(xuio, i); 5537 /* 5538 * if abuf == NULL, it must be a write buffer 5539 * that has been returned in zfs_write(). 5540 */ 5541 if (abuf) 5542 dmu_return_arcbuf(abuf); 5543 ASSERT(abuf || ioflag == UIO_WRITE); 5544 } 5545 5546 dmu_xuio_fini(xuio); 5547 return (0); 5548 } 5549 5550 /* 5551 * Predeclare these here so that the compiler assumes that 5552 * this is an "old style" function declaration that does 5553 * not include arguments => we won't get type mismatch errors 5554 * in the initializations that follow. 5555 */ 5556 static int zfs_inval(); 5557 static int zfs_isdir(); 5558 5559 static int 5560 zfs_inval() 5561 { 5562 return (SET_ERROR(EINVAL)); 5563 } 5564 5565 static int 5566 zfs_isdir() 5567 { 5568 return (SET_ERROR(EISDIR)); 5569 } 5570 /* 5571 * Directory vnode operations template 5572 */ 5573 vnodeops_t *zfs_dvnodeops; 5574 const fs_operation_def_t zfs_dvnodeops_template[] = { 5575 VOPNAME_OPEN, { .vop_open = zfs_open }, 5576 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5577 VOPNAME_READ, { .error = zfs_isdir }, 5578 VOPNAME_WRITE, { .error = zfs_isdir }, 5579 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5580 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5581 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5582 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5583 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5584 VOPNAME_CREATE, { .vop_create = zfs_create }, 5585 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5586 VOPNAME_LINK, { .vop_link = zfs_link }, 5587 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5588 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 5589 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5590 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5591 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 5592 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5593 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5594 VOPNAME_FID, { .vop_fid = zfs_fid }, 5595 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5596 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5597 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5598 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5599 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5600 NULL, NULL 5601 }; 5602 5603 /* 5604 * Regular file vnode operations template 5605 */ 5606 vnodeops_t *zfs_fvnodeops; 5607 const fs_operation_def_t zfs_fvnodeops_template[] = { 5608 VOPNAME_OPEN, { .vop_open = zfs_open }, 5609 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5610 VOPNAME_READ, { .vop_read = zfs_read }, 5611 VOPNAME_WRITE, { .vop_write = zfs_write }, 5612 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5613 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5614 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5615 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5616 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5617 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5618 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5619 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5620 VOPNAME_FID, { .vop_fid = zfs_fid }, 5621 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5622 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 5623 VOPNAME_SPACE, { .vop_space = zfs_space }, 5624 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 5625 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 5626 VOPNAME_MAP, { .vop_map = zfs_map }, 5627 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 5628 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 5629 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5630 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5631 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5632 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5633 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, 5634 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, 5635 NULL, NULL 5636 }; 5637 5638 /* 5639 * Symbolic link vnode operations template 5640 */ 5641 vnodeops_t *zfs_symvnodeops; 5642 const fs_operation_def_t zfs_symvnodeops_template[] = { 5643 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5644 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5645 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5646 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5647 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 5648 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5649 VOPNAME_FID, { .vop_fid = zfs_fid }, 5650 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5651 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5652 NULL, NULL 5653 }; 5654 5655 /* 5656 * special share hidden files vnode operations template 5657 */ 5658 vnodeops_t *zfs_sharevnodeops; 5659 const fs_operation_def_t zfs_sharevnodeops_template[] = { 5660 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5661 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5662 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5663 VOPNAME_FID, { .vop_fid = zfs_fid }, 5664 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5665 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5666 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5667 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5668 NULL, NULL 5669 }; 5670 5671 /* 5672 * Extended attribute directory vnode operations template 5673 * 5674 * This template is identical to the directory vnodes 5675 * operation template except for restricted operations: 5676 * VOP_MKDIR() 5677 * VOP_SYMLINK() 5678 * 5679 * Note that there are other restrictions embedded in: 5680 * zfs_create() - restrict type to VREG 5681 * zfs_link() - no links into/out of attribute space 5682 * zfs_rename() - no moves into/out of attribute space 5683 */ 5684 vnodeops_t *zfs_xdvnodeops; 5685 const fs_operation_def_t zfs_xdvnodeops_template[] = { 5686 VOPNAME_OPEN, { .vop_open = zfs_open }, 5687 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5688 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5689 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5690 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5691 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5692 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5693 VOPNAME_CREATE, { .vop_create = zfs_create }, 5694 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5695 VOPNAME_LINK, { .vop_link = zfs_link }, 5696 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5697 VOPNAME_MKDIR, { .error = zfs_inval }, 5698 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5699 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5700 VOPNAME_SYMLINK, { .error = zfs_inval }, 5701 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5702 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5703 VOPNAME_FID, { .vop_fid = zfs_fid }, 5704 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5705 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5706 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5707 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5708 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5709 NULL, NULL 5710 }; 5711 5712 /* 5713 * Error vnode operations template 5714 */ 5715 vnodeops_t *zfs_evnodeops; 5716 const fs_operation_def_t zfs_evnodeops_template[] = { 5717 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5718 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5719 NULL, NULL 5720 }; 5721