1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/systm.h> 32 #include <sys/sysmacros.h> 33 #include <sys/resource.h> 34 #include <sys/vfs.h> 35 #include <sys/vfs_opreg.h> 36 #include <sys/vnode.h> 37 #include <sys/file.h> 38 #include <sys/stat.h> 39 #include <sys/kmem.h> 40 #include <sys/taskq.h> 41 #include <sys/uio.h> 42 #include <sys/vmsystm.h> 43 #include <sys/atomic.h> 44 #include <sys/vm.h> 45 #include <vm/seg_vn.h> 46 #include <vm/pvn.h> 47 #include <vm/as.h> 48 #include <vm/kpm.h> 49 #include <vm/seg_kpm.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/cmn_err.h> 53 #include <sys/errno.h> 54 #include <sys/unistd.h> 55 #include <sys/zfs_dir.h> 56 #include <sys/zfs_acl.h> 57 #include <sys/zfs_ioctl.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/dmu.h> 60 #include <sys/spa.h> 61 #include <sys/txg.h> 62 #include <sys/dbuf.h> 63 #include <sys/zap.h> 64 #include <sys/dirent.h> 65 #include 
<sys/policy.h> 66 #include <sys/sunddi.h> 67 #include <sys/filio.h> 68 #include <sys/sid.h> 69 #include "fs/fs_subr.h" 70 #include <sys/zfs_ctldir.h> 71 #include <sys/zfs_fuid.h> 72 #include <sys/dnlc.h> 73 #include <sys/zfs_rlock.h> 74 #include <sys/extdirent.h> 75 #include <sys/kidmap.h> 76 #include <sys/cred_impl.h> 77 #include <sys/attr.h> 78 79 /* 80 * Programming rules. 81 * 82 * Each vnode op performs some logical unit of work. To do this, the ZPL must 83 * properly lock its in-core state, create a DMU transaction, do the work, 84 * record this work in the intent log (ZIL), commit the DMU transaction, 85 * and wait for the intent log to commit if it is a synchronous operation. 86 * Moreover, the vnode ops must work in both normal and log replay context. 87 * The ordering of events is important to avoid deadlocks and references 88 * to freed memory. The example below illustrates the following Big Rules: 89 * 90 * (1) A check must be made in each zfs thread for a mounted file system. 91 * This is done avoiding races using ZFS_ENTER(zfsvfs). 92 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 93 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 94 * can return EIO from the calling function. 95 * 96 * (2) VN_RELE() should always be the last thing except for zil_commit() 97 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 98 * First, if it's the last reference, the vnode/znode 99 * can be freed, so the zp may point to freed memory. Second, the last 100 * reference will call zfs_zinactive(), which may induce a lot of work -- 101 * pushing cached pages (which acquires range locks) and syncing out 102 * cached atime changes. Third, zfs_zinactive() may require a new tx, 103 * which could deadlock the system if you were already holding one. 104 * 105 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 106 * as they can span dmu_tx_assign() calls. 
107 * 108 * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). 109 * In normal operation, this will be TXG_NOWAIT. During ZIL replay, 110 * it will be a specific txg. Either way, dmu_tx_assign() never blocks. 111 * This is critical because we don't want to block while holding locks. 112 * Note, in particular, that if a lock is sometimes acquired before 113 * the tx assigns, and sometimes after (e.g. z_lock), then failing to 114 * use a non-blocking assign can deadlock the system. The scenario: 115 * 116 * Thread A has grabbed a lock before calling dmu_tx_assign(). 117 * Thread B is in an already-assigned tx, and blocks for this lock. 118 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 119 * forever, because the previous txg can't quiesce until B's tx commits. 120 * 121 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 122 * then drop all locks, call dmu_tx_wait(), and try again. 123 * 124 * (5) If the operation succeeded, generate the intent log entry for it 125 * before dropping locks. This ensures that the ordering of events 126 * in the intent log matches the order in which they actually occurred. 127 * 128 * (6) At the end of each vnode op, the DMU tx must always commit, 129 * regardless of whether there were any errors. 130 * 131 * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) 132 * to ensure that synchronous semantics are provided when necessary. 133 * 134 * In general, this is how things should be ordered in each vnode op: 135 * 136 * ZFS_ENTER(zfsvfs); // exit if unmounted 137 * top: 138 * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may VN_HOLD()) 139 * rw_enter(...); // grab any other locks you need 140 * tx = dmu_tx_create(...); // get DMU tx 141 * dmu_tx_hold_*(); // hold each object you might modify 142 * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign 143 * if (error) { 144 * rw_exit(...); // drop locks 145 * zfs_dirent_unlock(dl); // unlock directory entry 146 * VN_RELE(...); // release held vnodes 147 * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 148 * dmu_tx_wait(tx); 149 * dmu_tx_abort(tx); 150 * goto top; 151 * } 152 * dmu_tx_abort(tx); // abort DMU tx 153 * ZFS_EXIT(zfsvfs); // finished in zfs 154 * return (error); // really out of space 155 * } 156 * error = do_real_work(); // do whatever this VOP does 157 * if (error == 0) 158 * zfs_log_*(...); // on success, make ZIL entry 159 * dmu_tx_commit(tx); // commit DMU tx -- error or not 160 * rw_exit(...); // drop locks 161 * zfs_dirent_unlock(dl); // unlock directory entry 162 * VN_RELE(...); // release held vnodes 163 * zil_commit(zilog, seq, foid); // synchronous when necessary 164 * ZFS_EXIT(zfsvfs); // finished in zfs 165 * return (error); // done, report error 166 */ 167 168 /* ARGSUSED */ 169 static int 170 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 171 { 172 znode_t *zp = VTOZ(*vpp); 173 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 174 175 ZFS_ENTER(zfsvfs); 176 ZFS_VERIFY_ZP(zp); 177 178 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 179 ((flag & FAPPEND) == 0)) { 180 ZFS_EXIT(zfsvfs); 181 return (EPERM); 182 } 183 184 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 185 ZTOV(zp)->v_type == VREG && 186 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 187 zp->z_phys->zp_size > 0) { 188 if (fs_vscan(*vpp, cr, 0) != 0) { 189 ZFS_EXIT(zfsvfs); 190 return (EACCES); 191 } 192 } 193 194 /* Keep a count of the synchronous opens in the znode */ 195 if (flag & (FSYNC | FDSYNC)) 196 atomic_inc_32(&zp->z_sync_cnt); 197 198 ZFS_EXIT(zfsvfs); 199 return 
(0); 200 } 201 202 /* ARGSUSED */ 203 static int 204 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 205 caller_context_t *ct) 206 { 207 znode_t *zp = VTOZ(vp); 208 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 209 210 ZFS_ENTER(zfsvfs); 211 ZFS_VERIFY_ZP(zp); 212 213 /* Decrement the synchronous opens in the znode */ 214 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 215 atomic_dec_32(&zp->z_sync_cnt); 216 217 /* 218 * Clean up any locks held by this process on the vp. 219 */ 220 cleanlocks(vp, ddi_get_pid(), 0); 221 cleanshares(vp, ddi_get_pid()); 222 223 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 224 ZTOV(zp)->v_type == VREG && 225 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 226 zp->z_phys->zp_size > 0) 227 VERIFY(fs_vscan(vp, cr, 1) == 0); 228 229 ZFS_EXIT(zfsvfs); 230 return (0); 231 } 232 233 /* 234 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 235 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 236 */ 237 static int 238 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 239 { 240 znode_t *zp = VTOZ(vp); 241 uint64_t noff = (uint64_t)*off; /* new offset */ 242 uint64_t file_sz; 243 int error; 244 boolean_t hole; 245 246 file_sz = zp->z_phys->zp_size; 247 if (noff >= file_sz) { 248 return (ENXIO); 249 } 250 251 if (cmd == _FIO_SEEK_HOLE) 252 hole = B_TRUE; 253 else 254 hole = B_FALSE; 255 256 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 257 258 /* end of file? */ 259 if ((error == ESRCH) || (noff > file_sz)) { 260 /* 261 * Handle the virtual hole at the end of file. 
262 */ 263 if (hole) { 264 *off = file_sz; 265 return (0); 266 } 267 return (ENXIO); 268 } 269 270 if (noff < *off) 271 return (error); 272 *off = noff; 273 return (error); 274 } 275 276 /* ARGSUSED */ 277 static int 278 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 279 int *rvalp, caller_context_t *ct) 280 { 281 offset_t off; 282 int error; 283 zfsvfs_t *zfsvfs; 284 znode_t *zp; 285 286 switch (com) { 287 case _FIOFFS: 288 return (zfs_sync(vp->v_vfsp, 0, cred)); 289 290 /* 291 * The following two ioctls are used by bfu. Faking out, 292 * necessary to avoid bfu errors. 293 */ 294 case _FIOGDIO: 295 case _FIOSDIO: 296 return (0); 297 298 case _FIO_SEEK_DATA: 299 case _FIO_SEEK_HOLE: 300 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 301 return (EFAULT); 302 303 zp = VTOZ(vp); 304 zfsvfs = zp->z_zfsvfs; 305 ZFS_ENTER(zfsvfs); 306 ZFS_VERIFY_ZP(zp); 307 308 /* offset parameter is in/out */ 309 error = zfs_holey(vp, com, &off); 310 ZFS_EXIT(zfsvfs); 311 if (error) 312 return (error); 313 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 314 return (EFAULT); 315 return (0); 316 } 317 return (ENOTTY); 318 } 319 320 /* 321 * Utility functions to map and unmap a single physical page. These 322 * are used to manage the mappable copies of ZFS file data, and therefore 323 * do not update ref/mod bits. 324 */ 325 caddr_t 326 zfs_map_page(page_t *pp, enum seg_rw rw) 327 { 328 if (kpm_enable) 329 return (hat_kpm_mapin(pp, 0)); 330 ASSERT(rw == S_READ || rw == S_WRITE); 331 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), 332 (caddr_t)-1)); 333 } 334 335 void 336 zfs_unmap_page(page_t *pp, caddr_t addr) 337 { 338 if (kpm_enable) { 339 hat_kpm_mapout(pp, 0, addr); 340 } else { 341 ppmapout(addr); 342 } 343 } 344 345 /* 346 * When a file is memory mapped, we must keep the IO data synchronized 347 * between the DMU cache and the memory mapped pages. 
What this means: 348 * 349 * On Write: If we find a memory mapped page, we write to *both* 350 * the page and the dmu buffer. 351 * 352 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 353 * the file is memory mapped. 354 */ 355 static int 356 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) 357 { 358 znode_t *zp = VTOZ(vp); 359 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 360 int64_t start, off; 361 int len = nbytes; 362 int error = 0; 363 364 start = uio->uio_loffset; 365 off = start & PAGEOFFSET; 366 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 367 page_t *pp; 368 uint64_t bytes = MIN(PAGESIZE - off, len); 369 uint64_t woff = uio->uio_loffset; 370 371 /* 372 * We don't want a new page to "appear" in the middle of 373 * the file update (because it may not get the write 374 * update data), so we grab a lock to block 375 * zfs_getpage(). 376 */ 377 rw_enter(&zp->z_map_lock, RW_WRITER); 378 if (pp = page_lookup(vp, start, SE_SHARED)) { 379 caddr_t va; 380 381 rw_exit(&zp->z_map_lock); 382 va = zfs_map_page(pp, S_WRITE); 383 error = uiomove(va+off, bytes, UIO_WRITE, uio); 384 if (error == 0) { 385 dmu_write(zfsvfs->z_os, zp->z_id, 386 woff, bytes, va+off, tx); 387 } 388 zfs_unmap_page(pp, va); 389 page_unlock(pp); 390 } else { 391 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 392 uio, bytes, tx); 393 rw_exit(&zp->z_map_lock); 394 } 395 len -= bytes; 396 off = 0; 397 if (error) 398 break; 399 } 400 return (error); 401 } 402 403 /* 404 * When a file is memory mapped, we must keep the IO data synchronized 405 * between the DMU cache and the memory mapped pages. What this means: 406 * 407 * On Read: We "read" preferentially from memory mapped pages, 408 * else we default from the dmu buffer. 409 * 410 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 411 * the file is memory mapped. 
412 */ 413 static int 414 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 415 { 416 znode_t *zp = VTOZ(vp); 417 objset_t *os = zp->z_zfsvfs->z_os; 418 int64_t start, off; 419 int len = nbytes; 420 int error = 0; 421 422 start = uio->uio_loffset; 423 off = start & PAGEOFFSET; 424 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 425 page_t *pp; 426 uint64_t bytes = MIN(PAGESIZE - off, len); 427 428 if (pp = page_lookup(vp, start, SE_SHARED)) { 429 caddr_t va; 430 431 va = zfs_map_page(pp, S_READ); 432 error = uiomove(va + off, bytes, UIO_READ, uio); 433 zfs_unmap_page(pp, va); 434 page_unlock(pp); 435 } else { 436 error = dmu_read_uio(os, zp->z_id, uio, bytes); 437 } 438 len -= bytes; 439 off = 0; 440 if (error) 441 break; 442 } 443 return (error); 444 } 445 446 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 447 448 /* 449 * Read bytes from specified file into supplied buffer. 450 * 451 * IN: vp - vnode of file to be read from. 452 * uio - structure supplying read location, range info, 453 * and return buffer. 454 * ioflag - SYNC flags; used to provide FRSYNC semantics. 455 * cr - credentials of caller. 456 * ct - caller context 457 * 458 * OUT: uio - updated offset and range, buffer filled. 
459 * 460 * RETURN: 0 if success 461 * error code if failure 462 * 463 * Side Effects: 464 * vp - atime updated if byte count > 0 465 */ 466 /* ARGSUSED */ 467 static int 468 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 469 { 470 znode_t *zp = VTOZ(vp); 471 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 472 objset_t *os; 473 ssize_t n, nbytes; 474 int error; 475 rl_t *rl; 476 477 ZFS_ENTER(zfsvfs); 478 ZFS_VERIFY_ZP(zp); 479 os = zfsvfs->z_os; 480 481 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 482 ZFS_EXIT(zfsvfs); 483 return (EACCES); 484 } 485 486 /* 487 * Validate file offset 488 */ 489 if (uio->uio_loffset < (offset_t)0) { 490 ZFS_EXIT(zfsvfs); 491 return (EINVAL); 492 } 493 494 /* 495 * Fasttrack empty reads 496 */ 497 if (uio->uio_resid == 0) { 498 ZFS_EXIT(zfsvfs); 499 return (0); 500 } 501 502 /* 503 * Check for mandatory locks 504 */ 505 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 506 if (error = chklock(vp, FREAD, 507 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 508 ZFS_EXIT(zfsvfs); 509 return (error); 510 } 511 } 512 513 /* 514 * If we're in FRSYNC mode, sync out this znode before reading it. 515 */ 516 if (ioflag & FRSYNC) 517 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 518 519 /* 520 * Lock the range against changes. 521 */ 522 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 523 524 /* 525 * If we are reading past end-of-file we can skip 526 * to the end; but we might still need to set atime. 
527 */ 528 if (uio->uio_loffset >= zp->z_phys->zp_size) { 529 error = 0; 530 goto out; 531 } 532 533 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 534 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 535 536 while (n > 0) { 537 nbytes = MIN(n, zfs_read_chunk_size - 538 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 539 540 if (vn_has_cached_data(vp)) 541 error = mappedread(vp, nbytes, uio); 542 else 543 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 544 if (error) { 545 /* convert checksum errors into IO errors */ 546 if (error == ECKSUM) 547 error = EIO; 548 break; 549 } 550 551 n -= nbytes; 552 } 553 554 out: 555 zfs_range_unlock(rl); 556 557 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 558 ZFS_EXIT(zfsvfs); 559 return (error); 560 } 561 562 /* 563 * Fault in the pages of the first n bytes specified by the uio structure. 564 * 1 byte in each page is touched and the uio struct is unmodified. 565 * Any error will exit this routine as this is only a best 566 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 567 */ 568 static void 569 zfs_prefault_write(ssize_t n, struct uio *uio) 570 { 571 struct iovec *iov; 572 ulong_t cnt, incr; 573 caddr_t p; 574 uint8_t tmp; 575 576 iov = uio->uio_iov; 577 578 while (n) { 579 cnt = MIN(iov->iov_len, n); 580 if (cnt == 0) { 581 /* empty iov entry */ 582 iov++; 583 continue; 584 } 585 n -= cnt; 586 /* 587 * touch each page in this segment. 588 */ 589 p = iov->iov_base; 590 while (cnt) { 591 switch (uio->uio_segflg) { 592 case UIO_USERSPACE: 593 case UIO_USERISPACE: 594 if (fuword8(p, &tmp)) 595 return; 596 break; 597 case UIO_SYSSPACE: 598 if (kcopy(p, &tmp, 1)) 599 return; 600 break; 601 } 602 incr = MIN(cnt, PAGESIZE); 603 p += incr; 604 cnt -= incr; 605 } 606 /* 607 * touch the last byte in case it straddles a page. 
608 */ 609 p--; 610 switch (uio->uio_segflg) { 611 case UIO_USERSPACE: 612 case UIO_USERISPACE: 613 if (fuword8(p, &tmp)) 614 return; 615 break; 616 case UIO_SYSSPACE: 617 if (kcopy(p, &tmp, 1)) 618 return; 619 break; 620 } 621 iov++; 622 } 623 } 624 625 /* 626 * Write the bytes to a file. 627 * 628 * IN: vp - vnode of file to be written to. 629 * uio - structure supplying write location, range info, 630 * and data buffer. 631 * ioflag - FAPPEND flag set if in append mode. 632 * cr - credentials of caller. 633 * ct - caller context (NFS/CIFS fem monitor only) 634 * 635 * OUT: uio - updated offset and range. 636 * 637 * RETURN: 0 if success 638 * error code if failure 639 * 640 * Timestamps: 641 * vp - ctime|mtime updated if byte count > 0 642 */ 643 /* ARGSUSED */ 644 static int 645 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 646 { 647 znode_t *zp = VTOZ(vp); 648 rlim64_t limit = uio->uio_llimit; 649 ssize_t start_resid = uio->uio_resid; 650 ssize_t tx_bytes; 651 uint64_t end_size; 652 dmu_tx_t *tx; 653 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 654 zilog_t *zilog; 655 offset_t woff; 656 ssize_t n, nbytes; 657 rl_t *rl; 658 int max_blksz = zfsvfs->z_max_blksz; 659 uint64_t pflags; 660 int error; 661 662 /* 663 * Fasttrack empty write 664 */ 665 n = start_resid; 666 if (n == 0) 667 return (0); 668 669 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 670 limit = MAXOFFSET_T; 671 672 ZFS_ENTER(zfsvfs); 673 ZFS_VERIFY_ZP(zp); 674 675 /* 676 * If immutable or not appending then return EPERM 677 */ 678 pflags = zp->z_phys->zp_flags; 679 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 680 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 681 (uio->uio_loffset < zp->z_phys->zp_size))) { 682 ZFS_EXIT(zfsvfs); 683 return (EPERM); 684 } 685 686 zilog = zfsvfs->z_log; 687 688 /* 689 * Pre-fault the pages to ensure slow (eg NFS) pages 690 * don't hold up txg. 
691 */ 692 zfs_prefault_write(n, uio); 693 694 /* 695 * If in append mode, set the io offset pointer to eof. 696 */ 697 if (ioflag & FAPPEND) { 698 /* 699 * Range lock for a file append: 700 * The value for the start of range will be determined by 701 * zfs_range_lock() (to guarantee append semantics). 702 * If this write will cause the block size to increase, 703 * zfs_range_lock() will lock the entire file, so we must 704 * later reduce the range after we grow the block size. 705 */ 706 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 707 if (rl->r_len == UINT64_MAX) { 708 /* overlocked, zp_size can't change */ 709 woff = uio->uio_loffset = zp->z_phys->zp_size; 710 } else { 711 woff = uio->uio_loffset = rl->r_off; 712 } 713 } else { 714 woff = uio->uio_loffset; 715 /* 716 * Validate file offset 717 */ 718 if (woff < 0) { 719 ZFS_EXIT(zfsvfs); 720 return (EINVAL); 721 } 722 723 /* 724 * If we need to grow the block size then zfs_range_lock() 725 * will lock a wider range than we request here. 726 * Later after growing the block size we reduce the range. 727 */ 728 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 729 } 730 731 if (woff >= limit) { 732 zfs_range_unlock(rl); 733 ZFS_EXIT(zfsvfs); 734 return (EFBIG); 735 } 736 737 if ((woff + n) > limit || woff > (limit - n)) 738 n = limit - woff; 739 740 /* 741 * Check for mandatory locks 742 */ 743 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 744 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 745 zfs_range_unlock(rl); 746 ZFS_EXIT(zfsvfs); 747 return (error); 748 } 749 end_size = MAX(zp->z_phys->zp_size, woff + n); 750 751 /* 752 * Write the file in reasonable size chunks. Each chunk is written 753 * in a separate transaction; this keeps the intent log records small 754 * and allows us to do more fine-grained space accounting. 755 */ 756 while (n > 0) { 757 /* 758 * Start a transaction. 
759 */ 760 woff = uio->uio_loffset; 761 tx = dmu_tx_create(zfsvfs->z_os); 762 dmu_tx_hold_bonus(tx, zp->z_id); 763 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 764 error = dmu_tx_assign(tx, zfsvfs->z_assign); 765 if (error) { 766 if (error == ERESTART && 767 zfsvfs->z_assign == TXG_NOWAIT) { 768 dmu_tx_wait(tx); 769 dmu_tx_abort(tx); 770 continue; 771 } 772 dmu_tx_abort(tx); 773 break; 774 } 775 776 /* 777 * If zfs_range_lock() over-locked we grow the blocksize 778 * and then reduce the lock range. This will only happen 779 * on the first iteration since zfs_range_reduce() will 780 * shrink down r_len to the appropriate size. 781 */ 782 if (rl->r_len == UINT64_MAX) { 783 uint64_t new_blksz; 784 785 if (zp->z_blksz > max_blksz) { 786 ASSERT(!ISP2(zp->z_blksz)); 787 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 788 } else { 789 new_blksz = MIN(end_size, max_blksz); 790 } 791 zfs_grow_blocksize(zp, new_blksz, tx); 792 zfs_range_reduce(rl, woff, n); 793 } 794 795 /* 796 * XXX - should we really limit each write to z_max_blksz? 797 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 798 */ 799 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 800 rw_enter(&zp->z_map_lock, RW_READER); 801 802 tx_bytes = uio->uio_resid; 803 if (vn_has_cached_data(vp)) { 804 rw_exit(&zp->z_map_lock); 805 error = mappedwrite(vp, nbytes, uio, tx); 806 } else { 807 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 808 uio, nbytes, tx); 809 rw_exit(&zp->z_map_lock); 810 } 811 tx_bytes -= uio->uio_resid; 812 813 /* 814 * If we made no progress, we're done. If we made even 815 * partial progress, update the znode and ZIL accordingly. 816 */ 817 if (tx_bytes == 0) { 818 dmu_tx_commit(tx); 819 ASSERT(error != 0); 820 break; 821 } 822 823 /* 824 * Clear Set-UID/Set-GID bits on successful write if not 825 * privileged and at least one of the excute bits is set. 
826 * 827 * It would be nice to to this after all writes have 828 * been done, but that would still expose the ISUID/ISGID 829 * to another app after the partial write is committed. 830 * 831 * Note: we don't call zfs_fuid_map_id() here because 832 * user 0 is not an ephemeral uid. 833 */ 834 mutex_enter(&zp->z_acl_lock); 835 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 836 (S_IXUSR >> 6))) != 0 && 837 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 838 secpolicy_vnode_setid_retain(cr, 839 (zp->z_phys->zp_mode & S_ISUID) != 0 && 840 zp->z_phys->zp_uid == 0) != 0) { 841 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 842 } 843 mutex_exit(&zp->z_acl_lock); 844 845 /* 846 * Update time stamp. NOTE: This marks the bonus buffer as 847 * dirty, so we don't have to do it again for zp_size. 848 */ 849 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 850 851 /* 852 * Update the file size (zp_size) if it has changed; 853 * account for possible concurrent updates. 854 */ 855 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 856 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 857 uio->uio_loffset); 858 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 859 dmu_tx_commit(tx); 860 861 if (error != 0) 862 break; 863 ASSERT(tx_bytes == nbytes); 864 n -= nbytes; 865 } 866 867 zfs_range_unlock(rl); 868 869 /* 870 * If we're in replay mode, or we made no progress, return error. 871 * Otherwise, it's at least a partial write, so it's successful. 
872 */ 873 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 874 ZFS_EXIT(zfsvfs); 875 return (error); 876 } 877 878 if (ioflag & (FSYNC | FDSYNC)) 879 zil_commit(zilog, zp->z_last_itx, zp->z_id); 880 881 ZFS_EXIT(zfsvfs); 882 return (0); 883 } 884 885 void 886 zfs_get_done(dmu_buf_t *db, void *vzgd) 887 { 888 zgd_t *zgd = (zgd_t *)vzgd; 889 rl_t *rl = zgd->zgd_rl; 890 vnode_t *vp = ZTOV(rl->r_zp); 891 892 dmu_buf_rele(db, vzgd); 893 zfs_range_unlock(rl); 894 VN_RELE(vp); 895 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 896 kmem_free(zgd, sizeof (zgd_t)); 897 } 898 899 /* 900 * Get data to generate a TX_WRITE intent log record. 901 */ 902 int 903 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 904 { 905 zfsvfs_t *zfsvfs = arg; 906 objset_t *os = zfsvfs->z_os; 907 znode_t *zp; 908 uint64_t off = lr->lr_offset; 909 dmu_buf_t *db; 910 rl_t *rl; 911 zgd_t *zgd; 912 int dlen = lr->lr_length; /* length of user data */ 913 int error = 0; 914 915 ASSERT(zio); 916 ASSERT(dlen != 0); 917 918 /* 919 * Nothing to do if the file has been removed 920 */ 921 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 922 return (ENOENT); 923 if (zp->z_unlinked) { 924 VN_RELE(ZTOV(zp)); 925 return (ENOENT); 926 } 927 928 /* 929 * Write records come in two flavors: immediate and indirect. 930 * For small writes it's cheaper to store the data with the 931 * log record (immediate); for large writes it's cheaper to 932 * sync the data and get a pointer to it (indirect) so that 933 * we don't have to write the data twice. 
934 */ 935 if (buf != NULL) { /* immediate write */ 936 rl = zfs_range_lock(zp, off, dlen, RL_READER); 937 /* test for truncation needs to be done while range locked */ 938 if (off >= zp->z_phys->zp_size) { 939 error = ENOENT; 940 goto out; 941 } 942 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 943 } else { /* indirect write */ 944 uint64_t boff; /* block starting offset */ 945 946 /* 947 * Have to lock the whole block to ensure when it's 948 * written out and it's checksum is being calculated 949 * that no one can change the data. We need to re-check 950 * blocksize after we get the lock in case it's changed! 951 */ 952 for (;;) { 953 if (ISP2(zp->z_blksz)) { 954 boff = P2ALIGN_TYPED(off, zp->z_blksz, 955 uint64_t); 956 } else { 957 boff = 0; 958 } 959 dlen = zp->z_blksz; 960 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 961 if (zp->z_blksz == dlen) 962 break; 963 zfs_range_unlock(rl); 964 } 965 /* test for truncation needs to be done while range locked */ 966 if (off >= zp->z_phys->zp_size) { 967 error = ENOENT; 968 goto out; 969 } 970 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 971 zgd->zgd_rl = rl; 972 zgd->zgd_zilog = zfsvfs->z_log; 973 zgd->zgd_bp = &lr->lr_blkptr; 974 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 975 ASSERT(boff == db->db_offset); 976 lr->lr_blkoff = off - boff; 977 error = dmu_sync(zio, db, &lr->lr_blkptr, 978 lr->lr_common.lrc_txg, zfs_get_done, zgd); 979 ASSERT((error && error != EINPROGRESS) || 980 lr->lr_length <= zp->z_blksz); 981 if (error == 0) 982 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 983 /* 984 * If we get EINPROGRESS, then we need to wait for a 985 * write IO initiated by dmu_sync() to complete before 986 * we can release this dbuf. We will finish everything 987 * up in the zfs_get_done() callback. 
988 */ 989 if (error == EINPROGRESS) 990 return (0); 991 dmu_buf_rele(db, zgd); 992 kmem_free(zgd, sizeof (zgd_t)); 993 } 994 out: 995 zfs_range_unlock(rl); 996 VN_RELE(ZTOV(zp)); 997 return (error); 998 } 999 1000 /*ARGSUSED*/ 1001 static int 1002 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1003 caller_context_t *ct) 1004 { 1005 znode_t *zp = VTOZ(vp); 1006 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1007 int error; 1008 1009 ZFS_ENTER(zfsvfs); 1010 ZFS_VERIFY_ZP(zp); 1011 1012 if (flag & V_ACE_MASK) 1013 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1014 else 1015 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1016 1017 ZFS_EXIT(zfsvfs); 1018 return (error); 1019 } 1020 1021 /* 1022 * Lookup an entry in a directory, or an extended attribute directory. 1023 * If it exists, return a held vnode reference for it. 1024 * 1025 * IN: dvp - vnode of directory to search. 1026 * nm - name of entry to lookup. 1027 * pnp - full pathname to lookup [UNUSED]. 1028 * flags - LOOKUP_XATTR set if looking for an attribute. 1029 * rdir - root directory vnode [UNUSED]. 1030 * cr - credentials of caller. 1031 * ct - caller context 1032 * direntflags - directory lookup flags 1033 * realpnp - returned pathname. 1034 * 1035 * OUT: vpp - vnode of located entry, NULL if not found. 1036 * 1037 * RETURN: 0 if success 1038 * error code if failure 1039 * 1040 * Timestamps: 1041 * NA 1042 */ 1043 /* ARGSUSED */ 1044 static int 1045 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1046 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1047 int *direntflags, pathname_t *realpnp) 1048 { 1049 znode_t *zdp = VTOZ(dvp); 1050 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1051 int error; 1052 1053 ZFS_ENTER(zfsvfs); 1054 ZFS_VERIFY_ZP(zdp); 1055 1056 *vpp = NULL; 1057 1058 if (flags & LOOKUP_XATTR) { 1059 /* 1060 * If the xattr property is off, refuse the lookup request. 
1061 */ 1062 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1063 ZFS_EXIT(zfsvfs); 1064 return (EINVAL); 1065 } 1066 1067 /* 1068 * We don't allow recursive attributes.. 1069 * Maybe someday we will. 1070 */ 1071 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 1072 ZFS_EXIT(zfsvfs); 1073 return (EINVAL); 1074 } 1075 1076 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1077 ZFS_EXIT(zfsvfs); 1078 return (error); 1079 } 1080 1081 /* 1082 * Do we have permission to get into attribute directory? 1083 */ 1084 1085 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1086 B_FALSE, cr)) { 1087 VN_RELE(*vpp); 1088 *vpp = NULL; 1089 } 1090 1091 ZFS_EXIT(zfsvfs); 1092 return (error); 1093 } 1094 1095 if (dvp->v_type != VDIR) { 1096 ZFS_EXIT(zfsvfs); 1097 return (ENOTDIR); 1098 } 1099 1100 /* 1101 * Check accessibility of directory. 1102 */ 1103 1104 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1105 ZFS_EXIT(zfsvfs); 1106 return (error); 1107 } 1108 1109 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1110 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1111 ZFS_EXIT(zfsvfs); 1112 return (EILSEQ); 1113 } 1114 1115 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1116 if (error == 0) { 1117 /* 1118 * Convert device special files 1119 */ 1120 if (IS_DEVVP(*vpp)) { 1121 vnode_t *svp; 1122 1123 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1124 VN_RELE(*vpp); 1125 if (svp == NULL) 1126 error = ENOSYS; 1127 else 1128 *vpp = svp; 1129 } 1130 } 1131 1132 ZFS_EXIT(zfsvfs); 1133 return (error); 1134 } 1135 1136 /* 1137 * Attempt to create a new entry in a directory. If the entry 1138 * already exists, truncate the file if permissible, else return 1139 * an error. Return the vp of the created or trunc'd file. 1140 * 1141 * IN: dvp - vnode of directory to put new file entry in. 1142 * name - name of new file entry. 1143 * vap - attributes of new file. 1144 * excl - flag indicating exclusive or non-exclusive mode. 
*		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- open flags; FIGNORECASE (case-insensitive lookup)
 *			  and FAPPEND (append-mode access check) are
 *			  examined here.
 *		ct	- caller context
 *		vsecp 	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	zfs_acl_t	*aclp = NULL;
	zfs_fuid_info_t *fuidp = NULL;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	/* Prefer the id from the owner SID when the cred carries one. */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* On UTF-8-only file systems reject names that fail validation. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	/*
	 * Retry point: we come back here when dmu_tx_assign() returns
	 * ERESTART under TXG_NOWAIT.  Note that aclp, if already built,
	 * survives the retry (see the "aclp == NULL" test below).
	 */
top:
	*vpp = NULL;

	/* Silently drop the sticky bit if the caller may not set it. */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			/* Any failure on ".." is reported as EISDIR. */
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			/* aclp may be left over from a previous pass. */
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
	}
	/* Convert the caller's ACL once; reuse it if we loop via "top". */
	if (vsecp && aclp == NULL) {
		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
		if (error) {
			/*
			 * NOTE(review): this return does not VN_RELE() a zp
			 * that was held above (VN_HOLD(dvp) or the possible
			 * hold from zfs_dirent_lock()) -- confirm whether
			 * this leaks a vnode hold.
			 */
			ZFS_EXIT(zfsvfs);
			if (dl)
				zfs_dirent_unlock(dl);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		/*
		 * Declare everything the transaction may touch: the new
		 * object's bonus buffer, the FUID table (created on demand
		 * if z_fuid_obj is still 0), the parent directory's bonus
		 * buffer and ZAP entry, and room for an ACL on the new
		 * object when one is inherited or supplied.
		 */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
		    IS_EPHEMERAL(gid)) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* Drop the dirent lock before waiting or bailing. */
			zfs_dirent_unlock(dl);
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (error);
		}
		/* Create the znode, link it into dzp, and log the create. */
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, fuidp, vap);
		if (fuidp)
			zfs_fuid_info_free(fuidp);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		/* Bump the parent's sequence number under its z_lock. */
		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
	/*
	 * Common exit: release the dirent lock if still held, then either
	 * drop our hold on zp (error) or hand the vnode back through *vpp.
	 */
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		/*
		 * If vnode is for a device return a specfs vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL) {
				error = ENOSYS;
			}
			*vpp = svp;
		}
	}
	if (aclp)
		zfs_acl_free(aclp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
*		ct	- caller context
 *		flags	- case flags; FIGNORECASE requests a
 *			  case-insensitive lookup of name.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * For case-insensitive removes, allocate a pathname buffer that is
	 * passed to zfs_dirent_lock() and later used for dnlc_remove().
	 */
	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

	/* Retry point after dmu_tx_assign() returns ERESTART (TXG_NOWAIT). */
top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	/* Purge the name cache entry, using the real name if we have it. */
	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/*
	 * First (advisory) check: we hold the only reference and there are
	 * no cached pages.  This is re-checked after the link is destroyed.
	 */
	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Undo the dirent lock and our hold before retrying/failing. */
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		/* The tx was assigned, so it must be committed, not aborted. */
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * Re-evaluate under v_lock: immediate deletion is only allowed if
	 * nothing changed since the holds were taken (still sole reference,
	 * no cached data, same xattr and external ACL objects) and the file
	 * was not flagged as too big.
	 */
	if (unlinked) {
		mutex_enter(&vp->v_lock);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (zp->z_phys->zp_xattr) {
			/*
			 * Mark the xattr directory unlinked and queue it in
			 * the unlinked set; its vnode is released at "out".
			 */
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		/*
		 * Drop our reference by hand (v_count goes to 0) and delete
		 * the znode inside this transaction; VN_RELE(vp) is skipped
		 * at "out" in this case.
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
1605 * ct - caller context 1606 * vsecp - ACL to be set 1607 * 1608 * OUT: vpp - vnode of created directory. 1609 * 1610 * RETURN: 0 if success 1611 * error code if failure 1612 * 1613 * Timestamps: 1614 * dvp - ctime|mtime updated 1615 * vp - ctime|mtime|atime updated 1616 */ 1617 /*ARGSUSED*/ 1618 static int 1619 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1620 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1621 { 1622 znode_t *zp, *dzp = VTOZ(dvp); 1623 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1624 zilog_t *zilog; 1625 zfs_dirlock_t *dl; 1626 uint64_t txtype; 1627 dmu_tx_t *tx; 1628 int error; 1629 zfs_acl_t *aclp = NULL; 1630 zfs_fuid_info_t *fuidp = NULL; 1631 int zf = ZNEW; 1632 ksid_t *ksid; 1633 uid_t uid; 1634 gid_t gid = crgetgid(cr); 1635 1636 ASSERT(vap->va_type == VDIR); 1637 1638 /* 1639 * If we have an ephemeral id, ACL, or XVATTR then 1640 * make sure file system is at proper version 1641 */ 1642 1643 ksid = crgetsid(cr, KSID_OWNER); 1644 if (ksid) 1645 uid = ksid_getid(ksid); 1646 else 1647 uid = crgetuid(cr); 1648 if (zfsvfs->z_use_fuids == B_FALSE && 1649 (vsecp || (vap->va_mask & AT_XVATTR) || 1650 IS_EPHEMERAL(uid)) || IS_EPHEMERAL(gid)) 1651 return (EINVAL); 1652 1653 ZFS_ENTER(zfsvfs); 1654 ZFS_VERIFY_ZP(dzp); 1655 zilog = zfsvfs->z_log; 1656 1657 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1658 ZFS_EXIT(zfsvfs); 1659 return (EINVAL); 1660 } 1661 1662 if (zfsvfs->z_utf8 && u8_validate(dirname, 1663 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1664 ZFS_EXIT(zfsvfs); 1665 return (EILSEQ); 1666 } 1667 if (flags & FIGNORECASE) 1668 zf |= ZCILOOK; 1669 1670 if (vap->va_mask & AT_XVATTR) 1671 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1672 crgetuid(cr), cr, vap->va_type)) != 0) { 1673 ZFS_EXIT(zfsvfs); 1674 return (error); 1675 } 1676 1677 /* 1678 * First make sure the new directory doesn't exist. 
1679 */ 1680 top: 1681 *vpp = NULL; 1682 1683 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1684 NULL, NULL)) { 1685 ZFS_EXIT(zfsvfs); 1686 return (error); 1687 } 1688 1689 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1690 zfs_dirent_unlock(dl); 1691 ZFS_EXIT(zfsvfs); 1692 return (error); 1693 } 1694 1695 if (vsecp && aclp == NULL) { 1696 error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); 1697 if (error) { 1698 zfs_dirent_unlock(dl); 1699 ZFS_EXIT(zfsvfs); 1700 return (error); 1701 } 1702 } 1703 /* 1704 * Add a new entry to the directory. 1705 */ 1706 tx = dmu_tx_create(zfsvfs->z_os); 1707 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1708 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1709 if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || 1710 IS_EPHEMERAL(gid)) { 1711 if (zfsvfs->z_fuid_obj == 0) { 1712 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1713 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1714 FUID_SIZE_ESTIMATE(zfsvfs)); 1715 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 1716 } else { 1717 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1718 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1719 FUID_SIZE_ESTIMATE(zfsvfs)); 1720 } 1721 } 1722 if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) 1723 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1724 0, SPA_MAXBLOCKSIZE); 1725 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1726 if (error) { 1727 zfs_dirent_unlock(dl); 1728 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1729 dmu_tx_wait(tx); 1730 dmu_tx_abort(tx); 1731 goto top; 1732 } 1733 dmu_tx_abort(tx); 1734 ZFS_EXIT(zfsvfs); 1735 if (aclp) 1736 zfs_acl_free(aclp); 1737 return (error); 1738 } 1739 1740 /* 1741 * Create new node. 1742 */ 1743 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); 1744 1745 if (aclp) 1746 zfs_acl_free(aclp); 1747 1748 /* 1749 * Now put new name in parent dir. 
1750 */ 1751 (void) zfs_link_create(dl, zp, tx, ZNEW); 1752 1753 *vpp = ZTOV(zp); 1754 1755 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1756 if (flags & FIGNORECASE) 1757 txtype |= TX_CI; 1758 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); 1759 1760 if (fuidp) 1761 zfs_fuid_info_free(fuidp); 1762 dmu_tx_commit(tx); 1763 1764 zfs_dirent_unlock(dl); 1765 1766 ZFS_EXIT(zfsvfs); 1767 return (0); 1768 } 1769 1770 /* 1771 * Remove a directory subdir entry. If the current working 1772 * directory is the same as the subdir to be removed, the 1773 * remove will fail. 1774 * 1775 * IN: dvp - vnode of directory to remove from. 1776 * name - name of directory to be removed. 1777 * cwd - vnode of current working directory. 1778 * cr - credentials of caller. 1779 * ct - caller context 1780 * flags - case flags 1781 * 1782 * RETURN: 0 if success 1783 * error code if failure 1784 * 1785 * Timestamps: 1786 * dvp - ctime|mtime updated 1787 */ 1788 /*ARGSUSED*/ 1789 static int 1790 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 1791 caller_context_t *ct, int flags) 1792 { 1793 znode_t *dzp = VTOZ(dvp); 1794 znode_t *zp; 1795 vnode_t *vp; 1796 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1797 zilog_t *zilog; 1798 zfs_dirlock_t *dl; 1799 dmu_tx_t *tx; 1800 int error; 1801 int zflg = ZEXISTS; 1802 1803 ZFS_ENTER(zfsvfs); 1804 ZFS_VERIFY_ZP(dzp); 1805 zilog = zfsvfs->z_log; 1806 1807 if (flags & FIGNORECASE) 1808 zflg |= ZCILOOK; 1809 top: 1810 zp = NULL; 1811 1812 /* 1813 * Attempt to lock directory; fail if entry doesn't exist. 
1814 */ 1815 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1816 NULL, NULL)) { 1817 ZFS_EXIT(zfsvfs); 1818 return (error); 1819 } 1820 1821 vp = ZTOV(zp); 1822 1823 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1824 goto out; 1825 } 1826 1827 if (vp->v_type != VDIR) { 1828 error = ENOTDIR; 1829 goto out; 1830 } 1831 1832 if (vp == cwd) { 1833 error = EINVAL; 1834 goto out; 1835 } 1836 1837 vnevent_rmdir(vp, dvp, name, ct); 1838 1839 /* 1840 * Grab a lock on the directory to make sure that noone is 1841 * trying to add (or lookup) entries while we are removing it. 1842 */ 1843 rw_enter(&zp->z_name_lock, RW_WRITER); 1844 1845 /* 1846 * Grab a lock on the parent pointer to make sure we play well 1847 * with the treewalk and directory rename code. 1848 */ 1849 rw_enter(&zp->z_parent_lock, RW_WRITER); 1850 1851 tx = dmu_tx_create(zfsvfs->z_os); 1852 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1853 dmu_tx_hold_bonus(tx, zp->z_id); 1854 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1855 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1856 if (error) { 1857 rw_exit(&zp->z_parent_lock); 1858 rw_exit(&zp->z_name_lock); 1859 zfs_dirent_unlock(dl); 1860 VN_RELE(vp); 1861 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1862 dmu_tx_wait(tx); 1863 dmu_tx_abort(tx); 1864 goto top; 1865 } 1866 dmu_tx_abort(tx); 1867 ZFS_EXIT(zfsvfs); 1868 return (error); 1869 } 1870 1871 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1872 1873 if (error == 0) { 1874 uint64_t txtype = TX_RMDIR; 1875 if (flags & FIGNORECASE) 1876 txtype |= TX_CI; 1877 zfs_log_remove(zilog, tx, txtype, dzp, name); 1878 } 1879 1880 dmu_tx_commit(tx); 1881 1882 rw_exit(&zp->z_parent_lock); 1883 rw_exit(&zp->z_name_lock); 1884 out: 1885 zfs_dirent_unlock(dl); 1886 1887 VN_RELE(vp); 1888 1889 ZFS_EXIT(zfsvfs); 1890 return (error); 1891 } 1892 1893 /* 1894 * Read as many directory entries as will fit into the provided 1895 * buffer from the given directory cursor position (specified in 1896 
* the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags; V_RDDIR_ENTFLAGS selects edirent_t
 *			  output (with conflict flags) instead of dirent64_t.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 *
	 * A single kernel-space iovec is filled in place; otherwise entries
	 * are staged in a temporary buffer (outbuf) and copied out with
	 * uiomove() after the loop.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	/* eodp starts at the same place; only one of odp/eodp is advanced. */
	eodp = (struct edirent *)odp;

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_phys->zp_parent;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT means end of directory, not failure */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */

			if (check_sysattrs && !zap.za_normalization_conflict) {
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
			}
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			/* NOTE: d_off is the offset for the *next* entry */
			next = &(odp->d_off);
			(void) strncpy(odp->d_name, zap.za_name,
			    DIRENT64_NAMELEN(reclen));
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 *
		 * Special entries just step the small integer offset; real
		 * ZAP entries advance the cursor and use its serialized form.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		*next = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Entries were written in place; just account for them. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

	/* Common cleanup; also reached directly on retrieval/format errors. */
update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Value stored in thread-specific data under zfs_fsyncer_key each time
 * zfs_fsync() is called.
 * NOTE(review): presumably a count consumed by the write/logging path to
 * treat the caller as an "fsyncer" for its next few operations -- confirm
 * against the reader of zfs_fsyncer_key.
 */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush a file to stable storage.
 *
 * For regular, non-swap files with cached pages (and unless FNODSYNC is
 * set in syncflag) an asynchronous VOP_PUTPAGE() of the whole file is
 * started first; then the intent log is committed for this znode.
 * Returns 0; the ZFS_ENTER/ZFS_VERIFY_ZP macros may return early.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Regardless of whether this is required for standards conformance,
	 * this is the logical behavior when fsync() is called on a file with
	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
	 * going to be pushed out as part of the zil_commit().
	 */
	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}


/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 *	IN:	vp	- vnode of file.
 *		vap	- va_mask identifies requested attributes.
 *			  If AT_XVATTR set, then optional attrs are requested
 *		flags	- ATTR_NOACLCHECK (CIFS server context)
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vap	- attribute values.
*
 *	RETURN:	0 if success
 *		error code if the ACE_READ_ATTRIBUTES access check fails
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	/* z_lock is held while the physical znode fields are read. */
	mutex_enter(&zp->z_lock);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = vp->v_type;
	vap->va_mode = pzp->zp_mode & MODEMASK;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* The root directory reports one extra link when .zfs is visible. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_rdev = vp->v_rdev;
	vap->va_seq = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 *
	 * Each optional attribute below follows the same pattern: if it was
	 * requested, report whether the matching zp_flags bit is set and
	 * mark the attribute as returned.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			/* Copy only if the bonus buffer can hold a stamp. */
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);

	mutex_exit(&zp->z_lock);

	/* Block size and block count come from the dbuf, outside z_lock. */
	dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 *	IN:	vp	- vnode of file to be modified.
 *		vap	- new attribute values.
 *			  If AT_XVATTR set, then optional attrs are being set
 *		flags	- ATTR_UTIME set if non-default time values provided.
 *			- ATTR_NOACLCHECK (CIFS context only).
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 *
 *	Notes:
 *		All permission checks run before the DMU transaction is
 *		created.  If dmu_tx_assign() returns ERESTART while the
 *		file system is using TXG_NOWAIT, the function waits for
 *		the tx and restarts from the "top" label, repeating the
 *		permission checks.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	int		trim_mask = 0;
	uint64_t	new_mode;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	/* Nothing requested: succeed without entering the file system. */
	if (mask == 0)
		return (0);

	/* Reject any attribute bit that is in the AT_NOSET set. */
	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/* A size change is never valid on a directory ... */
	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	/* ... and otherwise only on regular files and FIFOs. */
	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/* Files flagged ZFS_READONLY may not be resized. */
	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify that the requested timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32-bit syscalls can't
	 * handle times that do not fit in 32 bits (roughly anything
	 * past 2038).  This check should be removed once large
	 * timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

	/*
	 * Retry point: reached again when dmu_tx_assign() below returns
	 * ERESTART under TXG_NOWAIT.  The failed pass has already released
	 * its hold on attrzp and freed aclp before jumping back here.
	 */
top:
	attrzp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	/*
	 * need_policy is only ever tested for zero/non-zero, so a failed
	 * (non-zero) access check here simply routes the request through
	 * secpolicy_vnode_setattr() further down.
	 */
	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy =  TRUE;
			}
		} else {
			need_policy =  TRUE;
		}
	}

	/* Snapshot the current mode and owner ids under z_lock. */
	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * NOTE(review): && binds tighter than ||, so the
		 * "need_policy == FALSE" test guards only the
		 * XAT_APPENDONLY clause.  The outcome is unaffected because
		 * the body can only set need_policy to TRUE.
		 */
		if ((need_policy == FALSE) &&
		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
		    xoap->xoa_appendonly !=
		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
		    xoap->xoa_nounlink !=
		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
		    xoap->xoa_immutable !=
		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
		    xoap->xoa_nodump !=
		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
		    xoap->xoa_av_modified !=
		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
		    xoap->xoa_av_quarantined !=
		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		/*
		 * NOTE(review): on this error return va_mask is left with
		 * the trimmed bits removed; it is only restored on success.
		 */
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask)
			vap->va_mask |= saved_mask;
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	/*
	 * Build the transaction: declare every object that may be
	 * modified before calling dmu_tx_assign().
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
		/* z_fuid_obj == 0: hold a new object plus the master node. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		/* Keep the file-type bits; take everything else from vap. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1 */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	/*
	 * An owner/group change is mirrored onto the znode referenced by
	 * zp_xattr (see the zp_uid/zp_gid updates below), so take a hold
	 * on it and add its bonus buffer to the tx.
	 */
	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
		if (err) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			if (aclp)
				zfs_acl_free(aclp);
			return (err);
		}
		dmu_tx_hold_bonus(tx, attrzp->z_id);
	}

	err = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (err) {
		/* Undo everything acquired in this pass before retry/exit. */
		if (attrzp)
			VN_RELE(ZTOV(attrzp));

		if (aclp) {
			zfs_acl_free(aclp);
			aclp = NULL;
		}

		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
		ASSERT3U(err, ==, 0);
		mutex_exit(&zp->z_acl_lock);
	}

	/* attrzp->z_lock is taken while zp->z_lock is already held. */
	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = zfs_fuid_create(zfsvfs,
		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
		if (attrzp) {
			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
			    vap->va_uid,  cr, ZFS_OWNER, tx, &fuidp);
		}
	}

	if (mask & AT_GID) {
		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
		    cr, ZFS_GROUP, tx, &fuidp);
		if (attrzp)
			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
	}

	if (aclp)
		zfs_acl_free(aclp);

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	if (fuidp)
		zfs_fuid_info_free(fuidp);
	mutex_exit(&zp->z_lock);

	/* Release the attrzp hold only after all locks are dropped. */
	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	dmu_tx_commit(tx);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * One entry in the singly linked list of parent locks built by
 * zfs_rename_lock() and torn down by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 *
 * Walks the list headed by *zlpp.  For each entry it releases the vnode
 * hold (when a znode was recorded), exits the rwlock, unlinks the entry
 * and frees it.  *zlpp is NULL when the function returns.
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory; the walk starts here.
 *		sdzp	- source directory; the walk stops when it is reached.
 *
 *	OUT:	zlpp	- list of locks taken.  Entries are pushed before the
 *			  checks that can fail, so the list may be non-empty
 *			  on an error return as well; the caller releases it
 *			  with zfs_rename_unlock().
 *
 *	RETURN:	0 if success
 *		EINVAL if tdzp is szp or one of its descendants
 *		error code from zfs_zget() if a parent cannot be obtained
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * zl is the most recently pushed entry,
				 * i.e. the current list head (*zlpp).
				 * "continue" jumps to the loop condition
				 * with zp reset to tdzp; a READER pass is
				 * only reached when tdzp != sdzp, so the
				 * condition holds and the walk starts over.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			/* Remember the hold so zfs_rename_unlock() drops it. */
			zl->zl_znode = zp;
		}
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 *
 * Notes:
 *	Returns 0 without doing anything when both names resolve to the
 *	same object.  The whole lock/check/tx sequence restarts from "top"
 *	when dmu_tx_assign() returns ERESTART under TXG_NOWAIT.
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/* Source and target must live on the same file system. */
	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	/* On a UTF-8-only file system the new name must validate. */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/* Retry point for the ERESTART case of dmu_tx_assign() below. */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * Take the two directory-entry locks in the order chosen above.
	 * Whichever entry is locked second is passed ZRENAMING.
	 */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		/* A failed lookup of ".." is reported as EINVAL. */
		if (strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	/*
	 * NOTE(review): these events are posted before the tx is assigned,
	 * so they are posted again if the operation restarts from "top".
	 */
	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Release every lock and hold before retrying or failing. */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
	}

	/* The tx was assigned, so it is committed whatever error holds. */
	dmu_tx_commit(tx);
out:
	/* Common exit once both dirent locks are held. */
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		name	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		link	- Target path of new symlink.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	int		len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_fuid_info_t *fuidp = NULL;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* On a UTF-8-only file system the entry name must validate. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
	/* Retry point for the ERESTART case of dmu_tx_assign() below. */
top:
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * NOTE(review): len never changes, so this check is repeated
	 * unchanged on every retry.
	 */
	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
		/* z_fuid_obj == 0: hold a new object plus the master node. */
		if (zfsvfs->z_fuid_obj == 0) {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		} else {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		}
	}
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
		/* The link text sits right after the znode_phys_t. */
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
		    zp->z_id, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
	/*
	 * NOTE(review): no goto targets this label, and error is always 0
	 * here (it was last set by the successful dmu_tx_assign()), so the
	 * test below always succeeds.
	 */
out:
	if (error == 0) {
		uint64_t txtype = TX_SYMLINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	}
	if (fuidp)
		zfs_fuid_info_free(fuidp);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure to contain the link path.
3386 * 3387 * RETURN: 0 if success 3388 * error code if failure 3389 * 3390 * Timestamps: 3391 * vp - atime updated 3392 */ 3393 /* ARGSUSED */ 3394 static int 3395 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3396 { 3397 znode_t *zp = VTOZ(vp); 3398 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3399 size_t bufsz; 3400 int error; 3401 3402 ZFS_ENTER(zfsvfs); 3403 ZFS_VERIFY_ZP(zp); 3404 3405 bufsz = (size_t)zp->z_phys->zp_size; 3406 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3407 error = uiomove(zp->z_phys + 1, 3408 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3409 } else { 3410 dmu_buf_t *dbp; 3411 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3412 if (error) { 3413 ZFS_EXIT(zfsvfs); 3414 return (error); 3415 } 3416 error = uiomove(dbp->db_data, 3417 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3418 dmu_buf_rele(dbp, FTAG); 3419 } 3420 3421 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3422 ZFS_EXIT(zfsvfs); 3423 return (error); 3424 } 3425 3426 /* 3427 * Insert a new entry into directory tdvp referencing svp. 3428 * 3429 * IN: tdvp - Directory to contain new entry. 3430 * svp - vnode of new entry. 3431 * name - name of new entry. 3432 * cr - credentials of caller. 
3433 * ct - caller context 3434 * 3435 * RETURN: 0 if success 3436 * error code if failure 3437 * 3438 * Timestamps: 3439 * tdvp - ctime|mtime updated 3440 * svp - ctime updated 3441 */ 3442 /* ARGSUSED */ 3443 static int 3444 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3445 caller_context_t *ct, int flags) 3446 { 3447 znode_t *dzp = VTOZ(tdvp); 3448 znode_t *tzp, *szp; 3449 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3450 zilog_t *zilog; 3451 zfs_dirlock_t *dl; 3452 dmu_tx_t *tx; 3453 vnode_t *realvp; 3454 int error; 3455 int zf = ZNEW; 3456 uid_t owner; 3457 3458 ASSERT(tdvp->v_type == VDIR); 3459 3460 ZFS_ENTER(zfsvfs); 3461 ZFS_VERIFY_ZP(dzp); 3462 zilog = zfsvfs->z_log; 3463 3464 if (VOP_REALVP(svp, &realvp, ct) == 0) 3465 svp = realvp; 3466 3467 if (svp->v_vfsp != tdvp->v_vfsp) { 3468 ZFS_EXIT(zfsvfs); 3469 return (EXDEV); 3470 } 3471 szp = VTOZ(svp); 3472 ZFS_VERIFY_ZP(szp); 3473 3474 if (zfsvfs->z_utf8 && u8_validate(name, 3475 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3476 ZFS_EXIT(zfsvfs); 3477 return (EILSEQ); 3478 } 3479 if (flags & FIGNORECASE) 3480 zf |= ZCILOOK; 3481 3482 top: 3483 /* 3484 * We do not support links between attributes and non-attributes 3485 * because of the potential security risk of creating links 3486 * into "normal" file space in order to circumvent restrictions 3487 * imposed in attribute space. 3488 */ 3489 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 3490 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 3491 ZFS_EXIT(zfsvfs); 3492 return (EINVAL); 3493 } 3494 3495 /* 3496 * POSIX dictates that we return EPERM here. 3497 * Better choices include ENOTSUP or EISDIR. 
3498 */ 3499 if (svp->v_type == VDIR) { 3500 ZFS_EXIT(zfsvfs); 3501 return (EPERM); 3502 } 3503 3504 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); 3505 if (owner != crgetuid(cr) && 3506 secpolicy_basic_link(cr) != 0) { 3507 ZFS_EXIT(zfsvfs); 3508 return (EPERM); 3509 } 3510 3511 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3512 ZFS_EXIT(zfsvfs); 3513 return (error); 3514 } 3515 3516 /* 3517 * Attempt to lock directory; fail if entry already exists. 3518 */ 3519 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 3520 if (error) { 3521 ZFS_EXIT(zfsvfs); 3522 return (error); 3523 } 3524 3525 tx = dmu_tx_create(zfsvfs->z_os); 3526 dmu_tx_hold_bonus(tx, szp->z_id); 3527 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3528 error = dmu_tx_assign(tx, zfsvfs->z_assign); 3529 if (error) { 3530 zfs_dirent_unlock(dl); 3531 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3532 dmu_tx_wait(tx); 3533 dmu_tx_abort(tx); 3534 goto top; 3535 } 3536 dmu_tx_abort(tx); 3537 ZFS_EXIT(zfsvfs); 3538 return (error); 3539 } 3540 3541 error = zfs_link_create(dl, szp, tx, 0); 3542 3543 if (error == 0) { 3544 uint64_t txtype = TX_LINK; 3545 if (flags & FIGNORECASE) 3546 txtype |= TX_CI; 3547 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 3548 } 3549 3550 dmu_tx_commit(tx); 3551 3552 zfs_dirent_unlock(dl); 3553 3554 if (error == 0) { 3555 vnevent_link(svp, ct); 3556 } 3557 3558 ZFS_EXIT(zfsvfs); 3559 return (error); 3560 } 3561 3562 /* 3563 * zfs_null_putapage() is used when the file system has been force 3564 * unmounted. It just drops the pages. 3565 */ 3566 /* ARGSUSED */ 3567 static int 3568 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 3569 size_t *lenp, int flags, cred_t *cr) 3570 { 3571 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 3572 return (0); 3573 } 3574 3575 /* 3576 * Push a page out to disk, klustering if possible. 3577 * 3578 * IN: vp - file to push page to. 3579 * pp - page to push. 
3580 * flags - additional flags. 3581 * cr - credentials of caller. 3582 * 3583 * OUT: offp - start of range pushed. 3584 * lenp - len of range pushed. 3585 * 3586 * RETURN: 0 if success 3587 * error code if failure 3588 * 3589 * NOTE: callers must have locked the page to be pushed. On 3590 * exit, the page (and all other pages in the kluster) must be 3591 * unlocked. 3592 */ 3593 /* ARGSUSED */ 3594 static int 3595 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 3596 size_t *lenp, int flags, cred_t *cr) 3597 { 3598 znode_t *zp = VTOZ(vp); 3599 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3600 zilog_t *zilog = zfsvfs->z_log; 3601 dmu_tx_t *tx; 3602 rl_t *rl; 3603 u_offset_t off, koff; 3604 size_t len, klen; 3605 uint64_t filesz; 3606 int err; 3607 3608 filesz = zp->z_phys->zp_size; 3609 off = pp->p_offset; 3610 len = PAGESIZE; 3611 /* 3612 * If our blocksize is bigger than the page size, try to kluster 3613 * muiltiple pages so that we write a full block (thus avoiding 3614 * a read-modify-write). 3615 */ 3616 if (off < filesz && zp->z_blksz > PAGESIZE) { 3617 if (!ISP2(zp->z_blksz)) { 3618 /* Only one block in the file. */ 3619 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 3620 koff = 0; 3621 } else { 3622 klen = zp->z_blksz; 3623 koff = P2ALIGN(off, (u_offset_t)klen); 3624 } 3625 ASSERT(koff <= filesz); 3626 if (koff + klen > filesz) 3627 klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); 3628 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 3629 } 3630 ASSERT3U(btop(len), ==, btopr(len)); 3631 top: 3632 rl = zfs_range_lock(zp, off, len, RL_WRITER); 3633 /* 3634 * Can't push pages past end-of-file. 
3635 */ 3636 filesz = zp->z_phys->zp_size; 3637 if (off >= filesz) { 3638 /* ignore all pages */ 3639 err = 0; 3640 goto out; 3641 } else if (off + len > filesz) { 3642 int npages = btopr(filesz - off); 3643 page_t *trunc; 3644 3645 page_list_break(&pp, &trunc, npages); 3646 /* ignore pages past end of file */ 3647 if (trunc) 3648 pvn_write_done(trunc, flags); 3649 len = filesz - off; 3650 } 3651 3652 tx = dmu_tx_create(zfsvfs->z_os); 3653 dmu_tx_hold_write(tx, zp->z_id, off, len); 3654 dmu_tx_hold_bonus(tx, zp->z_id); 3655 err = dmu_tx_assign(tx, zfsvfs->z_assign); 3656 if (err != 0) { 3657 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 3658 zfs_range_unlock(rl); 3659 dmu_tx_wait(tx); 3660 dmu_tx_abort(tx); 3661 err = 0; 3662 goto top; 3663 } 3664 dmu_tx_abort(tx); 3665 goto out; 3666 } 3667 3668 if (zp->z_blksz <= PAGESIZE) { 3669 caddr_t va = zfs_map_page(pp, S_READ); 3670 ASSERT3U(len, <=, PAGESIZE); 3671 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 3672 zfs_unmap_page(pp, va); 3673 } else { 3674 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 3675 } 3676 3677 if (err == 0) { 3678 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 3679 zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0); 3680 dmu_tx_commit(tx); 3681 } 3682 3683 out: 3684 zfs_range_unlock(rl); 3685 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 3686 if (offp) 3687 *offp = off; 3688 if (lenp) 3689 *lenp = len; 3690 3691 return (err); 3692 } 3693 3694 /* 3695 * Copy the portion of the file indicated from pages into the file. 3696 * The pages are stored in a page list attached to the files vnode. 3697 * 3698 * IN: vp - vnode of file to push page data to. 3699 * off - position in file to put data. 3700 * len - amount of data to write. 3701 * flags - flags to control the operation. 3702 * cr - credentials of caller. 3703 * ct - caller context. 
3704 * 3705 * RETURN: 0 if success 3706 * error code if failure 3707 * 3708 * Timestamps: 3709 * vp - ctime|mtime updated 3710 */ 3711 /*ARGSUSED*/ 3712 static int 3713 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 3714 caller_context_t *ct) 3715 { 3716 znode_t *zp = VTOZ(vp); 3717 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3718 page_t *pp; 3719 size_t io_len; 3720 u_offset_t io_off; 3721 uint64_t filesz; 3722 int error = 0; 3723 3724 ZFS_ENTER(zfsvfs); 3725 ZFS_VERIFY_ZP(zp); 3726 3727 if (len == 0) { 3728 /* 3729 * Search the entire vp list for pages >= off. 3730 */ 3731 error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, 3732 flags, cr); 3733 goto out; 3734 } 3735 3736 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ 3737 if (off > filesz) { 3738 /* past end of file */ 3739 ZFS_EXIT(zfsvfs); 3740 return (0); 3741 } 3742 3743 len = MIN(len, filesz - off); 3744 3745 for (io_off = off; io_off < off + len; io_off += io_len) { 3746 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 3747 pp = page_lookup(vp, io_off, 3748 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 3749 } else { 3750 pp = page_lookup_nowait(vp, io_off, 3751 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 3752 } 3753 3754 if (pp != NULL && pvn_getdirty(pp, flags)) { 3755 int err; 3756 3757 /* 3758 * Found a dirty page to push 3759 */ 3760 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 3761 if (err) 3762 error = err; 3763 } else { 3764 io_len = PAGESIZE; 3765 } 3766 } 3767 out: 3768 if ((flags & B_ASYNC) == 0) 3769 zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); 3770 ZFS_EXIT(zfsvfs); 3771 return (error); 3772 } 3773 3774 /*ARGSUSED*/ 3775 void 3776 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 3777 { 3778 znode_t *zp = VTOZ(vp); 3779 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3780 int error; 3781 3782 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3783 if (zp->z_dbuf == NULL) { 3784 /* 3785 * The fs has been unmounted, or we did a 3786 * suspend/resume and this file no longer exists. 3787 */ 3788 if (vn_has_cached_data(vp)) { 3789 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 3790 B_INVAL, cr); 3791 } 3792 3793 mutex_enter(&zp->z_lock); 3794 vp->v_count = 0; /* count arrives as 1 */ 3795 mutex_exit(&zp->z_lock); 3796 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3797 zfs_znode_free(zp); 3798 return; 3799 } 3800 3801 /* 3802 * Attempt to push any data in the page cache. If this fails 3803 * we will get kicked out later in zfs_zinactive(). 3804 */ 3805 if (vn_has_cached_data(vp)) { 3806 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 3807 cr); 3808 } 3809 3810 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 3811 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3812 3813 dmu_tx_hold_bonus(tx, zp->z_id); 3814 error = dmu_tx_assign(tx, TXG_WAIT); 3815 if (error) { 3816 dmu_tx_abort(tx); 3817 } else { 3818 dmu_buf_will_dirty(zp->z_dbuf, tx); 3819 mutex_enter(&zp->z_lock); 3820 zp->z_atime_dirty = 0; 3821 mutex_exit(&zp->z_lock); 3822 dmu_tx_commit(tx); 3823 } 3824 } 3825 3826 zfs_zinactive(zp); 3827 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3828 } 3829 3830 /* 3831 * Bounds-check the seek operation. 
3832 * 3833 * IN: vp - vnode seeking within 3834 * ooff - old file offset 3835 * noffp - pointer to new file offset 3836 * ct - caller context 3837 * 3838 * RETURN: 0 if success 3839 * EINVAL if new offset invalid 3840 */ 3841 /* ARGSUSED */ 3842 static int 3843 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 3844 caller_context_t *ct) 3845 { 3846 if (vp->v_type == VDIR) 3847 return (0); 3848 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 3849 } 3850 3851 /* 3852 * Pre-filter the generic locking function to trap attempts to place 3853 * a mandatory lock on a memory mapped file. 3854 */ 3855 static int 3856 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 3857 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 3858 { 3859 znode_t *zp = VTOZ(vp); 3860 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3861 int error; 3862 3863 ZFS_ENTER(zfsvfs); 3864 ZFS_VERIFY_ZP(zp); 3865 3866 /* 3867 * We are following the UFS semantics with respect to mapcnt 3868 * here: If we see that the file is mapped already, then we will 3869 * return an error, but we don't worry about races between this 3870 * function and zfs_map(). 3871 */ 3872 if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { 3873 ZFS_EXIT(zfsvfs); 3874 return (EAGAIN); 3875 } 3876 error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); 3877 ZFS_EXIT(zfsvfs); 3878 return (error); 3879 } 3880 3881 /* 3882 * If we can't find a page in the cache, we will create a new page 3883 * and fill it with file data. For efficiency, we may try to fill 3884 * multiple pages at once (klustering). 
3885 */ 3886 static int 3887 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 3888 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 3889 { 3890 znode_t *zp = VTOZ(vp); 3891 page_t *pp, *cur_pp; 3892 objset_t *os = zp->z_zfsvfs->z_os; 3893 caddr_t va; 3894 u_offset_t io_off, total; 3895 uint64_t oid = zp->z_id; 3896 size_t io_len; 3897 uint64_t filesz; 3898 int err; 3899 3900 /* 3901 * If we are only asking for a single page don't bother klustering. 3902 */ 3903 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ 3904 if (off >= filesz) 3905 return (EFAULT); 3906 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 3907 io_off = off; 3908 io_len = PAGESIZE; 3909 pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); 3910 } else { 3911 /* 3912 * Try to fill a kluster of pages (a blocks worth). 3913 */ 3914 size_t klen; 3915 u_offset_t koff; 3916 3917 if (!ISP2(zp->z_blksz)) { 3918 /* Only one block in the file. */ 3919 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 3920 koff = 0; 3921 } else { 3922 /* 3923 * It would be ideal to align our offset to the 3924 * blocksize but doing so has resulted in some 3925 * strange application crashes. For now, we 3926 * leave the offset as is and only adjust the 3927 * length if we are off the end of the file. 3928 */ 3929 koff = off; 3930 klen = plsz; 3931 } 3932 ASSERT(koff <= filesz); 3933 if (koff + klen > filesz) 3934 klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; 3935 ASSERT3U(off, >=, koff); 3936 ASSERT3U(off, <, koff + klen); 3937 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3938 &io_len, koff, klen, 0); 3939 } 3940 if (pp == NULL) { 3941 /* 3942 * Some other thread entered the page before us. 3943 * Return to zfs_getpage to retry the lookup. 3944 */ 3945 *pl = NULL; 3946 return (0); 3947 } 3948 3949 /* 3950 * Fill the pages in the kluster. 
3951 */ 3952 cur_pp = pp; 3953 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 3954 ASSERT3U(io_off, ==, cur_pp->p_offset); 3955 va = zfs_map_page(cur_pp, S_WRITE); 3956 err = dmu_read(os, oid, io_off, PAGESIZE, va); 3957 zfs_unmap_page(cur_pp, va); 3958 if (err) { 3959 /* On error, toss the entire kluster */ 3960 pvn_read_done(pp, B_ERROR); 3961 /* convert checksum errors into IO errors */ 3962 if (err == ECKSUM) 3963 err = EIO; 3964 return (err); 3965 } 3966 cur_pp = cur_pp->p_next; 3967 } 3968 out: 3969 /* 3970 * Fill in the page list array from the kluster. If 3971 * there are too many pages in the kluster, return 3972 * as many pages as possible starting from the desired 3973 * offset `off'. 3974 * NOTE: the page list will always be null terminated. 3975 */ 3976 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3977 3978 return (0); 3979 } 3980 3981 /* 3982 * Return pointers to the pages for the file region [off, off + len] 3983 * in the pl array. If plsz is greater than len, this function may 3984 * also return page pointers from before or after the specified 3985 * region (i.e. some region [off', off' + plsz]). These additional 3986 * pages are only returned if they are already in the cache, or were 3987 * created as part of a klustered read. 3988 * 3989 * IN: vp - vnode of file to get data from. 3990 * off - position in file to get data from. 3991 * len - amount of data to retrieve. 3992 * plsz - length of provided page list. 3993 * seg - segment to obtain pages for. 3994 * addr - virtual address of fault. 3995 * rw - mode of created pages. 3996 * cr - credentials of caller. 3997 * ct - caller context. 3998 * 3999 * OUT: protp - protection mode of created pages. 4000 * pl - list of pages created. 
4001 * 4002 * RETURN: 0 if success 4003 * error code if failure 4004 * 4005 * Timestamps: 4006 * vp - atime updated 4007 */ 4008 /* ARGSUSED */ 4009 static int 4010 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4011 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4012 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4013 { 4014 znode_t *zp = VTOZ(vp); 4015 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4016 page_t *pp, **pl0 = pl; 4017 int need_unlock = 0, err = 0; 4018 offset_t orig_off; 4019 4020 ZFS_ENTER(zfsvfs); 4021 ZFS_VERIFY_ZP(zp); 4022 4023 if (protp) 4024 *protp = PROT_ALL; 4025 4026 /* no faultahead (for now) */ 4027 if (pl == NULL) { 4028 ZFS_EXIT(zfsvfs); 4029 return (0); 4030 } 4031 4032 /* can't fault past EOF */ 4033 if (off >= zp->z_phys->zp_size) { 4034 ZFS_EXIT(zfsvfs); 4035 return (EFAULT); 4036 } 4037 orig_off = off; 4038 4039 /* 4040 * If we already own the lock, then we must be page faulting 4041 * in the middle of a write to this file (i.e., we are writing 4042 * to this file using data from a mapped region of the file). 4043 */ 4044 if (rw_owner(&zp->z_map_lock) != curthread) { 4045 rw_enter(&zp->z_map_lock, RW_WRITER); 4046 need_unlock = TRUE; 4047 } 4048 4049 /* 4050 * Loop through the requested range [off, off + len] looking 4051 * for pages. If we don't find a page, we will need to create 4052 * a new page and fill it with data from the file. 4053 */ 4054 while (len > 0) { 4055 if (plsz < PAGESIZE) 4056 break; 4057 if (pp = page_lookup(vp, off, SE_SHARED)) { 4058 *pl++ = pp; 4059 off += PAGESIZE; 4060 addr += PAGESIZE; 4061 len -= PAGESIZE; 4062 plsz -= PAGESIZE; 4063 } else { 4064 err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); 4065 if (err) 4066 goto out; 4067 /* 4068 * klustering may have changed our region 4069 * to be block aligned. 
4070 */ 4071 if (((pp = *pl) != 0) && (off != pp->p_offset)) { 4072 int delta = off - pp->p_offset; 4073 len += delta; 4074 off -= delta; 4075 addr -= delta; 4076 } 4077 while (*pl) { 4078 pl++; 4079 off += PAGESIZE; 4080 addr += PAGESIZE; 4081 plsz -= PAGESIZE; 4082 if (len > PAGESIZE) 4083 len -= PAGESIZE; 4084 else 4085 len = 0; 4086 } 4087 } 4088 } 4089 4090 /* 4091 * Fill out the page array with any pages already in the cache. 4092 */ 4093 while (plsz > 0) { 4094 pp = page_lookup_nowait(vp, off, SE_SHARED); 4095 if (pp == NULL) 4096 break; 4097 *pl++ = pp; 4098 off += PAGESIZE; 4099 plsz -= PAGESIZE; 4100 } 4101 4102 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4103 out: 4104 /* 4105 * We can't grab the range lock for the page as reader which would 4106 * stop truncation as this leads to deadlock. So we need to recheck 4107 * the file size. 4108 */ 4109 if (orig_off >= zp->z_phys->zp_size) 4110 err = EFAULT; 4111 if (err) { 4112 /* 4113 * Release any pages we have previously locked. 4114 */ 4115 while (pl > pl0) 4116 page_unlock(*--pl); 4117 } 4118 4119 *pl = NULL; 4120 4121 if (need_unlock) 4122 rw_exit(&zp->z_map_lock); 4123 4124 ZFS_EXIT(zfsvfs); 4125 return (err); 4126 } 4127 4128 /* 4129 * Request a memory map for a section of a file. 
This code interacts 4130 * with common code and the VM system as follows: 4131 * 4132 * common code calls mmap(), which ends up in smmap_common() 4133 * 4134 * this calls VOP_MAP(), which takes you into (say) zfs 4135 * 4136 * zfs_map() calls as_map(), passing segvn_create() as the callback 4137 * 4138 * segvn_create() creates the new segment and calls VOP_ADDMAP() 4139 * 4140 * zfs_addmap() updates z_mapcnt 4141 */ 4142 /*ARGSUSED*/ 4143 static int 4144 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4145 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4146 caller_context_t *ct) 4147 { 4148 znode_t *zp = VTOZ(vp); 4149 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4150 segvn_crargs_t vn_a; 4151 int error; 4152 4153 ZFS_ENTER(zfsvfs); 4154 ZFS_VERIFY_ZP(zp); 4155 4156 if ((prot & PROT_WRITE) && 4157 (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | 4158 ZFS_APPENDONLY))) { 4159 ZFS_EXIT(zfsvfs); 4160 return (EPERM); 4161 } 4162 4163 if ((prot & (PROT_READ | PROT_EXEC)) && 4164 (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { 4165 ZFS_EXIT(zfsvfs); 4166 return (EACCES); 4167 } 4168 4169 if (vp->v_flag & VNOMAP) { 4170 ZFS_EXIT(zfsvfs); 4171 return (ENOSYS); 4172 } 4173 4174 if (off < 0 || len > MAXOFFSET_T - off) { 4175 ZFS_EXIT(zfsvfs); 4176 return (ENXIO); 4177 } 4178 4179 if (vp->v_type != VREG) { 4180 ZFS_EXIT(zfsvfs); 4181 return (ENODEV); 4182 } 4183 4184 /* 4185 * If file is locked, disallow mapping. 
4186 */ 4187 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { 4188 ZFS_EXIT(zfsvfs); 4189 return (EAGAIN); 4190 } 4191 4192 as_rangelock(as); 4193 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4194 if (error != 0) { 4195 as_rangeunlock(as); 4196 ZFS_EXIT(zfsvfs); 4197 return (error); 4198 } 4199 4200 vn_a.vp = vp; 4201 vn_a.offset = (u_offset_t)off; 4202 vn_a.type = flags & MAP_TYPE; 4203 vn_a.prot = prot; 4204 vn_a.maxprot = maxprot; 4205 vn_a.cred = cr; 4206 vn_a.amp = NULL; 4207 vn_a.flags = flags & ~MAP_TYPE; 4208 vn_a.szc = 0; 4209 vn_a.lgrp_mem_policy_flags = 0; 4210 4211 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4212 4213 as_rangeunlock(as); 4214 ZFS_EXIT(zfsvfs); 4215 return (error); 4216 } 4217 4218 /* ARGSUSED */ 4219 static int 4220 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4221 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4222 caller_context_t *ct) 4223 { 4224 uint64_t pages = btopr(len); 4225 4226 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 4227 return (0); 4228 } 4229 4230 /* 4231 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 4232 * more accurate mtime for the associated file. Since we don't have a way of 4233 * detecting when the data was actually modified, we have to resort to 4234 * heuristics. If an explicit msync() is done, then we mark the mtime when the 4235 * last page is pushed. The problem occurs when the msync() call is omitted, 4236 * which by far the most common case: 4237 * 4238 * open() 4239 * mmap() 4240 * <modify memory> 4241 * munmap() 4242 * close() 4243 * <time lapse> 4244 * putpage() via fsflush 4245 * 4246 * If we wait until fsflush to come along, we can have a modification time that 4247 * is some arbitrary point in the future. In order to prevent this in the 4248 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 4249 * torn down. 
4250 */ 4251 /* ARGSUSED */ 4252 static int 4253 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4254 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4255 caller_context_t *ct) 4256 { 4257 uint64_t pages = btopr(len); 4258 4259 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 4260 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 4261 4262 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 4263 vn_has_cached_data(vp)) 4264 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 4265 4266 return (0); 4267 } 4268 4269 /* 4270 * Free or allocate space in a file. Currently, this function only 4271 * supports the `F_FREESP' command. However, this command is somewhat 4272 * misnamed, as its functionality includes the ability to allocate as 4273 * well as free space. 4274 * 4275 * IN: vp - vnode of file to free data in. 4276 * cmd - action to take (only F_FREESP supported). 4277 * bfp - section of file to free/alloc. 4278 * flag - current file open mode flags. 4279 * offset - current file offset. 4280 * cr - credentials of caller [UNUSED]. 4281 * ct - caller context. 
4282 * 4283 * RETURN: 0 if success 4284 * error code if failure 4285 * 4286 * Timestamps: 4287 * vp - ctime|mtime updated 4288 */ 4289 /* ARGSUSED */ 4290 static int 4291 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 4292 offset_t offset, cred_t *cr, caller_context_t *ct) 4293 { 4294 znode_t *zp = VTOZ(vp); 4295 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4296 uint64_t off, len; 4297 int error; 4298 4299 ZFS_ENTER(zfsvfs); 4300 ZFS_VERIFY_ZP(zp); 4301 4302 if (cmd != F_FREESP) { 4303 ZFS_EXIT(zfsvfs); 4304 return (EINVAL); 4305 } 4306 4307 if (error = convoff(vp, bfp, 0, offset)) { 4308 ZFS_EXIT(zfsvfs); 4309 return (error); 4310 } 4311 4312 if (bfp->l_len < 0) { 4313 ZFS_EXIT(zfsvfs); 4314 return (EINVAL); 4315 } 4316 4317 off = bfp->l_start; 4318 len = bfp->l_len; /* 0 means from off to end of file */ 4319 4320 error = zfs_freesp(zp, off, len, flag, TRUE); 4321 4322 ZFS_EXIT(zfsvfs); 4323 return (error); 4324 } 4325 4326 /*ARGSUSED*/ 4327 static int 4328 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4329 { 4330 znode_t *zp = VTOZ(vp); 4331 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4332 uint32_t gen; 4333 uint64_t object = zp->z_id; 4334 zfid_short_t *zfid; 4335 int size, i; 4336 4337 ZFS_ENTER(zfsvfs); 4338 ZFS_VERIFY_ZP(zp); 4339 gen = (uint32_t)zp->z_gen; 4340 4341 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4342 if (fidp->fid_len < size) { 4343 fidp->fid_len = size; 4344 ZFS_EXIT(zfsvfs); 4345 return (ENOSPC); 4346 } 4347 4348 zfid = (zfid_short_t *)fidp; 4349 4350 zfid->zf_len = size; 4351 4352 for (i = 0; i < sizeof (zfid->zf_object); i++) 4353 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4354 4355 /* Must have a non-zero generation number to distinguish from .zfs */ 4356 if (gen == 0) 4357 gen = 1; 4358 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4359 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4360 4361 if (size == LONG_FID_LEN) { 4362 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4363 zfid_long_t *zlfid; 4364 4365 zlfid = (zfid_long_t *)fidp; 4366 4367 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4368 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4369 4370 /* XXX - this should be the generation number for the objset */ 4371 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4372 zlfid->zf_setgen[i] = 0; 4373 } 4374 4375 ZFS_EXIT(zfsvfs); 4376 return (0); 4377 } 4378 4379 static int 4380 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4381 caller_context_t *ct) 4382 { 4383 znode_t *zp, *xzp; 4384 zfsvfs_t *zfsvfs; 4385 zfs_dirlock_t *dl; 4386 int error; 4387 4388 switch (cmd) { 4389 case _PC_LINK_MAX: 4390 *valp = ULONG_MAX; 4391 return (0); 4392 4393 case _PC_FILESIZEBITS: 4394 *valp = 64; 4395 return (0); 4396 4397 case _PC_XATTR_EXISTS: 4398 zp = VTOZ(vp); 4399 zfsvfs = zp->z_zfsvfs; 4400 ZFS_ENTER(zfsvfs); 4401 ZFS_VERIFY_ZP(zp); 4402 *valp = 0; 4403 error = zfs_dirent_lock(&dl, zp, "", &xzp, 4404 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 4405 if (error == 0) { 4406 zfs_dirent_unlock(dl); 4407 if (!zfs_dirempty(xzp)) 4408 *valp = 1; 4409 VN_RELE(ZTOV(xzp)); 4410 } else if (error == ENOENT) { 4411 /* 4412 * If there aren't extended attributes, it's the 4413 * same as having zero of them. 
4414 */ 4415 error = 0; 4416 } 4417 ZFS_EXIT(zfsvfs); 4418 return (error); 4419 4420 case _PC_SATTR_ENABLED: 4421 case _PC_SATTR_EXISTS: 4422 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4423 (vp->v_type == VREG || vp->v_type == VDIR); 4424 return (0); 4425 4426 case _PC_ACL_ENABLED: 4427 *valp = _ACL_ACE_ENABLED; 4428 return (0); 4429 4430 case _PC_MIN_HOLE_SIZE: 4431 *valp = (ulong_t)SPA_MINBLOCKSIZE; 4432 return (0); 4433 4434 default: 4435 return (fs_pathconf(vp, cmd, valp, cr, ct)); 4436 } 4437 } 4438 4439 /*ARGSUSED*/ 4440 static int 4441 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4442 caller_context_t *ct) 4443 { 4444 znode_t *zp = VTOZ(vp); 4445 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4446 int error; 4447 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4448 4449 ZFS_ENTER(zfsvfs); 4450 ZFS_VERIFY_ZP(zp); 4451 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4452 ZFS_EXIT(zfsvfs); 4453 4454 return (error); 4455 } 4456 4457 /*ARGSUSED*/ 4458 static int 4459 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4460 caller_context_t *ct) 4461 { 4462 znode_t *zp = VTOZ(vp); 4463 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4464 int error; 4465 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4466 4467 ZFS_ENTER(zfsvfs); 4468 ZFS_VERIFY_ZP(zp); 4469 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4470 ZFS_EXIT(zfsvfs); 4471 return (error); 4472 } 4473 4474 /* 4475 * Predeclare these here so that the compiler assumes that 4476 * this is an "old style" function declaration that does 4477 * not include arguments => we won't get type mismatch errors 4478 * in the initializations that follow. 
4479 */ 4480 static int zfs_inval(); 4481 static int zfs_isdir(); 4482 4483 static int 4484 zfs_inval() 4485 { 4486 return (EINVAL); 4487 } 4488 4489 static int 4490 zfs_isdir() 4491 { 4492 return (EISDIR); 4493 } 4494 /* 4495 * Directory vnode operations template 4496 */ 4497 vnodeops_t *zfs_dvnodeops; 4498 const fs_operation_def_t zfs_dvnodeops_template[] = { 4499 VOPNAME_OPEN, { .vop_open = zfs_open }, 4500 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4501 VOPNAME_READ, { .error = zfs_isdir }, 4502 VOPNAME_WRITE, { .error = zfs_isdir }, 4503 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4504 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4505 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4506 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4507 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4508 VOPNAME_CREATE, { .vop_create = zfs_create }, 4509 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 4510 VOPNAME_LINK, { .vop_link = zfs_link }, 4511 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4512 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 4513 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 4514 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 4515 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 4516 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4517 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4518 VOPNAME_FID, { .vop_fid = zfs_fid }, 4519 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4520 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4521 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4522 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4523 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4524 NULL, NULL 4525 }; 4526 4527 /* 4528 * Regular file vnode operations template 4529 */ 4530 vnodeops_t *zfs_fvnodeops; 4531 const fs_operation_def_t zfs_fvnodeops_template[] = { 4532 VOPNAME_OPEN, { .vop_open = zfs_open }, 4533 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4534 VOPNAME_READ, { .vop_read = zfs_read }, 4535 VOPNAME_WRITE, { .vop_write = 
zfs_write }, 4536 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4537 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4538 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4539 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4540 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4541 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4542 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4543 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4544 VOPNAME_FID, { .vop_fid = zfs_fid }, 4545 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4546 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 4547 VOPNAME_SPACE, { .vop_space = zfs_space }, 4548 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 4549 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 4550 VOPNAME_MAP, { .vop_map = zfs_map }, 4551 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 4552 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 4553 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4554 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4555 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4556 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4557 NULL, NULL 4558 }; 4559 4560 /* 4561 * Symbolic link vnode operations template 4562 */ 4563 vnodeops_t *zfs_symvnodeops; 4564 const fs_operation_def_t zfs_symvnodeops_template[] = { 4565 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4566 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4567 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4568 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4569 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 4570 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4571 VOPNAME_FID, { .vop_fid = zfs_fid }, 4572 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4573 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4574 NULL, NULL 4575 }; 4576 4577 /* 4578 * Extended attribute directory vnode operations template 4579 * This template is identical to the directory vnodes 4580 * operation template except for restricted operations: 
4581 * VOP_MKDIR() 4582 * VOP_SYMLINK() 4583 * Note that there are other restrictions embedded in: 4584 * zfs_create() - restrict type to VREG 4585 * zfs_link() - no links into/out of attribute space 4586 * zfs_rename() - no moves into/out of attribute space 4587 */ 4588 vnodeops_t *zfs_xdvnodeops; 4589 const fs_operation_def_t zfs_xdvnodeops_template[] = { 4590 VOPNAME_OPEN, { .vop_open = zfs_open }, 4591 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4592 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4593 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4594 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4595 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4596 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4597 VOPNAME_CREATE, { .vop_create = zfs_create }, 4598 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 4599 VOPNAME_LINK, { .vop_link = zfs_link }, 4600 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4601 VOPNAME_MKDIR, { .error = zfs_inval }, 4602 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 4603 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 4604 VOPNAME_SYMLINK, { .error = zfs_inval }, 4605 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4606 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4607 VOPNAME_FID, { .vop_fid = zfs_fid }, 4608 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4609 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4610 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4611 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4612 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4613 NULL, NULL 4614 }; 4615 4616 /* 4617 * Error vnode operations template 4618 */ 4619 vnodeops_t *zfs_evnodeops; 4620 const fs_operation_def_t zfs_evnodeops_template[] = { 4621 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4622 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4623 NULL, NULL 4624 }; 4625