1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Portions Copyright 2007 Jeremy Teo */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/systm.h> 32 #include <sys/sysmacros.h> 33 #include <sys/resource.h> 34 #include <sys/vfs.h> 35 #include <sys/vfs_opreg.h> 36 #include <sys/vnode.h> 37 #include <sys/file.h> 38 #include <sys/stat.h> 39 #include <sys/kmem.h> 40 #include <sys/taskq.h> 41 #include <sys/uio.h> 42 #include <sys/vmsystm.h> 43 #include <sys/atomic.h> 44 #include <sys/vm.h> 45 #include <vm/seg_vn.h> 46 #include <vm/pvn.h> 47 #include <vm/as.h> 48 #include <vm/kpm.h> 49 #include <vm/seg_kpm.h> 50 #include <sys/mman.h> 51 #include <sys/pathname.h> 52 #include <sys/cmn_err.h> 53 #include <sys/errno.h> 54 #include <sys/unistd.h> 55 #include <sys/zfs_dir.h> 56 #include <sys/zfs_acl.h> 57 #include <sys/zfs_ioctl.h> 58 #include <sys/fs/zfs.h> 59 #include <sys/dmu.h> 60 #include <sys/spa.h> 61 #include <sys/txg.h> 62 #include <sys/dbuf.h> 63 #include <sys/zap.h> 64 #include <sys/dirent.h> 65 #include 
<sys/policy.h> 66 #include <sys/sunddi.h> 67 #include <sys/filio.h> 68 #include <sys/sid.h> 69 #include "fs/fs_subr.h" 70 #include <sys/zfs_ctldir.h> 71 #include <sys/zfs_fuid.h> 72 #include <sys/dnlc.h> 73 #include <sys/zfs_rlock.h> 74 #include <sys/extdirent.h> 75 #include <sys/kidmap.h> 76 #include <sys/cred_impl.h> 77 #include <sys/attr.h> 78 79 /* 80 * Programming rules. 81 * 82 * Each vnode op performs some logical unit of work. To do this, the ZPL must 83 * properly lock its in-core state, create a DMU transaction, do the work, 84 * record this work in the intent log (ZIL), commit the DMU transaction, 85 * and wait for the intent log to commit if it is a synchronous operation. 86 * Moreover, the vnode ops must work in both normal and log replay context. 87 * The ordering of events is important to avoid deadlocks and references 88 * to freed memory. The example below illustrates the following Big Rules: 89 * 90 * (1) A check must be made in each zfs thread for a mounted file system. 91 * This is done avoiding races using ZFS_ENTER(zfsvfs). 92 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 93 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 94 * can return EIO from the calling function. 95 * 96 * (2) VN_RELE() should always be the last thing except for zil_commit() 97 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 98 * First, if it's the last reference, the vnode/znode 99 * can be freed, so the zp may point to freed memory. Second, the last 100 * reference will call zfs_zinactive(), which may induce a lot of work -- 101 * pushing cached pages (which acquires range locks) and syncing out 102 * cached atime changes. Third, zfs_zinactive() may require a new tx, 103 * which could deadlock the system if you were already holding one. 104 * 105 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 106 * as they can span dmu_tx_assign() calls. 
 *
 *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART, then drop all locks, call
 *	dmu_tx_wait(), abort the tx, and try again.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)
// lock directory entry (may VN_HOLD()) 139 * rw_enter(...); // grab any other locks you need 140 * tx = dmu_tx_create(...); // get DMU tx 141 * dmu_tx_hold_*(); // hold each object you might modify 142 * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign 143 * if (error) { 144 * rw_exit(...); // drop locks 145 * zfs_dirent_unlock(dl); // unlock directory entry 146 * VN_RELE(...); // release held vnodes 147 * if (error == ERESTART) { 148 * dmu_tx_wait(tx); 149 * dmu_tx_abort(tx); 150 * goto top; 151 * } 152 * dmu_tx_abort(tx); // abort DMU tx 153 * ZFS_EXIT(zfsvfs); // finished in zfs 154 * return (error); // really out of space 155 * } 156 * error = do_real_work(); // do whatever this VOP does 157 * if (error == 0) 158 * zfs_log_*(...); // on success, make ZIL entry 159 * dmu_tx_commit(tx); // commit DMU tx -- error or not 160 * rw_exit(...); // drop locks 161 * zfs_dirent_unlock(dl); // unlock directory entry 162 * VN_RELE(...); // release held vnodes 163 * zil_commit(zilog, seq, foid); // synchronous when necessary 164 * ZFS_EXIT(zfsvfs); // finished in zfs 165 * return (error); // done, report error 166 */ 167 168 /* ARGSUSED */ 169 static int 170 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 171 { 172 znode_t *zp = VTOZ(*vpp); 173 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 174 175 ZFS_ENTER(zfsvfs); 176 ZFS_VERIFY_ZP(zp); 177 178 if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && 179 ((flag & FAPPEND) == 0)) { 180 ZFS_EXIT(zfsvfs); 181 return (EPERM); 182 } 183 184 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 185 ZTOV(zp)->v_type == VREG && 186 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 187 zp->z_phys->zp_size > 0) { 188 if (fs_vscan(*vpp, cr, 0) != 0) { 189 ZFS_EXIT(zfsvfs); 190 return (EACCES); 191 } 192 } 193 194 /* Keep a count of the synchronous opens in the znode */ 195 if (flag & (FSYNC | FDSYNC)) 196 atomic_inc_32(&zp->z_sync_cnt); 197 198 ZFS_EXIT(zfsvfs); 199 return (0); 200 } 201 202 /* ARGSUSED */ 203 
static int 204 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 205 caller_context_t *ct) 206 { 207 znode_t *zp = VTOZ(vp); 208 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 209 210 ZFS_ENTER(zfsvfs); 211 ZFS_VERIFY_ZP(zp); 212 213 /* Decrement the synchronous opens in the znode */ 214 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 215 atomic_dec_32(&zp->z_sync_cnt); 216 217 /* 218 * Clean up any locks held by this process on the vp. 219 */ 220 cleanlocks(vp, ddi_get_pid(), 0); 221 cleanshares(vp, ddi_get_pid()); 222 223 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 224 ZTOV(zp)->v_type == VREG && 225 !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && 226 zp->z_phys->zp_size > 0) 227 VERIFY(fs_vscan(vp, cr, 1) == 0); 228 229 ZFS_EXIT(zfsvfs); 230 return (0); 231 } 232 233 /* 234 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 235 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 236 */ 237 static int 238 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 239 { 240 znode_t *zp = VTOZ(vp); 241 uint64_t noff = (uint64_t)*off; /* new offset */ 242 uint64_t file_sz; 243 int error; 244 boolean_t hole; 245 246 file_sz = zp->z_phys->zp_size; 247 if (noff >= file_sz) { 248 return (ENXIO); 249 } 250 251 if (cmd == _FIO_SEEK_HOLE) 252 hole = B_TRUE; 253 else 254 hole = B_FALSE; 255 256 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 257 258 /* end of file? */ 259 if ((error == ESRCH) || (noff > file_sz)) { 260 /* 261 * Handle the virtual hole at the end of file. 
262 */ 263 if (hole) { 264 *off = file_sz; 265 return (0); 266 } 267 return (ENXIO); 268 } 269 270 if (noff < *off) 271 return (error); 272 *off = noff; 273 return (error); 274 } 275 276 /* ARGSUSED */ 277 static int 278 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 279 int *rvalp, caller_context_t *ct) 280 { 281 offset_t off; 282 int error; 283 zfsvfs_t *zfsvfs; 284 znode_t *zp; 285 286 switch (com) { 287 case _FIOFFS: 288 return (zfs_sync(vp->v_vfsp, 0, cred)); 289 290 /* 291 * The following two ioctls are used by bfu. Faking out, 292 * necessary to avoid bfu errors. 293 */ 294 case _FIOGDIO: 295 case _FIOSDIO: 296 return (0); 297 298 case _FIO_SEEK_DATA: 299 case _FIO_SEEK_HOLE: 300 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 301 return (EFAULT); 302 303 zp = VTOZ(vp); 304 zfsvfs = zp->z_zfsvfs; 305 ZFS_ENTER(zfsvfs); 306 ZFS_VERIFY_ZP(zp); 307 308 /* offset parameter is in/out */ 309 error = zfs_holey(vp, com, &off); 310 ZFS_EXIT(zfsvfs); 311 if (error) 312 return (error); 313 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 314 return (EFAULT); 315 return (0); 316 } 317 return (ENOTTY); 318 } 319 320 /* 321 * Utility functions to map and unmap a single physical page. These 322 * are used to manage the mappable copies of ZFS file data, and therefore 323 * do not update ref/mod bits. 324 */ 325 caddr_t 326 zfs_map_page(page_t *pp, enum seg_rw rw) 327 { 328 if (kpm_enable) 329 return (hat_kpm_mapin(pp, 0)); 330 ASSERT(rw == S_READ || rw == S_WRITE); 331 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), 332 (caddr_t)-1)); 333 } 334 335 void 336 zfs_unmap_page(page_t *pp, caddr_t addr) 337 { 338 if (kpm_enable) { 339 hat_kpm_mapout(pp, 0, addr); 340 } else { 341 ppmapout(addr); 342 } 343 } 344 345 /* 346 * When a file is memory mapped, we must keep the IO data synchronized 347 * between the DMU cache and the memory mapped pages. 
What this means: 348 * 349 * On Write: If we find a memory mapped page, we write to *both* 350 * the page and the dmu buffer. 351 */ 352 static void 353 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) 354 { 355 int64_t off; 356 357 off = start & PAGEOFFSET; 358 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 359 page_t *pp; 360 uint64_t nbytes = MIN(PAGESIZE - off, len); 361 362 if (pp = page_lookup(vp, start, SE_SHARED)) { 363 caddr_t va; 364 365 va = zfs_map_page(pp, S_WRITE); 366 (void) dmu_read(os, oid, start+off, nbytes, va+off); 367 zfs_unmap_page(pp, va); 368 page_unlock(pp); 369 } 370 len -= nbytes; 371 off = 0; 372 } 373 } 374 375 /* 376 * When a file is memory mapped, we must keep the IO data synchronized 377 * between the DMU cache and the memory mapped pages. What this means: 378 * 379 * On Read: We "read" preferentially from memory mapped pages, 380 * else we default from the dmu buffer. 381 * 382 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 383 * the file is memory mapped. 384 */ 385 static int 386 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 387 { 388 znode_t *zp = VTOZ(vp); 389 objset_t *os = zp->z_zfsvfs->z_os; 390 int64_t start, off; 391 int len = nbytes; 392 int error = 0; 393 394 start = uio->uio_loffset; 395 off = start & PAGEOFFSET; 396 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 397 page_t *pp; 398 uint64_t bytes = MIN(PAGESIZE - off, len); 399 400 if (pp = page_lookup(vp, start, SE_SHARED)) { 401 caddr_t va; 402 403 va = zfs_map_page(pp, S_READ); 404 error = uiomove(va + off, bytes, UIO_READ, uio); 405 zfs_unmap_page(pp, va); 406 page_unlock(pp); 407 } else { 408 error = dmu_read_uio(os, zp->z_id, uio, bytes); 409 } 410 len -= bytes; 411 off = 0; 412 if (error) 413 break; 414 } 415 return (error); 416 } 417 418 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 419 420 /* 421 * Read bytes from specified file into supplied buffer. 
422 * 423 * IN: vp - vnode of file to be read from. 424 * uio - structure supplying read location, range info, 425 * and return buffer. 426 * ioflag - SYNC flags; used to provide FRSYNC semantics. 427 * cr - credentials of caller. 428 * ct - caller context 429 * 430 * OUT: uio - updated offset and range, buffer filled. 431 * 432 * RETURN: 0 if success 433 * error code if failure 434 * 435 * Side Effects: 436 * vp - atime updated if byte count > 0 437 */ 438 /* ARGSUSED */ 439 static int 440 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 441 { 442 znode_t *zp = VTOZ(vp); 443 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 444 objset_t *os; 445 ssize_t n, nbytes; 446 int error; 447 rl_t *rl; 448 449 ZFS_ENTER(zfsvfs); 450 ZFS_VERIFY_ZP(zp); 451 os = zfsvfs->z_os; 452 453 if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { 454 ZFS_EXIT(zfsvfs); 455 return (EACCES); 456 } 457 458 /* 459 * Validate file offset 460 */ 461 if (uio->uio_loffset < (offset_t)0) { 462 ZFS_EXIT(zfsvfs); 463 return (EINVAL); 464 } 465 466 /* 467 * Fasttrack empty reads 468 */ 469 if (uio->uio_resid == 0) { 470 ZFS_EXIT(zfsvfs); 471 return (0); 472 } 473 474 /* 475 * Check for mandatory locks 476 */ 477 if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { 478 if (error = chklock(vp, FREAD, 479 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 480 ZFS_EXIT(zfsvfs); 481 return (error); 482 } 483 } 484 485 /* 486 * If we're in FRSYNC mode, sync out this znode before reading it. 487 */ 488 if (ioflag & FRSYNC) 489 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 490 491 /* 492 * Lock the range against changes. 493 */ 494 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 495 496 /* 497 * If we are reading past end-of-file we can skip 498 * to the end; but we might still need to set atime. 
499 */ 500 if (uio->uio_loffset >= zp->z_phys->zp_size) { 501 error = 0; 502 goto out; 503 } 504 505 ASSERT(uio->uio_loffset < zp->z_phys->zp_size); 506 n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); 507 508 while (n > 0) { 509 nbytes = MIN(n, zfs_read_chunk_size - 510 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 511 512 if (vn_has_cached_data(vp)) 513 error = mappedread(vp, nbytes, uio); 514 else 515 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 516 if (error) { 517 /* convert checksum errors into IO errors */ 518 if (error == ECKSUM) 519 error = EIO; 520 break; 521 } 522 523 n -= nbytes; 524 } 525 526 out: 527 zfs_range_unlock(rl); 528 529 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 530 ZFS_EXIT(zfsvfs); 531 return (error); 532 } 533 534 /* 535 * Write the bytes to a file. 536 * 537 * IN: vp - vnode of file to be written to. 538 * uio - structure supplying write location, range info, 539 * and data buffer. 540 * ioflag - FAPPEND flag set if in append mode. 541 * cr - credentials of caller. 542 * ct - caller context (NFS/CIFS fem monitor only) 543 * 544 * OUT: uio - updated offset and range. 
545 * 546 * RETURN: 0 if success 547 * error code if failure 548 * 549 * Timestamps: 550 * vp - ctime|mtime updated if byte count > 0 551 */ 552 /* ARGSUSED */ 553 static int 554 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 555 { 556 znode_t *zp = VTOZ(vp); 557 rlim64_t limit = uio->uio_llimit; 558 ssize_t start_resid = uio->uio_resid; 559 ssize_t tx_bytes; 560 uint64_t end_size; 561 dmu_tx_t *tx; 562 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 563 zilog_t *zilog; 564 offset_t woff; 565 ssize_t n, nbytes; 566 rl_t *rl; 567 int max_blksz = zfsvfs->z_max_blksz; 568 uint64_t pflags; 569 int error; 570 571 /* 572 * Fasttrack empty write 573 */ 574 n = start_resid; 575 if (n == 0) 576 return (0); 577 578 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 579 limit = MAXOFFSET_T; 580 581 ZFS_ENTER(zfsvfs); 582 ZFS_VERIFY_ZP(zp); 583 584 /* 585 * If immutable or not appending then return EPERM 586 */ 587 pflags = zp->z_phys->zp_flags; 588 if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 589 ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 590 (uio->uio_loffset < zp->z_phys->zp_size))) { 591 ZFS_EXIT(zfsvfs); 592 return (EPERM); 593 } 594 595 zilog = zfsvfs->z_log; 596 597 /* 598 * Pre-fault the pages to ensure slow (eg NFS) pages 599 * don't hold up txg. 600 */ 601 uio_prefaultpages(n, uio); 602 603 /* 604 * If in append mode, set the io offset pointer to eof. 605 */ 606 if (ioflag & FAPPEND) { 607 /* 608 * Range lock for a file append: 609 * The value for the start of range will be determined by 610 * zfs_range_lock() (to guarantee append semantics). 611 * If this write will cause the block size to increase, 612 * zfs_range_lock() will lock the entire file, so we must 613 * later reduce the range after we grow the block size. 
614 */ 615 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 616 if (rl->r_len == UINT64_MAX) { 617 /* overlocked, zp_size can't change */ 618 woff = uio->uio_loffset = zp->z_phys->zp_size; 619 } else { 620 woff = uio->uio_loffset = rl->r_off; 621 } 622 } else { 623 woff = uio->uio_loffset; 624 /* 625 * Validate file offset 626 */ 627 if (woff < 0) { 628 ZFS_EXIT(zfsvfs); 629 return (EINVAL); 630 } 631 632 /* 633 * If we need to grow the block size then zfs_range_lock() 634 * will lock a wider range than we request here. 635 * Later after growing the block size we reduce the range. 636 */ 637 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 638 } 639 640 if (woff >= limit) { 641 zfs_range_unlock(rl); 642 ZFS_EXIT(zfsvfs); 643 return (EFBIG); 644 } 645 646 if ((woff + n) > limit || woff > (limit - n)) 647 n = limit - woff; 648 649 /* 650 * Check for mandatory locks 651 */ 652 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 653 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 654 zfs_range_unlock(rl); 655 ZFS_EXIT(zfsvfs); 656 return (error); 657 } 658 end_size = MAX(zp->z_phys->zp_size, woff + n); 659 660 /* 661 * Write the file in reasonable size chunks. Each chunk is written 662 * in a separate transaction; this keeps the intent log records small 663 * and allows us to do more fine-grained space accounting. 664 */ 665 while (n > 0) { 666 /* 667 * Start a transaction. 668 */ 669 woff = uio->uio_loffset; 670 tx = dmu_tx_create(zfsvfs->z_os); 671 dmu_tx_hold_bonus(tx, zp->z_id); 672 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 673 error = dmu_tx_assign(tx, TXG_NOWAIT); 674 if (error) { 675 if (error == ERESTART) { 676 dmu_tx_wait(tx); 677 dmu_tx_abort(tx); 678 continue; 679 } 680 dmu_tx_abort(tx); 681 break; 682 } 683 684 /* 685 * If zfs_range_lock() over-locked we grow the blocksize 686 * and then reduce the lock range. This will only happen 687 * on the first iteration since zfs_range_reduce() will 688 * shrink down r_len to the appropriate size. 
689 */ 690 if (rl->r_len == UINT64_MAX) { 691 uint64_t new_blksz; 692 693 if (zp->z_blksz > max_blksz) { 694 ASSERT(!ISP2(zp->z_blksz)); 695 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 696 } else { 697 new_blksz = MIN(end_size, max_blksz); 698 } 699 zfs_grow_blocksize(zp, new_blksz, tx); 700 zfs_range_reduce(rl, woff, n); 701 } 702 703 /* 704 * XXX - should we really limit each write to z_max_blksz? 705 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 706 */ 707 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 708 709 tx_bytes = uio->uio_resid; 710 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx); 711 tx_bytes -= uio->uio_resid; 712 if (tx_bytes && vn_has_cached_data(vp)) 713 update_pages(vp, woff, 714 tx_bytes, zfsvfs->z_os, zp->z_id); 715 716 /* 717 * If we made no progress, we're done. If we made even 718 * partial progress, update the znode and ZIL accordingly. 719 */ 720 if (tx_bytes == 0) { 721 dmu_tx_commit(tx); 722 ASSERT(error != 0); 723 break; 724 } 725 726 /* 727 * Clear Set-UID/Set-GID bits on successful write if not 728 * privileged and at least one of the excute bits is set. 729 * 730 * It would be nice to to this after all writes have 731 * been done, but that would still expose the ISUID/ISGID 732 * to another app after the partial write is committed. 733 * 734 * Note: we don't call zfs_fuid_map_id() here because 735 * user 0 is not an ephemeral uid. 736 */ 737 mutex_enter(&zp->z_acl_lock); 738 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 739 (S_IXUSR >> 6))) != 0 && 740 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 741 secpolicy_vnode_setid_retain(cr, 742 (zp->z_phys->zp_mode & S_ISUID) != 0 && 743 zp->z_phys->zp_uid == 0) != 0) { 744 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 745 } 746 mutex_exit(&zp->z_acl_lock); 747 748 /* 749 * Update time stamp. NOTE: This marks the bonus buffer as 750 * dirty, so we don't have to do it again for zp_size. 
751 */ 752 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 753 754 /* 755 * Update the file size (zp_size) if it has changed; 756 * account for possible concurrent updates. 757 */ 758 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) 759 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 760 uio->uio_loffset); 761 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 762 dmu_tx_commit(tx); 763 764 if (error != 0) 765 break; 766 ASSERT(tx_bytes == nbytes); 767 n -= nbytes; 768 } 769 770 zfs_range_unlock(rl); 771 772 /* 773 * If we're in replay mode, or we made no progress, return error. 774 * Otherwise, it's at least a partial write, so it's successful. 775 */ 776 if (zfsvfs->z_replay || uio->uio_resid == start_resid) { 777 ZFS_EXIT(zfsvfs); 778 return (error); 779 } 780 781 if (ioflag & (FSYNC | FDSYNC)) 782 zil_commit(zilog, zp->z_last_itx, zp->z_id); 783 784 ZFS_EXIT(zfsvfs); 785 return (0); 786 } 787 788 void 789 zfs_get_done(dmu_buf_t *db, void *vzgd) 790 { 791 zgd_t *zgd = (zgd_t *)vzgd; 792 rl_t *rl = zgd->zgd_rl; 793 vnode_t *vp = ZTOV(rl->r_zp); 794 795 dmu_buf_rele(db, vzgd); 796 zfs_range_unlock(rl); 797 VN_RELE(vp); 798 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 799 kmem_free(zgd, sizeof (zgd_t)); 800 } 801 802 /* 803 * Get data to generate a TX_WRITE intent log record. 804 */ 805 int 806 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 807 { 808 zfsvfs_t *zfsvfs = arg; 809 objset_t *os = zfsvfs->z_os; 810 znode_t *zp; 811 uint64_t off = lr->lr_offset; 812 dmu_buf_t *db; 813 rl_t *rl; 814 zgd_t *zgd; 815 int dlen = lr->lr_length; /* length of user data */ 816 int error = 0; 817 818 ASSERT(zio); 819 ASSERT(dlen != 0); 820 821 /* 822 * Nothing to do if the file has been removed 823 */ 824 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 825 return (ENOENT); 826 if (zp->z_unlinked) { 827 VN_RELE(ZTOV(zp)); 828 return (ENOENT); 829 } 830 831 /* 832 * Write records come in two flavors: immediate and indirect. 
833 * For small writes it's cheaper to store the data with the 834 * log record (immediate); for large writes it's cheaper to 835 * sync the data and get a pointer to it (indirect) so that 836 * we don't have to write the data twice. 837 */ 838 if (buf != NULL) { /* immediate write */ 839 rl = zfs_range_lock(zp, off, dlen, RL_READER); 840 /* test for truncation needs to be done while range locked */ 841 if (off >= zp->z_phys->zp_size) { 842 error = ENOENT; 843 goto out; 844 } 845 VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); 846 } else { /* indirect write */ 847 uint64_t boff; /* block starting offset */ 848 849 /* 850 * Have to lock the whole block to ensure when it's 851 * written out and it's checksum is being calculated 852 * that no one can change the data. We need to re-check 853 * blocksize after we get the lock in case it's changed! 854 */ 855 for (;;) { 856 if (ISP2(zp->z_blksz)) { 857 boff = P2ALIGN_TYPED(off, zp->z_blksz, 858 uint64_t); 859 } else { 860 boff = 0; 861 } 862 dlen = zp->z_blksz; 863 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 864 if (zp->z_blksz == dlen) 865 break; 866 zfs_range_unlock(rl); 867 } 868 /* test for truncation needs to be done while range locked */ 869 if (off >= zp->z_phys->zp_size) { 870 error = ENOENT; 871 goto out; 872 } 873 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 874 zgd->zgd_rl = rl; 875 zgd->zgd_zilog = zfsvfs->z_log; 876 zgd->zgd_bp = &lr->lr_blkptr; 877 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); 878 ASSERT(boff == db->db_offset); 879 lr->lr_blkoff = off - boff; 880 error = dmu_sync(zio, db, &lr->lr_blkptr, 881 lr->lr_common.lrc_txg, zfs_get_done, zgd); 882 ASSERT((error && error != EINPROGRESS) || 883 lr->lr_length <= zp->z_blksz); 884 if (error == 0) 885 zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); 886 /* 887 * If we get EINPROGRESS, then we need to wait for a 888 * write IO initiated by dmu_sync() to complete before 889 * we can release this dbuf. 
We will finish everything 890 * up in the zfs_get_done() callback. 891 */ 892 if (error == EINPROGRESS) 893 return (0); 894 dmu_buf_rele(db, zgd); 895 kmem_free(zgd, sizeof (zgd_t)); 896 } 897 out: 898 zfs_range_unlock(rl); 899 VN_RELE(ZTOV(zp)); 900 return (error); 901 } 902 903 /*ARGSUSED*/ 904 static int 905 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 906 caller_context_t *ct) 907 { 908 znode_t *zp = VTOZ(vp); 909 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 910 int error; 911 912 ZFS_ENTER(zfsvfs); 913 ZFS_VERIFY_ZP(zp); 914 915 if (flag & V_ACE_MASK) 916 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 917 else 918 error = zfs_zaccess_rwx(zp, mode, flag, cr); 919 920 ZFS_EXIT(zfsvfs); 921 return (error); 922 } 923 924 /* 925 * Lookup an entry in a directory, or an extended attribute directory. 926 * If it exists, return a held vnode reference for it. 927 * 928 * IN: dvp - vnode of directory to search. 929 * nm - name of entry to lookup. 930 * pnp - full pathname to lookup [UNUSED]. 931 * flags - LOOKUP_XATTR set if looking for an attribute. 932 * rdir - root directory vnode [UNUSED]. 933 * cr - credentials of caller. 934 * ct - caller context 935 * direntflags - directory lookup flags 936 * realpnp - returned pathname. 937 * 938 * OUT: vpp - vnode of located entry, NULL if not found. 939 * 940 * RETURN: 0 if success 941 * error code if failure 942 * 943 * Timestamps: 944 * NA 945 */ 946 /* ARGSUSED */ 947 static int 948 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 949 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 950 int *direntflags, pathname_t *realpnp) 951 { 952 znode_t *zdp = VTOZ(dvp); 953 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 954 int error; 955 956 ZFS_ENTER(zfsvfs); 957 ZFS_VERIFY_ZP(zdp); 958 959 *vpp = NULL; 960 961 if (flags & LOOKUP_XATTR) { 962 /* 963 * If the xattr property is off, refuse the lookup request. 
964 */ 965 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 966 ZFS_EXIT(zfsvfs); 967 return (EINVAL); 968 } 969 970 /* 971 * We don't allow recursive attributes.. 972 * Maybe someday we will. 973 */ 974 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 975 ZFS_EXIT(zfsvfs); 976 return (EINVAL); 977 } 978 979 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 980 ZFS_EXIT(zfsvfs); 981 return (error); 982 } 983 984 /* 985 * Do we have permission to get into attribute directory? 986 */ 987 988 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 989 B_FALSE, cr)) { 990 VN_RELE(*vpp); 991 *vpp = NULL; 992 } 993 994 ZFS_EXIT(zfsvfs); 995 return (error); 996 } 997 998 if (dvp->v_type != VDIR) { 999 ZFS_EXIT(zfsvfs); 1000 return (ENOTDIR); 1001 } 1002 1003 /* 1004 * Check accessibility of directory. 1005 */ 1006 1007 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1008 ZFS_EXIT(zfsvfs); 1009 return (error); 1010 } 1011 1012 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1013 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1014 ZFS_EXIT(zfsvfs); 1015 return (EILSEQ); 1016 } 1017 1018 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1019 if (error == 0) { 1020 /* 1021 * Convert device special files 1022 */ 1023 if (IS_DEVVP(*vpp)) { 1024 vnode_t *svp; 1025 1026 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1027 VN_RELE(*vpp); 1028 if (svp == NULL) 1029 error = ENOSYS; 1030 else 1031 *vpp = svp; 1032 } 1033 } 1034 1035 ZFS_EXIT(zfsvfs); 1036 return (error); 1037 } 1038 1039 /* 1040 * Attempt to create a new entry in a directory. If the entry 1041 * already exists, truncate the file if permissible, else return 1042 * an error. Return the vp of the created or trunc'd file. 1043 * 1044 * IN: dvp - vnode of directory to put new file entry in. 1045 * name - name of new file entry. 1046 * vap - attributes of new file. 1047 * excl - flag indicating exclusive or non-exclusive mode. 1048 * mode - mode to open file with. 1049 * cr - credentials of caller. 
1050 * flag - large file flag [UNUSED]. 1051 * ct - caller context 1052 * vsecp - ACL to be set 1053 * 1054 * OUT: vpp - vnode of created or trunc'd entry. 1055 * 1056 * RETURN: 0 if success 1057 * error code if failure 1058 * 1059 * Timestamps: 1060 * dvp - ctime|mtime updated if new entry created 1061 * vp - ctime|mtime always, atime if new 1062 */ 1063 1064 /* ARGSUSED */ 1065 static int 1066 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1067 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 1068 vsecattr_t *vsecp) 1069 { 1070 znode_t *zp, *dzp = VTOZ(dvp); 1071 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1072 zilog_t *zilog; 1073 objset_t *os; 1074 zfs_dirlock_t *dl; 1075 dmu_tx_t *tx; 1076 int error; 1077 ksid_t *ksid; 1078 uid_t uid; 1079 gid_t gid = crgetgid(cr); 1080 zfs_acl_ids_t acl_ids; 1081 boolean_t fuid_dirtied; 1082 1083 /* 1084 * If we have an ephemeral id, ACL, or XVATTR then 1085 * make sure file system is at proper version 1086 */ 1087 1088 ksid = crgetsid(cr, KSID_OWNER); 1089 if (ksid) 1090 uid = ksid_getid(ksid); 1091 else 1092 uid = crgetuid(cr); 1093 1094 if (zfsvfs->z_use_fuids == B_FALSE && 1095 (vsecp || (vap->va_mask & AT_XVATTR) || 1096 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1097 return (EINVAL); 1098 1099 ZFS_ENTER(zfsvfs); 1100 ZFS_VERIFY_ZP(dzp); 1101 os = zfsvfs->z_os; 1102 zilog = zfsvfs->z_log; 1103 1104 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1105 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1106 ZFS_EXIT(zfsvfs); 1107 return (EILSEQ); 1108 } 1109 1110 if (vap->va_mask & AT_XVATTR) { 1111 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1112 crgetuid(cr), cr, vap->va_type)) != 0) { 1113 ZFS_EXIT(zfsvfs); 1114 return (error); 1115 } 1116 } 1117 top: 1118 *vpp = NULL; 1119 1120 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) 1121 vap->va_mode &= ~VSVTX; 1122 1123 if (*name == '\0') { 1124 /* 1125 * Null component name refers to the directory itself. 
1126 */ 1127 VN_HOLD(dvp); 1128 zp = dzp; 1129 dl = NULL; 1130 error = 0; 1131 } else { 1132 /* possible VN_HOLD(zp) */ 1133 int zflg = 0; 1134 1135 if (flag & FIGNORECASE) 1136 zflg |= ZCILOOK; 1137 1138 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1139 NULL, NULL); 1140 if (error) { 1141 if (strcmp(name, "..") == 0) 1142 error = EISDIR; 1143 ZFS_EXIT(zfsvfs); 1144 return (error); 1145 } 1146 } 1147 if (zp == NULL) { 1148 uint64_t txtype; 1149 1150 /* 1151 * Create a new file object and update the directory 1152 * to reference it. 1153 */ 1154 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1155 goto out; 1156 } 1157 1158 /* 1159 * We only support the creation of regular files in 1160 * extended attribute directories. 1161 */ 1162 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1163 (vap->va_type != VREG)) { 1164 error = EINVAL; 1165 goto out; 1166 } 1167 1168 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, 1169 &acl_ids)) != 0) 1170 goto out; 1171 1172 tx = dmu_tx_create(os); 1173 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1174 fuid_dirtied = zfsvfs->z_fuid_dirty; 1175 if (fuid_dirtied) { 1176 if (zfsvfs->z_fuid_obj == 0) { 1177 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1178 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1179 FUID_SIZE_ESTIMATE(zfsvfs)); 1180 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 1181 FALSE, NULL); 1182 } else { 1183 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1184 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1185 FUID_SIZE_ESTIMATE(zfsvfs)); 1186 } 1187 } 1188 dmu_tx_hold_bonus(tx, dzp->z_id); 1189 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1190 if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1191 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1192 0, SPA_MAXBLOCKSIZE); 1193 } 1194 error = dmu_tx_assign(tx, TXG_NOWAIT); 1195 if (error) { 1196 zfs_acl_ids_free(&acl_ids); 1197 zfs_dirent_unlock(dl); 1198 if (error == ERESTART) { 1199 dmu_tx_wait(tx); 1200 dmu_tx_abort(tx); 1201 goto top; 1202 } 1203 dmu_tx_abort(tx); 1204 ZFS_EXIT(zfsvfs); 1205 
return (error); 1206 } 1207 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); 1208 1209 if (fuid_dirtied) 1210 zfs_fuid_sync(zfsvfs, tx); 1211 1212 (void) zfs_link_create(dl, zp, tx, ZNEW); 1213 1214 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1215 if (flag & FIGNORECASE) 1216 txtype |= TX_CI; 1217 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1218 vsecp, acl_ids.z_fuidp, vap); 1219 zfs_acl_ids_free(&acl_ids); 1220 dmu_tx_commit(tx); 1221 } else { 1222 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1223 1224 /* 1225 * A directory entry already exists for this name. 1226 */ 1227 /* 1228 * Can't truncate an existing file if in exclusive mode. 1229 */ 1230 if (excl == EXCL) { 1231 error = EEXIST; 1232 goto out; 1233 } 1234 /* 1235 * Can't open a directory for writing. 1236 */ 1237 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1238 error = EISDIR; 1239 goto out; 1240 } 1241 /* 1242 * Verify requested access to file. 1243 */ 1244 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1245 goto out; 1246 } 1247 1248 mutex_enter(&dzp->z_lock); 1249 dzp->z_seq++; 1250 mutex_exit(&dzp->z_lock); 1251 1252 /* 1253 * Truncate regular files if requested. 1254 */ 1255 if ((ZTOV(zp)->v_type == VREG) && 1256 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1257 /* we can't hold any locks when calling zfs_freesp() */ 1258 zfs_dirent_unlock(dl); 1259 dl = NULL; 1260 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1261 if (error == 0) { 1262 vnevent_create(ZTOV(zp), ct); 1263 } 1264 } 1265 } 1266 out: 1267 1268 if (dl) 1269 zfs_dirent_unlock(dl); 1270 1271 if (error) { 1272 if (zp) 1273 VN_RELE(ZTOV(zp)); 1274 } else { 1275 *vpp = ZTOV(zp); 1276 /* 1277 * If vnode is for a device return a specfs vnode instead. 
1278 */ 1279 if (IS_DEVVP(*vpp)) { 1280 struct vnode *svp; 1281 1282 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1283 VN_RELE(*vpp); 1284 if (svp == NULL) { 1285 error = ENOSYS; 1286 } 1287 *vpp = svp; 1288 } 1289 } 1290 1291 ZFS_EXIT(zfsvfs); 1292 return (error); 1293 } 1294 1295 /* 1296 * Remove an entry from a directory. 1297 * 1298 * IN: dvp - vnode of directory to remove entry from. 1299 * name - name of entry to remove. 1300 * cr - credentials of caller. 1301 * ct - caller context 1302 * flags - case flags 1303 * 1304 * RETURN: 0 if success 1305 * error code if failure 1306 * 1307 * Timestamps: 1308 * dvp - ctime|mtime 1309 * vp - ctime (if nlink > 0) 1310 */ 1311 /*ARGSUSED*/ 1312 static int 1313 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1314 int flags) 1315 { 1316 znode_t *zp, *dzp = VTOZ(dvp); 1317 znode_t *xzp = NULL; 1318 vnode_t *vp; 1319 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1320 zilog_t *zilog; 1321 uint64_t acl_obj, xattr_obj; 1322 zfs_dirlock_t *dl; 1323 dmu_tx_t *tx; 1324 boolean_t may_delete_now, delete_now = FALSE; 1325 boolean_t unlinked, toobig = FALSE; 1326 uint64_t txtype; 1327 pathname_t *realnmp = NULL; 1328 pathname_t realnm; 1329 int error; 1330 int zflg = ZEXISTS; 1331 1332 ZFS_ENTER(zfsvfs); 1333 ZFS_VERIFY_ZP(dzp); 1334 zilog = zfsvfs->z_log; 1335 1336 if (flags & FIGNORECASE) { 1337 zflg |= ZCILOOK; 1338 pn_alloc(&realnm); 1339 realnmp = &realnm; 1340 } 1341 1342 top: 1343 /* 1344 * Attempt to lock directory; fail if entry doesn't exist. 1345 */ 1346 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1347 NULL, realnmp)) { 1348 if (realnmp) 1349 pn_free(realnmp); 1350 ZFS_EXIT(zfsvfs); 1351 return (error); 1352 } 1353 1354 vp = ZTOV(zp); 1355 1356 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1357 goto out; 1358 } 1359 1360 /* 1361 * Need to use rmdir for removing directories. 
1362 */ 1363 if (vp->v_type == VDIR) { 1364 error = EPERM; 1365 goto out; 1366 } 1367 1368 vnevent_remove(vp, dvp, name, ct); 1369 1370 if (realnmp) 1371 dnlc_remove(dvp, realnmp->pn_buf); 1372 else 1373 dnlc_remove(dvp, name); 1374 1375 mutex_enter(&vp->v_lock); 1376 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); 1377 mutex_exit(&vp->v_lock); 1378 1379 /* 1380 * We may delete the znode now, or we may put it in the unlinked set; 1381 * it depends on whether we're the last link, and on whether there are 1382 * other holds on the vnode. So we dmu_tx_hold() the right things to 1383 * allow for either case. 1384 */ 1385 tx = dmu_tx_create(zfsvfs->z_os); 1386 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1387 dmu_tx_hold_bonus(tx, zp->z_id); 1388 if (may_delete_now) { 1389 toobig = 1390 zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 1391 /* if the file is too big, only hold_free a token amount */ 1392 dmu_tx_hold_free(tx, zp->z_id, 0, 1393 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1394 } 1395 1396 /* are there any extended attributes? */ 1397 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { 1398 /* XXX - do we need this if we are deleting? */ 1399 dmu_tx_hold_bonus(tx, xattr_obj); 1400 } 1401 1402 /* are there any additional acls */ 1403 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && 1404 may_delete_now) 1405 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1406 1407 /* charge as an update -- would be nice not to charge at all */ 1408 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1409 1410 error = dmu_tx_assign(tx, TXG_NOWAIT); 1411 if (error) { 1412 zfs_dirent_unlock(dl); 1413 VN_RELE(vp); 1414 if (error == ERESTART) { 1415 dmu_tx_wait(tx); 1416 dmu_tx_abort(tx); 1417 goto top; 1418 } 1419 if (realnmp) 1420 pn_free(realnmp); 1421 dmu_tx_abort(tx); 1422 ZFS_EXIT(zfsvfs); 1423 return (error); 1424 } 1425 1426 /* 1427 * Remove the directory entry. 
1428 */ 1429 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1430 1431 if (error) { 1432 dmu_tx_commit(tx); 1433 goto out; 1434 } 1435 1436 if (unlinked) { 1437 mutex_enter(&vp->v_lock); 1438 delete_now = may_delete_now && !toobig && 1439 vp->v_count == 1 && !vn_has_cached_data(vp) && 1440 zp->z_phys->zp_xattr == xattr_obj && 1441 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; 1442 mutex_exit(&vp->v_lock); 1443 } 1444 1445 if (delete_now) { 1446 if (zp->z_phys->zp_xattr) { 1447 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); 1448 ASSERT3U(error, ==, 0); 1449 ASSERT3U(xzp->z_phys->zp_links, ==, 2); 1450 dmu_buf_will_dirty(xzp->z_dbuf, tx); 1451 mutex_enter(&xzp->z_lock); 1452 xzp->z_unlinked = 1; 1453 xzp->z_phys->zp_links = 0; 1454 mutex_exit(&xzp->z_lock); 1455 zfs_unlinked_add(xzp, tx); 1456 zp->z_phys->zp_xattr = 0; /* probably unnecessary */ 1457 } 1458 mutex_enter(&zp->z_lock); 1459 mutex_enter(&vp->v_lock); 1460 vp->v_count--; 1461 ASSERT3U(vp->v_count, ==, 0); 1462 mutex_exit(&vp->v_lock); 1463 mutex_exit(&zp->z_lock); 1464 zfs_znode_delete(zp, tx); 1465 } else if (unlinked) { 1466 zfs_unlinked_add(zp, tx); 1467 } 1468 1469 txtype = TX_REMOVE; 1470 if (flags & FIGNORECASE) 1471 txtype |= TX_CI; 1472 zfs_log_remove(zilog, tx, txtype, dzp, name); 1473 1474 dmu_tx_commit(tx); 1475 out: 1476 if (realnmp) 1477 pn_free(realnmp); 1478 1479 zfs_dirent_unlock(dl); 1480 1481 if (!delete_now) { 1482 VN_RELE(vp); 1483 } else if (xzp) { 1484 /* this rele is delayed to prevent nesting transactions */ 1485 VN_RELE(ZTOV(xzp)); 1486 } 1487 1488 ZFS_EXIT(zfsvfs); 1489 return (error); 1490 } 1491 1492 /* 1493 * Create a new directory and insert it into dvp using the name 1494 * provided. Return a pointer to the inserted directory. 1495 * 1496 * IN: dvp - vnode of directory to add subdir to. 1497 * dirname - name of new directory. 1498 * vap - attributes of new directory. 1499 * cr - credentials of caller. 
1500 * ct - caller context 1501 * vsecp - ACL to be set 1502 * 1503 * OUT: vpp - vnode of created directory. 1504 * 1505 * RETURN: 0 if success 1506 * error code if failure 1507 * 1508 * Timestamps: 1509 * dvp - ctime|mtime updated 1510 * vp - ctime|mtime|atime updated 1511 */ 1512 /*ARGSUSED*/ 1513 static int 1514 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1515 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1516 { 1517 znode_t *zp, *dzp = VTOZ(dvp); 1518 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1519 zilog_t *zilog; 1520 zfs_dirlock_t *dl; 1521 uint64_t txtype; 1522 dmu_tx_t *tx; 1523 int error; 1524 int zf = ZNEW; 1525 ksid_t *ksid; 1526 uid_t uid; 1527 gid_t gid = crgetgid(cr); 1528 zfs_acl_ids_t acl_ids; 1529 boolean_t fuid_dirtied; 1530 1531 ASSERT(vap->va_type == VDIR); 1532 1533 /* 1534 * If we have an ephemeral id, ACL, or XVATTR then 1535 * make sure file system is at proper version 1536 */ 1537 1538 ksid = crgetsid(cr, KSID_OWNER); 1539 if (ksid) 1540 uid = ksid_getid(ksid); 1541 else 1542 uid = crgetuid(cr); 1543 if (zfsvfs->z_use_fuids == B_FALSE && 1544 (vsecp || (vap->va_mask & AT_XVATTR) || 1545 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1546 return (EINVAL); 1547 1548 ZFS_ENTER(zfsvfs); 1549 ZFS_VERIFY_ZP(dzp); 1550 zilog = zfsvfs->z_log; 1551 1552 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1553 ZFS_EXIT(zfsvfs); 1554 return (EINVAL); 1555 } 1556 1557 if (zfsvfs->z_utf8 && u8_validate(dirname, 1558 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1559 ZFS_EXIT(zfsvfs); 1560 return (EILSEQ); 1561 } 1562 if (flags & FIGNORECASE) 1563 zf |= ZCILOOK; 1564 1565 if (vap->va_mask & AT_XVATTR) 1566 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1567 crgetuid(cr), cr, vap->va_type)) != 0) { 1568 ZFS_EXIT(zfsvfs); 1569 return (error); 1570 } 1571 1572 /* 1573 * First make sure the new directory doesn't exist. 
1574 */ 1575 top: 1576 *vpp = NULL; 1577 1578 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1579 NULL, NULL)) { 1580 ZFS_EXIT(zfsvfs); 1581 return (error); 1582 } 1583 1584 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1585 zfs_dirent_unlock(dl); 1586 ZFS_EXIT(zfsvfs); 1587 return (error); 1588 } 1589 1590 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, 1591 &acl_ids)) != 0) { 1592 zfs_dirent_unlock(dl); 1593 ZFS_EXIT(zfsvfs); 1594 return (error); 1595 } 1596 1597 /* 1598 * Add a new entry to the directory. 1599 */ 1600 tx = dmu_tx_create(zfsvfs->z_os); 1601 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1602 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1603 fuid_dirtied = zfsvfs->z_fuid_dirty; 1604 if (fuid_dirtied) { 1605 if (zfsvfs->z_fuid_obj == 0) { 1606 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1607 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1608 FUID_SIZE_ESTIMATE(zfsvfs)); 1609 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 1610 } else { 1611 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 1612 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 1613 FUID_SIZE_ESTIMATE(zfsvfs)); 1614 } 1615 } 1616 if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) 1617 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1618 0, SPA_MAXBLOCKSIZE); 1619 error = dmu_tx_assign(tx, TXG_NOWAIT); 1620 if (error) { 1621 zfs_acl_ids_free(&acl_ids); 1622 zfs_dirent_unlock(dl); 1623 if (error == ERESTART) { 1624 dmu_tx_wait(tx); 1625 dmu_tx_abort(tx); 1626 goto top; 1627 } 1628 dmu_tx_abort(tx); 1629 ZFS_EXIT(zfsvfs); 1630 return (error); 1631 } 1632 1633 /* 1634 * Create new node. 1635 */ 1636 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); 1637 1638 if (fuid_dirtied) 1639 zfs_fuid_sync(zfsvfs, tx); 1640 /* 1641 * Now put new name in parent dir. 
1642 */ 1643 (void) zfs_link_create(dl, zp, tx, ZNEW); 1644 1645 *vpp = ZTOV(zp); 1646 1647 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1648 if (flags & FIGNORECASE) 1649 txtype |= TX_CI; 1650 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1651 acl_ids.z_fuidp, vap); 1652 1653 zfs_acl_ids_free(&acl_ids); 1654 dmu_tx_commit(tx); 1655 1656 zfs_dirent_unlock(dl); 1657 1658 ZFS_EXIT(zfsvfs); 1659 return (0); 1660 } 1661 1662 /* 1663 * Remove a directory subdir entry. If the current working 1664 * directory is the same as the subdir to be removed, the 1665 * remove will fail. 1666 * 1667 * IN: dvp - vnode of directory to remove from. 1668 * name - name of directory to be removed. 1669 * cwd - vnode of current working directory. 1670 * cr - credentials of caller. 1671 * ct - caller context 1672 * flags - case flags 1673 * 1674 * RETURN: 0 if success 1675 * error code if failure 1676 * 1677 * Timestamps: 1678 * dvp - ctime|mtime updated 1679 */ 1680 /*ARGSUSED*/ 1681 static int 1682 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 1683 caller_context_t *ct, int flags) 1684 { 1685 znode_t *dzp = VTOZ(dvp); 1686 znode_t *zp; 1687 vnode_t *vp; 1688 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1689 zilog_t *zilog; 1690 zfs_dirlock_t *dl; 1691 dmu_tx_t *tx; 1692 int error; 1693 int zflg = ZEXISTS; 1694 1695 ZFS_ENTER(zfsvfs); 1696 ZFS_VERIFY_ZP(dzp); 1697 zilog = zfsvfs->z_log; 1698 1699 if (flags & FIGNORECASE) 1700 zflg |= ZCILOOK; 1701 top: 1702 zp = NULL; 1703 1704 /* 1705 * Attempt to lock directory; fail if entry doesn't exist. 
1706 */ 1707 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1708 NULL, NULL)) { 1709 ZFS_EXIT(zfsvfs); 1710 return (error); 1711 } 1712 1713 vp = ZTOV(zp); 1714 1715 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1716 goto out; 1717 } 1718 1719 if (vp->v_type != VDIR) { 1720 error = ENOTDIR; 1721 goto out; 1722 } 1723 1724 if (vp == cwd) { 1725 error = EINVAL; 1726 goto out; 1727 } 1728 1729 vnevent_rmdir(vp, dvp, name, ct); 1730 1731 /* 1732 * Grab a lock on the directory to make sure that noone is 1733 * trying to add (or lookup) entries while we are removing it. 1734 */ 1735 rw_enter(&zp->z_name_lock, RW_WRITER); 1736 1737 /* 1738 * Grab a lock on the parent pointer to make sure we play well 1739 * with the treewalk and directory rename code. 1740 */ 1741 rw_enter(&zp->z_parent_lock, RW_WRITER); 1742 1743 tx = dmu_tx_create(zfsvfs->z_os); 1744 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1745 dmu_tx_hold_bonus(tx, zp->z_id); 1746 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1747 error = dmu_tx_assign(tx, TXG_NOWAIT); 1748 if (error) { 1749 rw_exit(&zp->z_parent_lock); 1750 rw_exit(&zp->z_name_lock); 1751 zfs_dirent_unlock(dl); 1752 VN_RELE(vp); 1753 if (error == ERESTART) { 1754 dmu_tx_wait(tx); 1755 dmu_tx_abort(tx); 1756 goto top; 1757 } 1758 dmu_tx_abort(tx); 1759 ZFS_EXIT(zfsvfs); 1760 return (error); 1761 } 1762 1763 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1764 1765 if (error == 0) { 1766 uint64_t txtype = TX_RMDIR; 1767 if (flags & FIGNORECASE) 1768 txtype |= TX_CI; 1769 zfs_log_remove(zilog, tx, txtype, dzp, name); 1770 } 1771 1772 dmu_tx_commit(tx); 1773 1774 rw_exit(&zp->z_parent_lock); 1775 rw_exit(&zp->z_name_lock); 1776 out: 1777 zfs_dirent_unlock(dl); 1778 1779 VN_RELE(vp); 1780 1781 ZFS_EXIT(zfsvfs); 1782 return (error); 1783 } 1784 1785 /* 1786 * Read as many directory entries as will fit into the provided 1787 * buffer from the given directory cursor position (specified in 1788 * the uio structure. 
1789 * 1790 * IN: vp - vnode of directory to read. 1791 * uio - structure supplying read location, range info, 1792 * and return buffer. 1793 * cr - credentials of caller. 1794 * ct - caller context 1795 * flags - case flags 1796 * 1797 * OUT: uio - updated offset and range, buffer filled. 1798 * eofp - set to true if end-of-file detected. 1799 * 1800 * RETURN: 0 if success 1801 * error code if failure 1802 * 1803 * Timestamps: 1804 * vp - atime updated 1805 * 1806 * Note that the low 4 bits of the cookie returned by zap is always zero. 1807 * This allows us to use the low range for "special" directory entries: 1808 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1809 * we use the offset 2 for the '.zfs' directory. 1810 */ 1811 /* ARGSUSED */ 1812 static int 1813 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, 1814 caller_context_t *ct, int flags) 1815 { 1816 znode_t *zp = VTOZ(vp); 1817 iovec_t *iovp; 1818 edirent_t *eodp; 1819 dirent64_t *odp; 1820 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1821 objset_t *os; 1822 caddr_t outbuf; 1823 size_t bufsize; 1824 zap_cursor_t zc; 1825 zap_attribute_t zap; 1826 uint_t bytes_wanted; 1827 uint64_t offset; /* must be unsigned; checks for < 1 */ 1828 int local_eof; 1829 int outcount; 1830 int error; 1831 uint8_t prefetch; 1832 boolean_t check_sysattrs; 1833 1834 ZFS_ENTER(zfsvfs); 1835 ZFS_VERIFY_ZP(zp); 1836 1837 /* 1838 * If we are not given an eof variable, 1839 * use a local one. 1840 */ 1841 if (eofp == NULL) 1842 eofp = &local_eof; 1843 1844 /* 1845 * Check for valid iov_len. 1846 */ 1847 if (uio->uio_iov->iov_len <= 0) { 1848 ZFS_EXIT(zfsvfs); 1849 return (EINVAL); 1850 } 1851 1852 /* 1853 * Quit if directory has been removed (posix) 1854 */ 1855 if ((*eofp = zp->z_unlinked) != 0) { 1856 ZFS_EXIT(zfsvfs); 1857 return (0); 1858 } 1859 1860 error = 0; 1861 os = zfsvfs->z_os; 1862 offset = uio->uio_loffset; 1863 prefetch = zp->z_zn_prefetch; 1864 1865 /* 1866 * Initialize the iterator cursor. 
1867 */ 1868 if (offset <= 3) { 1869 /* 1870 * Start iteration from the beginning of the directory. 1871 */ 1872 zap_cursor_init(&zc, os, zp->z_id); 1873 } else { 1874 /* 1875 * The offset is a serialized cursor. 1876 */ 1877 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1878 } 1879 1880 /* 1881 * Get space to change directory entries into fs independent format. 1882 */ 1883 iovp = uio->uio_iov; 1884 bytes_wanted = iovp->iov_len; 1885 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 1886 bufsize = bytes_wanted; 1887 outbuf = kmem_alloc(bufsize, KM_SLEEP); 1888 odp = (struct dirent64 *)outbuf; 1889 } else { 1890 bufsize = bytes_wanted; 1891 odp = (struct dirent64 *)iovp->iov_base; 1892 } 1893 eodp = (struct edirent *)odp; 1894 1895 /* 1896 * If this VFS supports the system attribute view interface; and 1897 * we're looking at an extended attribute directory; and we care 1898 * about normalization conflicts on this vfs; then we must check 1899 * for normalization conflicts with the sysattr name space. 1900 */ 1901 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 1902 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 1903 (flags & V_RDDIR_ENTFLAGS); 1904 1905 /* 1906 * Transform to file-system independent format 1907 */ 1908 outcount = 0; 1909 while (outcount < bytes_wanted) { 1910 ino64_t objnum; 1911 ushort_t reclen; 1912 off64_t *next; 1913 1914 /* 1915 * Special case `.', `..', and `.zfs'. 1916 */ 1917 if (offset == 0) { 1918 (void) strcpy(zap.za_name, "."); 1919 zap.za_normalization_conflict = 0; 1920 objnum = zp->z_id; 1921 } else if (offset == 1) { 1922 (void) strcpy(zap.za_name, ".."); 1923 zap.za_normalization_conflict = 0; 1924 objnum = zp->z_phys->zp_parent; 1925 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1926 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1927 zap.za_normalization_conflict = 0; 1928 objnum = ZFSCTL_INO_ROOT; 1929 } else { 1930 /* 1931 * Grab next entry. 
1932 */ 1933 if (error = zap_cursor_retrieve(&zc, &zap)) { 1934 if ((*eofp = (error == ENOENT)) != 0) 1935 break; 1936 else 1937 goto update; 1938 } 1939 1940 if (zap.za_integer_length != 8 || 1941 zap.za_num_integers != 1) { 1942 cmn_err(CE_WARN, "zap_readdir: bad directory " 1943 "entry, obj = %lld, offset = %lld\n", 1944 (u_longlong_t)zp->z_id, 1945 (u_longlong_t)offset); 1946 error = ENXIO; 1947 goto update; 1948 } 1949 1950 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 1951 /* 1952 * MacOS X can extract the object type here such as: 1953 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 1954 */ 1955 1956 if (check_sysattrs && !zap.za_normalization_conflict) { 1957 zap.za_normalization_conflict = 1958 xattr_sysattr_casechk(zap.za_name); 1959 } 1960 } 1961 1962 if (flags & V_RDDIR_ENTFLAGS) 1963 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 1964 else 1965 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 1966 1967 /* 1968 * Will this entry fit in the buffer? 1969 */ 1970 if (outcount + reclen > bufsize) { 1971 /* 1972 * Did we manage to fit anything in the buffer? 1973 */ 1974 if (!outcount) { 1975 error = EINVAL; 1976 goto update; 1977 } 1978 break; 1979 } 1980 if (flags & V_RDDIR_ENTFLAGS) { 1981 /* 1982 * Add extended flag entry: 1983 */ 1984 eodp->ed_ino = objnum; 1985 eodp->ed_reclen = reclen; 1986 /* NOTE: ed_off is the offset for the *next* entry */ 1987 next = &(eodp->ed_off); 1988 eodp->ed_eflags = zap.za_normalization_conflict ? 
1989 ED_CASE_CONFLICT : 0; 1990 (void) strncpy(eodp->ed_name, zap.za_name, 1991 EDIRENT_NAMELEN(reclen)); 1992 eodp = (edirent_t *)((intptr_t)eodp + reclen); 1993 } else { 1994 /* 1995 * Add normal entry: 1996 */ 1997 odp->d_ino = objnum; 1998 odp->d_reclen = reclen; 1999 /* NOTE: d_off is the offset for the *next* entry */ 2000 next = &(odp->d_off); 2001 (void) strncpy(odp->d_name, zap.za_name, 2002 DIRENT64_NAMELEN(reclen)); 2003 odp = (dirent64_t *)((intptr_t)odp + reclen); 2004 } 2005 outcount += reclen; 2006 2007 ASSERT(outcount <= bufsize); 2008 2009 /* Prefetch znode */ 2010 if (prefetch) 2011 dmu_prefetch(os, objnum, 0, 0); 2012 2013 /* 2014 * Move to the next entry, fill in the previous offset. 2015 */ 2016 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2017 zap_cursor_advance(&zc); 2018 offset = zap_cursor_serialize(&zc); 2019 } else { 2020 offset += 1; 2021 } 2022 *next = offset; 2023 } 2024 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2025 2026 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2027 iovp->iov_base += outcount; 2028 iovp->iov_len -= outcount; 2029 uio->uio_resid -= outcount; 2030 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2031 /* 2032 * Reset the pointer. 
2033 */ 2034 offset = uio->uio_loffset; 2035 } 2036 2037 update: 2038 zap_cursor_fini(&zc); 2039 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2040 kmem_free(outbuf, bufsize); 2041 2042 if (error == ENOENT) 2043 error = 0; 2044 2045 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2046 2047 uio->uio_loffset = offset; 2048 ZFS_EXIT(zfsvfs); 2049 return (error); 2050 } 2051 2052 ulong_t zfs_fsync_sync_cnt = 4; 2053 2054 static int 2055 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2056 { 2057 znode_t *zp = VTOZ(vp); 2058 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2059 2060 /* 2061 * Regardless of whether this is required for standards conformance, 2062 * this is the logical behavior when fsync() is called on a file with 2063 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2064 * going to be pushed out as part of the zil_commit(). 2065 */ 2066 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2067 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2068 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2069 2070 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2071 2072 ZFS_ENTER(zfsvfs); 2073 ZFS_VERIFY_ZP(zp); 2074 zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); 2075 ZFS_EXIT(zfsvfs); 2076 return (0); 2077 } 2078 2079 2080 /* 2081 * Get the requested file attributes and place them in the provided 2082 * vattr structure. 2083 * 2084 * IN: vp - vnode of file. 2085 * vap - va_mask identifies requested attributes. 2086 * If AT_XVATTR set, then optional attrs are requested 2087 * flags - ATTR_NOACLCHECK (CIFS server context) 2088 * cr - credentials of caller. 2089 * ct - caller context 2090 * 2091 * OUT: vap - attribute values. 
2092 * 2093 * RETURN: 0 (always succeeds) 2094 */ 2095 /* ARGSUSED */ 2096 static int 2097 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2098 caller_context_t *ct) 2099 { 2100 znode_t *zp = VTOZ(vp); 2101 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2102 znode_phys_t *pzp; 2103 int error = 0; 2104 uint64_t links; 2105 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2106 xoptattr_t *xoap = NULL; 2107 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2108 2109 ZFS_ENTER(zfsvfs); 2110 ZFS_VERIFY_ZP(zp); 2111 pzp = zp->z_phys; 2112 2113 mutex_enter(&zp->z_lock); 2114 2115 /* 2116 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2117 * Also, if we are the owner don't bother, since owner should 2118 * always be allowed to read basic attributes of file. 2119 */ 2120 if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && 2121 (pzp->zp_uid != crgetuid(cr))) { 2122 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2123 skipaclchk, cr)) { 2124 mutex_exit(&zp->z_lock); 2125 ZFS_EXIT(zfsvfs); 2126 return (error); 2127 } 2128 } 2129 2130 /* 2131 * Return all attributes. It's cheaper to provide the answer 2132 * than to determine whether we were asked the question. 2133 */ 2134 2135 vap->va_type = vp->v_type; 2136 vap->va_mode = pzp->zp_mode & MODEMASK; 2137 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2138 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2139 vap->va_nodeid = zp->z_id; 2140 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2141 links = pzp->zp_links + 1; 2142 else 2143 links = pzp->zp_links; 2144 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2145 vap->va_size = pzp->zp_size; 2146 vap->va_rdev = vp->v_rdev; 2147 vap->va_seq = zp->z_seq; 2148 2149 /* 2150 * Add in any requested optional attributes and the create time. 2151 * Also set the corresponding bits in the returned attribute bitmap. 
2152 */ 2153 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2154 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2155 xoap->xoa_archive = 2156 ((pzp->zp_flags & ZFS_ARCHIVE) != 0); 2157 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2158 } 2159 2160 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2161 xoap->xoa_readonly = 2162 ((pzp->zp_flags & ZFS_READONLY) != 0); 2163 XVA_SET_RTN(xvap, XAT_READONLY); 2164 } 2165 2166 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2167 xoap->xoa_system = 2168 ((pzp->zp_flags & ZFS_SYSTEM) != 0); 2169 XVA_SET_RTN(xvap, XAT_SYSTEM); 2170 } 2171 2172 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2173 xoap->xoa_hidden = 2174 ((pzp->zp_flags & ZFS_HIDDEN) != 0); 2175 XVA_SET_RTN(xvap, XAT_HIDDEN); 2176 } 2177 2178 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2179 xoap->xoa_nounlink = 2180 ((pzp->zp_flags & ZFS_NOUNLINK) != 0); 2181 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2182 } 2183 2184 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2185 xoap->xoa_immutable = 2186 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); 2187 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2188 } 2189 2190 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2191 xoap->xoa_appendonly = 2192 ((pzp->zp_flags & ZFS_APPENDONLY) != 0); 2193 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2194 } 2195 2196 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2197 xoap->xoa_nodump = 2198 ((pzp->zp_flags & ZFS_NODUMP) != 0); 2199 XVA_SET_RTN(xvap, XAT_NODUMP); 2200 } 2201 2202 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2203 xoap->xoa_opaque = 2204 ((pzp->zp_flags & ZFS_OPAQUE) != 0); 2205 XVA_SET_RTN(xvap, XAT_OPAQUE); 2206 } 2207 2208 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2209 xoap->xoa_av_quarantined = 2210 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); 2211 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2212 } 2213 2214 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2215 xoap->xoa_av_modified = 2216 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); 2217 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2218 } 2219 2220 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2221 vp->v_type == VREG && 
2222 (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { 2223 size_t len; 2224 dmu_object_info_t doi; 2225 2226 /* 2227 * Only VREG files have anti-virus scanstamps, so we 2228 * won't conflict with symlinks in the bonus buffer. 2229 */ 2230 dmu_object_info_from_db(zp->z_dbuf, &doi); 2231 len = sizeof (xoap->xoa_av_scanstamp) + 2232 sizeof (znode_phys_t); 2233 if (len <= doi.doi_bonus_size) { 2234 /* 2235 * pzp points to the start of the 2236 * znode_phys_t. pzp + 1 points to the 2237 * first byte after the znode_phys_t. 2238 */ 2239 (void) memcpy(xoap->xoa_av_scanstamp, 2240 pzp + 1, 2241 sizeof (xoap->xoa_av_scanstamp)); 2242 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 2243 } 2244 } 2245 2246 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2247 ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); 2248 XVA_SET_RTN(xvap, XAT_CREATETIME); 2249 } 2250 } 2251 2252 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); 2253 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); 2254 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); 2255 2256 mutex_exit(&zp->z_lock); 2257 2258 dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); 2259 2260 if (zp->z_blksz == 0) { 2261 /* 2262 * Block size hasn't been set; suggest maximal I/O transfers. 2263 */ 2264 vap->va_blksize = zfsvfs->z_max_blksz; 2265 } 2266 2267 ZFS_EXIT(zfsvfs); 2268 return (0); 2269 } 2270 2271 /* 2272 * Set the file attributes to the values contained in the 2273 * vattr structure. 2274 * 2275 * IN: vp - vnode of file to be modified. 2276 * vap - new attribute values. 2277 * If AT_XVATTR set, then optional attrs are being set 2278 * flags - ATTR_UTIME set if non-default time values provided. 2279 * - ATTR_NOACLCHECK (CIFS context only). 2280 * cr - credentials of caller. 2281 * ct - caller context 2282 * 2283 * RETURN: 0 if success 2284 * error code if failure 2285 * 2286 * Timestamps: 2287 * vp - ctime updated, mtime updated if size changed. 
2288 */ 2289 /* ARGSUSED */ 2290 static int 2291 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2292 caller_context_t *ct) 2293 { 2294 znode_t *zp = VTOZ(vp); 2295 znode_phys_t *pzp; 2296 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2297 zilog_t *zilog; 2298 dmu_tx_t *tx; 2299 vattr_t oldva; 2300 xvattr_t tmpxvattr; 2301 uint_t mask = vap->va_mask; 2302 uint_t saved_mask; 2303 int trim_mask = 0; 2304 uint64_t new_mode; 2305 uint64_t new_uid, new_gid; 2306 znode_t *attrzp; 2307 int need_policy = FALSE; 2308 int err; 2309 zfs_fuid_info_t *fuidp = NULL; 2310 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2311 xoptattr_t *xoap; 2312 zfs_acl_t *aclp = NULL; 2313 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2314 boolean_t fuid_dirtied = B_FALSE; 2315 2316 if (mask == 0) 2317 return (0); 2318 2319 if (mask & AT_NOSET) 2320 return (EINVAL); 2321 2322 ZFS_ENTER(zfsvfs); 2323 ZFS_VERIFY_ZP(zp); 2324 2325 pzp = zp->z_phys; 2326 zilog = zfsvfs->z_log; 2327 2328 /* 2329 * Make sure that if we have ephemeral uid/gid or xvattr specified 2330 * that file system is at proper version level 2331 */ 2332 2333 if (zfsvfs->z_use_fuids == B_FALSE && 2334 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2335 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2336 (mask & AT_XVATTR))) { 2337 ZFS_EXIT(zfsvfs); 2338 return (EINVAL); 2339 } 2340 2341 if (mask & AT_SIZE && vp->v_type == VDIR) { 2342 ZFS_EXIT(zfsvfs); 2343 return (EISDIR); 2344 } 2345 2346 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2347 ZFS_EXIT(zfsvfs); 2348 return (EINVAL); 2349 } 2350 2351 /* 2352 * If this is an xvattr_t, then get a pointer to the structure of 2353 * optional attributes. If this is NULL, then we have a vattr_t. 
2354 */ 2355 xoap = xva_getxoptattr(xvap); 2356 2357 xva_init(&tmpxvattr); 2358 2359 /* 2360 * Immutable files can only alter immutable bit and atime 2361 */ 2362 if ((pzp->zp_flags & ZFS_IMMUTABLE) && 2363 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2364 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2365 ZFS_EXIT(zfsvfs); 2366 return (EPERM); 2367 } 2368 2369 if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { 2370 ZFS_EXIT(zfsvfs); 2371 return (EPERM); 2372 } 2373 2374 /* 2375 * Verify timestamps doesn't overflow 32 bits. 2376 * ZFS can handle large timestamps, but 32bit syscalls can't 2377 * handle times greater than 2039. This check should be removed 2378 * once large timestamps are fully supported. 2379 */ 2380 if (mask & (AT_ATIME | AT_MTIME)) { 2381 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2382 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2383 ZFS_EXIT(zfsvfs); 2384 return (EOVERFLOW); 2385 } 2386 } 2387 2388 top: 2389 attrzp = NULL; 2390 2391 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2392 ZFS_EXIT(zfsvfs); 2393 return (EROFS); 2394 } 2395 2396 /* 2397 * First validate permissions 2398 */ 2399 2400 if (mask & AT_SIZE) { 2401 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2402 if (err) { 2403 ZFS_EXIT(zfsvfs); 2404 return (err); 2405 } 2406 /* 2407 * XXX - Note, we are not providing any open 2408 * mode flags here (like FNDELAY), so we may 2409 * block if there are locks present... this 2410 * should be addressed in openat(). 2411 */ 2412 /* XXX - would it be OK to generate a log record here? 
*/ 2413 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2414 if (err) { 2415 ZFS_EXIT(zfsvfs); 2416 return (err); 2417 } 2418 } 2419 2420 if (mask & (AT_ATIME|AT_MTIME) || 2421 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2422 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2423 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2424 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2425 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) 2426 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2427 skipaclchk, cr); 2428 2429 if (mask & (AT_UID|AT_GID)) { 2430 int idmask = (mask & (AT_UID|AT_GID)); 2431 int take_owner; 2432 int take_group; 2433 2434 /* 2435 * NOTE: even if a new mode is being set, 2436 * we may clear S_ISUID/S_ISGID bits. 2437 */ 2438 2439 if (!(mask & AT_MODE)) 2440 vap->va_mode = pzp->zp_mode; 2441 2442 /* 2443 * Take ownership or chgrp to group we are a member of 2444 */ 2445 2446 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2447 take_group = (mask & AT_GID) && 2448 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2449 2450 /* 2451 * If both AT_UID and AT_GID are set then take_owner and 2452 * take_group must both be set in order to allow taking 2453 * ownership. 
2454 * 2455 * Otherwise, send the check through secpolicy_vnode_setattr() 2456 * 2457 */ 2458 2459 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2460 ((idmask == AT_UID) && take_owner) || 2461 ((idmask == AT_GID) && take_group)) { 2462 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2463 skipaclchk, cr) == 0) { 2464 /* 2465 * Remove setuid/setgid for non-privileged users 2466 */ 2467 secpolicy_setid_clear(vap, cr); 2468 trim_mask = (mask & (AT_UID|AT_GID)); 2469 } else { 2470 need_policy = TRUE; 2471 } 2472 } else { 2473 need_policy = TRUE; 2474 } 2475 } 2476 2477 mutex_enter(&zp->z_lock); 2478 oldva.va_mode = pzp->zp_mode; 2479 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2480 if (mask & AT_XVATTR) { 2481 /* 2482 * Update xvattr mask to include only those attributes 2483 * that are actually changing. 2484 * 2485 * the bits will be restored prior to actually setting 2486 * the attributes so the caller thinks they were set. 2487 */ 2488 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2489 if (xoap->xoa_appendonly != 2490 ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { 2491 need_policy = TRUE; 2492 } else { 2493 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2494 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 2495 } 2496 } 2497 2498 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2499 if (xoap->xoa_nounlink != 2500 ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { 2501 need_policy = TRUE; 2502 } else { 2503 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2504 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 2505 } 2506 } 2507 2508 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2509 if (xoap->xoa_immutable != 2510 ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { 2511 need_policy = TRUE; 2512 } else { 2513 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2514 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 2515 } 2516 } 2517 2518 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2519 if (xoap->xoa_nodump != 2520 ((pzp->zp_flags & ZFS_NODUMP) != 0)) { 2521 need_policy = TRUE; 2522 } else { 2523 XVA_CLR_REQ(xvap, XAT_NODUMP); 2524 XVA_SET_REQ(&tmpxvattr, 
XAT_NODUMP); 2525 } 2526 } 2527 2528 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2529 if (xoap->xoa_av_modified != 2530 ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { 2531 need_policy = TRUE; 2532 } else { 2533 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2534 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 2535 } 2536 } 2537 2538 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2539 if ((vp->v_type != VREG && 2540 xoap->xoa_av_quarantined) || 2541 xoap->xoa_av_quarantined != 2542 ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { 2543 need_policy = TRUE; 2544 } else { 2545 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2546 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 2547 } 2548 } 2549 2550 if (need_policy == FALSE && 2551 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2552 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2553 need_policy = TRUE; 2554 } 2555 } 2556 2557 mutex_exit(&zp->z_lock); 2558 2559 if (mask & AT_MODE) { 2560 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2561 err = secpolicy_setid_setsticky_clear(vp, vap, 2562 &oldva, cr); 2563 if (err) { 2564 ZFS_EXIT(zfsvfs); 2565 return (err); 2566 } 2567 trim_mask |= AT_MODE; 2568 } else { 2569 need_policy = TRUE; 2570 } 2571 } 2572 2573 if (need_policy) { 2574 /* 2575 * If trim_mask is set then take ownership 2576 * has been granted or write_acl is present and user 2577 * has the ability to modify mode. In that case remove 2578 * UID|GID and or MODE from mask so that 2579 * secpolicy_vnode_setattr() doesn't revoke it. 
2580 */ 2581 2582 if (trim_mask) { 2583 saved_mask = vap->va_mask; 2584 vap->va_mask &= ~trim_mask; 2585 } 2586 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2587 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2588 if (err) { 2589 ZFS_EXIT(zfsvfs); 2590 return (err); 2591 } 2592 2593 if (trim_mask) 2594 vap->va_mask |= saved_mask; 2595 } 2596 2597 /* 2598 * secpolicy_vnode_setattr, or take ownership may have 2599 * changed va_mask 2600 */ 2601 mask = vap->va_mask; 2602 2603 tx = dmu_tx_create(zfsvfs->z_os); 2604 dmu_tx_hold_bonus(tx, zp->z_id); 2605 2606 if (mask & AT_MODE) { 2607 uint64_t pmode = pzp->zp_mode; 2608 2609 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2610 2611 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { 2612 dmu_tx_abort(tx); 2613 ZFS_EXIT(zfsvfs); 2614 return (err); 2615 } 2616 if (pzp->zp_acl.z_acl_extern_obj) { 2617 /* Are we upgrading ACL from old V0 format to new V1 */ 2618 if (zfsvfs->z_version <= ZPL_VERSION_FUID && 2619 pzp->zp_acl.z_acl_version == 2620 ZFS_ACL_VERSION_INITIAL) { 2621 dmu_tx_hold_free(tx, 2622 pzp->zp_acl.z_acl_extern_obj, 0, 2623 DMU_OBJECT_END); 2624 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2625 0, aclp->z_acl_bytes); 2626 } else { 2627 dmu_tx_hold_write(tx, 2628 pzp->zp_acl.z_acl_extern_obj, 0, 2629 aclp->z_acl_bytes); 2630 } 2631 } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2632 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2633 0, aclp->z_acl_bytes); 2634 } 2635 } 2636 2637 if (mask & (AT_UID | AT_GID)) { 2638 if (pzp->zp_xattr) { 2639 err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); 2640 if (err) { 2641 dmu_tx_abort(tx); 2642 ZFS_EXIT(zfsvfs); 2643 if (aclp) 2644 zfs_acl_free(aclp); 2645 return (err); 2646 } 2647 dmu_tx_hold_bonus(tx, attrzp->z_id); 2648 } 2649 if (mask & AT_UID) { 2650 new_uid = zfs_fuid_create(zfsvfs, 2651 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2652 } 2653 if (mask & AT_GID) { 2654 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 2655 cr, 
ZFS_GROUP, &fuidp); 2656 } 2657 fuid_dirtied = zfsvfs->z_fuid_dirty; 2658 if (fuid_dirtied) { 2659 if (zfsvfs->z_fuid_obj == 0) { 2660 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 2661 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2662 FUID_SIZE_ESTIMATE(zfsvfs)); 2663 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 2664 FALSE, NULL); 2665 } else { 2666 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 2667 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 2668 FUID_SIZE_ESTIMATE(zfsvfs)); 2669 } 2670 } 2671 } 2672 2673 err = dmu_tx_assign(tx, TXG_NOWAIT); 2674 if (err) { 2675 if (attrzp) 2676 VN_RELE(ZTOV(attrzp)); 2677 2678 if (aclp) { 2679 zfs_acl_free(aclp); 2680 aclp = NULL; 2681 } 2682 2683 if (err == ERESTART) { 2684 dmu_tx_wait(tx); 2685 dmu_tx_abort(tx); 2686 goto top; 2687 } 2688 dmu_tx_abort(tx); 2689 ZFS_EXIT(zfsvfs); 2690 return (err); 2691 } 2692 2693 dmu_buf_will_dirty(zp->z_dbuf, tx); 2694 2695 /* 2696 * Set each attribute requested. 2697 * We group settings according to the locks they need to acquire. 2698 * 2699 * Note: you cannot set ctime directly, although it will be 2700 * updated as a side-effect of calling this function. 
2701 */ 2702 2703 mutex_enter(&zp->z_lock); 2704 2705 if (mask & AT_MODE) { 2706 mutex_enter(&zp->z_acl_lock); 2707 zp->z_phys->zp_mode = new_mode; 2708 err = zfs_aclset_common(zp, aclp, cr, tx); 2709 ASSERT3U(err, ==, 0); 2710 mutex_exit(&zp->z_acl_lock); 2711 } 2712 2713 if (attrzp) 2714 mutex_enter(&attrzp->z_lock); 2715 2716 if (mask & AT_UID) { 2717 pzp->zp_uid = new_uid; 2718 if (attrzp) 2719 attrzp->z_phys->zp_uid = new_uid; 2720 } 2721 2722 if (mask & AT_GID) { 2723 pzp->zp_gid = new_gid; 2724 if (attrzp) 2725 attrzp->z_phys->zp_gid = new_gid; 2726 } 2727 2728 if (aclp) 2729 zfs_acl_free(aclp); 2730 2731 if (attrzp) 2732 mutex_exit(&attrzp->z_lock); 2733 2734 if (mask & AT_ATIME) 2735 ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 2736 2737 if (mask & AT_MTIME) 2738 ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 2739 2740 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 2741 if (mask & AT_SIZE) 2742 zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); 2743 else if (mask != 0) 2744 zfs_time_stamper_locked(zp, STATE_CHANGED, tx); 2745 /* 2746 * Do this after setting timestamps to prevent timestamp 2747 * update from toggling bit 2748 */ 2749 2750 if (xoap && (mask & AT_XVATTR)) { 2751 2752 /* 2753 * restore trimmed off masks 2754 * so that return masks can be set for caller. 
2755 */ 2756 2757 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 2758 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2759 } 2760 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 2761 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2762 } 2763 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 2764 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2765 } 2766 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 2767 XVA_SET_REQ(xvap, XAT_NODUMP); 2768 } 2769 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 2770 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2771 } 2772 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 2773 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2774 } 2775 2776 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 2777 size_t len; 2778 dmu_object_info_t doi; 2779 2780 ASSERT(vp->v_type == VREG); 2781 2782 /* Grow the bonus buffer if necessary. */ 2783 dmu_object_info_from_db(zp->z_dbuf, &doi); 2784 len = sizeof (xoap->xoa_av_scanstamp) + 2785 sizeof (znode_phys_t); 2786 if (len > doi.doi_bonus_size) 2787 VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); 2788 } 2789 zfs_xvattr_set(zp, xvap); 2790 } 2791 2792 if (fuid_dirtied) 2793 zfs_fuid_sync(zfsvfs, tx); 2794 2795 if (mask != 0) 2796 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2797 2798 if (fuidp) 2799 zfs_fuid_info_free(fuidp); 2800 mutex_exit(&zp->z_lock); 2801 2802 dmu_tx_commit(tx); 2803 2804 if (attrzp) 2805 VN_RELE(ZTOV(attrzp)); 2806 2807 2808 ZFS_EXIT(zfsvfs); 2809 return (err); 2810 } 2811 2812 typedef struct zfs_zlock { 2813 krwlock_t *zl_rwlock; /* lock we acquired */ 2814 znode_t *zl_znode; /* znode we held */ 2815 struct zfs_zlock *zl_next; /* next in list */ 2816 } zfs_zlock_t; 2817 2818 /* 2819 * Drop locks and release vnodes that were held by zfs_rename_lock(). 
*/
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	/*
	 * Pop entries off the head of the list until it is empty, so that
	 * *zlpp is NULL on return.  zl_znode is NULL for entries that did
	 * not take a vnode hold (see zfs_rename_lock()).
	 */
	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory znode; the walk starts here.
 *		sdzp	- source directory znode; the walk stops when it
 *			  is reached.
 *
 *	IN/OUT:	zlpp	- head of the list of locks taken.  Entries are
 *			  pushed even when an error is returned, so the
 *			  caller must always pass the list to
 *			  zfs_rename_unlock().
 *
 *	RETURN:	0 if success
 *		EINVAL if tdzp is szp or one of its descendants
 *		error from zfs_zget() if a parent cannot be obtained
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				/*
				 * rw == RW_READER means at least one pass
				 * has completed, so zl is the entry pushed
				 * most recently, i.e. the current list head.
				 * Note that "continue" re-evaluates the loop
				 * condition with zp reset to tdzp.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			/* Remember the hold so zfs_rename_unlock() drops it. */
			zl->zl_znode = zp;
		}
		/* Step up one level: next pass examines zp's parent. */
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/* Source and target must live on the same vfs. */
	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

top:
	/* Re-entered after an ERESTART from dmu_tx_assign(). */
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * Take the two directory-entry locks in the order chosen above.
	 * Whichever lock is taken second is passed ZRENAMING.
	 */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		/* A ".." source is reported as EINVAL, whatever serr was. */
		if (strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		/* Likewise for a ".." target. */
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	/*
	 * NOTE(review): these events are posted before the transaction
	 * is assigned, so an ERESTART retry posts them again.
	 */
	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/* Undo everything acquired since "top:" before retrying. */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);

			zfs_log_rename(zilog, tx,
			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);

			/* Update path information for the target vnode */
			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
		}
	}

	/* The tx is committed whether or not the link updates succeeded. */
	dmu_tx_commit(tx);
out:
	/* Common exit once both dirent locks and the szp hold are owned. */
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
3209 * 3210 * IN: dvp - Directory to contain new symbolic link. 3211 * link - Name for new symlink entry. 3212 * vap - Attributes of new entry. 3213 * target - Target path of new symlink. 3214 * cr - credentials of caller. 3215 * ct - caller context 3216 * flags - case flags 3217 * 3218 * RETURN: 0 if success 3219 * error code if failure 3220 * 3221 * Timestamps: 3222 * dvp - ctime|mtime updated 3223 */ 3224 /*ARGSUSED*/ 3225 static int 3226 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, 3227 caller_context_t *ct, int flags) 3228 { 3229 znode_t *zp, *dzp = VTOZ(dvp); 3230 zfs_dirlock_t *dl; 3231 dmu_tx_t *tx; 3232 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3233 zilog_t *zilog; 3234 int len = strlen(link); 3235 int error; 3236 int zflg = ZNEW; 3237 zfs_acl_ids_t acl_ids; 3238 boolean_t fuid_dirtied; 3239 3240 ASSERT(vap->va_type == VLNK); 3241 3242 ZFS_ENTER(zfsvfs); 3243 ZFS_VERIFY_ZP(dzp); 3244 zilog = zfsvfs->z_log; 3245 3246 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3247 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3248 ZFS_EXIT(zfsvfs); 3249 return (EILSEQ); 3250 } 3251 if (flags & FIGNORECASE) 3252 zflg |= ZCILOOK; 3253 top: 3254 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3255 ZFS_EXIT(zfsvfs); 3256 return (error); 3257 } 3258 3259 if (len > MAXPATHLEN) { 3260 ZFS_EXIT(zfsvfs); 3261 return (ENAMETOOLONG); 3262 } 3263 3264 /* 3265 * Attempt to lock directory; fail if entry already exists. 
3266 */ 3267 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3268 if (error) { 3269 ZFS_EXIT(zfsvfs); 3270 return (error); 3271 } 3272 3273 VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); 3274 tx = dmu_tx_create(zfsvfs->z_os); 3275 fuid_dirtied = zfsvfs->z_fuid_dirty; 3276 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3277 dmu_tx_hold_bonus(tx, dzp->z_id); 3278 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3279 if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) 3280 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); 3281 if (fuid_dirtied) { 3282 if (zfsvfs->z_fuid_obj == 0) { 3283 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 3284 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3285 FUID_SIZE_ESTIMATE(zfsvfs)); 3286 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); 3287 } else { 3288 dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); 3289 dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, 3290 FUID_SIZE_ESTIMATE(zfsvfs)); 3291 } 3292 } 3293 error = dmu_tx_assign(tx, TXG_NOWAIT); 3294 if (error) { 3295 zfs_acl_ids_free(&acl_ids); 3296 zfs_dirent_unlock(dl); 3297 if (error == ERESTART) { 3298 dmu_tx_wait(tx); 3299 dmu_tx_abort(tx); 3300 goto top; 3301 } 3302 dmu_tx_abort(tx); 3303 ZFS_EXIT(zfsvfs); 3304 return (error); 3305 } 3306 3307 dmu_buf_will_dirty(dzp->z_dbuf, tx); 3308 3309 /* 3310 * Create a new object for the symlink. 3311 * Put the link content into bonus buffer if it will fit; 3312 * otherwise, store it just like any other file data. 3313 */ 3314 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { 3315 zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); 3316 if (len != 0) 3317 bcopy(link, zp->z_phys + 1, len); 3318 } else { 3319 dmu_buf_t *dbp; 3320 3321 zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); 3322 3323 if (fuid_dirtied) 3324 zfs_fuid_sync(zfsvfs, tx); 3325 /* 3326 * Nothing can access the znode yet so no locking needed 3327 * for growing the znode's blocksize. 
3328 */ 3329 zfs_grow_blocksize(zp, len, tx); 3330 3331 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, 3332 zp->z_id, 0, FTAG, &dbp)); 3333 dmu_buf_will_dirty(dbp, tx); 3334 3335 ASSERT3U(len, <=, dbp->db_size); 3336 bcopy(link, dbp->db_data, len); 3337 dmu_buf_rele(dbp, FTAG); 3338 } 3339 zp->z_phys->zp_size = len; 3340 3341 /* 3342 * Insert the new object into the directory. 3343 */ 3344 (void) zfs_link_create(dl, zp, tx, ZNEW); 3345 out: 3346 if (error == 0) { 3347 uint64_t txtype = TX_SYMLINK; 3348 if (flags & FIGNORECASE) 3349 txtype |= TX_CI; 3350 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3351 } 3352 3353 zfs_acl_ids_free(&acl_ids); 3354 3355 dmu_tx_commit(tx); 3356 3357 zfs_dirent_unlock(dl); 3358 3359 VN_RELE(ZTOV(zp)); 3360 3361 ZFS_EXIT(zfsvfs); 3362 return (error); 3363 } 3364 3365 /* 3366 * Return, in the buffer contained in the provided uio structure, 3367 * the symbolic path referred to by vp. 3368 * 3369 * IN: vp - vnode of symbolic link. 3370 * uoip - structure to contain the link path. 3371 * cr - credentials of caller. 3372 * ct - caller context 3373 * 3374 * OUT: uio - structure to contain the link path. 
3375 * 3376 * RETURN: 0 if success 3377 * error code if failure 3378 * 3379 * Timestamps: 3380 * vp - atime updated 3381 */ 3382 /* ARGSUSED */ 3383 static int 3384 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3385 { 3386 znode_t *zp = VTOZ(vp); 3387 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3388 size_t bufsz; 3389 int error; 3390 3391 ZFS_ENTER(zfsvfs); 3392 ZFS_VERIFY_ZP(zp); 3393 3394 bufsz = (size_t)zp->z_phys->zp_size; 3395 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 3396 error = uiomove(zp->z_phys + 1, 3397 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3398 } else { 3399 dmu_buf_t *dbp; 3400 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 3401 if (error) { 3402 ZFS_EXIT(zfsvfs); 3403 return (error); 3404 } 3405 error = uiomove(dbp->db_data, 3406 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 3407 dmu_buf_rele(dbp, FTAG); 3408 } 3409 3410 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3411 ZFS_EXIT(zfsvfs); 3412 return (error); 3413 } 3414 3415 /* 3416 * Insert a new entry into directory tdvp referencing svp. 3417 * 3418 * IN: tdvp - Directory to contain new entry. 3419 * svp - vnode of new entry. 3420 * name - name of new entry. 3421 * cr - credentials of caller. 
3422 * ct - caller context 3423 * 3424 * RETURN: 0 if success 3425 * error code if failure 3426 * 3427 * Timestamps: 3428 * tdvp - ctime|mtime updated 3429 * svp - ctime updated 3430 */ 3431 /* ARGSUSED */ 3432 static int 3433 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3434 caller_context_t *ct, int flags) 3435 { 3436 znode_t *dzp = VTOZ(tdvp); 3437 znode_t *tzp, *szp; 3438 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3439 zilog_t *zilog; 3440 zfs_dirlock_t *dl; 3441 dmu_tx_t *tx; 3442 vnode_t *realvp; 3443 int error; 3444 int zf = ZNEW; 3445 uid_t owner; 3446 3447 ASSERT(tdvp->v_type == VDIR); 3448 3449 ZFS_ENTER(zfsvfs); 3450 ZFS_VERIFY_ZP(dzp); 3451 zilog = zfsvfs->z_log; 3452 3453 if (VOP_REALVP(svp, &realvp, ct) == 0) 3454 svp = realvp; 3455 3456 if (svp->v_vfsp != tdvp->v_vfsp) { 3457 ZFS_EXIT(zfsvfs); 3458 return (EXDEV); 3459 } 3460 szp = VTOZ(svp); 3461 ZFS_VERIFY_ZP(szp); 3462 3463 if (zfsvfs->z_utf8 && u8_validate(name, 3464 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3465 ZFS_EXIT(zfsvfs); 3466 return (EILSEQ); 3467 } 3468 if (flags & FIGNORECASE) 3469 zf |= ZCILOOK; 3470 3471 top: 3472 /* 3473 * We do not support links between attributes and non-attributes 3474 * because of the potential security risk of creating links 3475 * into "normal" file space in order to circumvent restrictions 3476 * imposed in attribute space. 3477 */ 3478 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 3479 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 3480 ZFS_EXIT(zfsvfs); 3481 return (EINVAL); 3482 } 3483 3484 /* 3485 * POSIX dictates that we return EPERM here. 3486 * Better choices include ENOTSUP or EISDIR. 
3487 */ 3488 if (svp->v_type == VDIR) { 3489 ZFS_EXIT(zfsvfs); 3490 return (EPERM); 3491 } 3492 3493 owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); 3494 if (owner != crgetuid(cr) && 3495 secpolicy_basic_link(cr) != 0) { 3496 ZFS_EXIT(zfsvfs); 3497 return (EPERM); 3498 } 3499 3500 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3501 ZFS_EXIT(zfsvfs); 3502 return (error); 3503 } 3504 3505 /* 3506 * Attempt to lock directory; fail if entry already exists. 3507 */ 3508 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 3509 if (error) { 3510 ZFS_EXIT(zfsvfs); 3511 return (error); 3512 } 3513 3514 tx = dmu_tx_create(zfsvfs->z_os); 3515 dmu_tx_hold_bonus(tx, szp->z_id); 3516 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3517 error = dmu_tx_assign(tx, TXG_NOWAIT); 3518 if (error) { 3519 zfs_dirent_unlock(dl); 3520 if (error == ERESTART) { 3521 dmu_tx_wait(tx); 3522 dmu_tx_abort(tx); 3523 goto top; 3524 } 3525 dmu_tx_abort(tx); 3526 ZFS_EXIT(zfsvfs); 3527 return (error); 3528 } 3529 3530 error = zfs_link_create(dl, szp, tx, 0); 3531 3532 if (error == 0) { 3533 uint64_t txtype = TX_LINK; 3534 if (flags & FIGNORECASE) 3535 txtype |= TX_CI; 3536 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 3537 } 3538 3539 dmu_tx_commit(tx); 3540 3541 zfs_dirent_unlock(dl); 3542 3543 if (error == 0) { 3544 vnevent_link(svp, ct); 3545 } 3546 3547 ZFS_EXIT(zfsvfs); 3548 return (error); 3549 } 3550 3551 /* 3552 * zfs_null_putapage() is used when the file system has been force 3553 * unmounted. It just drops the pages. 3554 */ 3555 /* ARGSUSED */ 3556 static int 3557 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 3558 size_t *lenp, int flags, cred_t *cr) 3559 { 3560 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 3561 return (0); 3562 } 3563 3564 /* 3565 * Push a page out to disk, klustering if possible. 3566 * 3567 * IN: vp - file to push page to. 3568 * pp - page to push. 3569 * flags - additional flags. 
3570 * cr - credentials of caller. 3571 * 3572 * OUT: offp - start of range pushed. 3573 * lenp - len of range pushed. 3574 * 3575 * RETURN: 0 if success 3576 * error code if failure 3577 * 3578 * NOTE: callers must have locked the page to be pushed. On 3579 * exit, the page (and all other pages in the kluster) must be 3580 * unlocked. 3581 */ 3582 /* ARGSUSED */ 3583 static int 3584 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 3585 size_t *lenp, int flags, cred_t *cr) 3586 { 3587 znode_t *zp = VTOZ(vp); 3588 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3589 dmu_tx_t *tx; 3590 u_offset_t off, koff; 3591 size_t len, klen; 3592 uint64_t filesz; 3593 int err; 3594 3595 filesz = zp->z_phys->zp_size; 3596 off = pp->p_offset; 3597 len = PAGESIZE; 3598 /* 3599 * If our blocksize is bigger than the page size, try to kluster 3600 * multiple pages so that we write a full block (thus avoiding 3601 * a read-modify-write). 3602 */ 3603 if (off < filesz && zp->z_blksz > PAGESIZE) { 3604 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 3605 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 3606 ASSERT(koff <= filesz); 3607 if (koff + klen > filesz) 3608 klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); 3609 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 3610 } 3611 ASSERT3U(btop(len), ==, btopr(len)); 3612 3613 /* 3614 * Can't push pages past end-of-file. 
3615 */ 3616 if (off >= filesz) { 3617 /* ignore all pages */ 3618 err = 0; 3619 goto out; 3620 } else if (off + len > filesz) { 3621 int npages = btopr(filesz - off); 3622 page_t *trunc; 3623 3624 page_list_break(&pp, &trunc, npages); 3625 /* ignore pages past end of file */ 3626 if (trunc) 3627 pvn_write_done(trunc, flags); 3628 len = filesz - off; 3629 } 3630 top: 3631 tx = dmu_tx_create(zfsvfs->z_os); 3632 dmu_tx_hold_write(tx, zp->z_id, off, len); 3633 dmu_tx_hold_bonus(tx, zp->z_id); 3634 err = dmu_tx_assign(tx, TXG_NOWAIT); 3635 if (err != 0) { 3636 if (err == ERESTART) { 3637 dmu_tx_wait(tx); 3638 dmu_tx_abort(tx); 3639 goto top; 3640 } 3641 dmu_tx_abort(tx); 3642 goto out; 3643 } 3644 3645 if (zp->z_blksz <= PAGESIZE) { 3646 caddr_t va = zfs_map_page(pp, S_READ); 3647 ASSERT3U(len, <=, PAGESIZE); 3648 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 3649 zfs_unmap_page(pp, va); 3650 } else { 3651 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 3652 } 3653 3654 if (err == 0) { 3655 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 3656 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 3657 dmu_tx_commit(tx); 3658 } 3659 3660 out: 3661 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 3662 if (offp) 3663 *offp = off; 3664 if (lenp) 3665 *lenp = len; 3666 3667 return (err); 3668 } 3669 3670 /* 3671 * Copy the portion of the file indicated from pages into the file. 3672 * The pages are stored in a page list attached to the files vnode. 3673 * 3674 * IN: vp - vnode of file to push page data to. 3675 * off - position in file to put data. 3676 * len - amount of data to write. 3677 * flags - flags to control the operation. 3678 * cr - credentials of caller. 3679 * ct - caller context. 
3680 * 3681 * RETURN: 0 if success 3682 * error code if failure 3683 * 3684 * Timestamps: 3685 * vp - ctime|mtime updated 3686 */ 3687 /*ARGSUSED*/ 3688 static int 3689 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 3690 caller_context_t *ct) 3691 { 3692 znode_t *zp = VTOZ(vp); 3693 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3694 page_t *pp; 3695 size_t io_len; 3696 u_offset_t io_off; 3697 uint_t blksz; 3698 rl_t *rl; 3699 int error = 0; 3700 3701 ZFS_ENTER(zfsvfs); 3702 ZFS_VERIFY_ZP(zp); 3703 3704 /* 3705 * Align this request to the file block size in case we kluster. 3706 * XXX - this can result in pretty aggresive locking, which can 3707 * impact simultanious read/write access. One option might be 3708 * to break up long requests (len == 0) into block-by-block 3709 * operations to get narrower locking. 3710 */ 3711 blksz = zp->z_blksz; 3712 if (ISP2(blksz)) 3713 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); 3714 else 3715 io_off = 0; 3716 if (len > 0 && ISP2(blksz)) 3717 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); 3718 else 3719 io_len = 0; 3720 3721 if (io_len == 0) { 3722 /* 3723 * Search the entire vp list for pages >= io_off. 3724 */ 3725 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); 3726 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); 3727 goto out; 3728 } 3729 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); 3730 3731 if (off > zp->z_phys->zp_size) { 3732 /* past end of file */ 3733 zfs_range_unlock(rl); 3734 ZFS_EXIT(zfsvfs); 3735 return (0); 3736 } 3737 3738 len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); 3739 3740 for (off = io_off; io_off < off + len; io_off += io_len) { 3741 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 3742 pp = page_lookup(vp, io_off, 3743 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 3744 } else { 3745 pp = page_lookup_nowait(vp, io_off, 3746 (flags & B_FREE) ? 
SE_EXCL : SE_SHARED); 3747 } 3748 3749 if (pp != NULL && pvn_getdirty(pp, flags)) { 3750 int err; 3751 3752 /* 3753 * Found a dirty page to push 3754 */ 3755 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 3756 if (err) 3757 error = err; 3758 } else { 3759 io_len = PAGESIZE; 3760 } 3761 } 3762 out: 3763 zfs_range_unlock(rl); 3764 if ((flags & B_ASYNC) == 0) 3765 zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); 3766 ZFS_EXIT(zfsvfs); 3767 return (error); 3768 } 3769 3770 /*ARGSUSED*/ 3771 void 3772 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 3773 { 3774 znode_t *zp = VTOZ(vp); 3775 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3776 int error; 3777 3778 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 3779 if (zp->z_dbuf == NULL) { 3780 /* 3781 * The fs has been unmounted, or we did a 3782 * suspend/resume and this file no longer exists. 3783 */ 3784 if (vn_has_cached_data(vp)) { 3785 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 3786 B_INVAL, cr); 3787 } 3788 3789 mutex_enter(&zp->z_lock); 3790 vp->v_count = 0; /* count arrives as 1 */ 3791 mutex_exit(&zp->z_lock); 3792 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3793 zfs_znode_free(zp); 3794 return; 3795 } 3796 3797 /* 3798 * Attempt to push any data in the page cache. If this fails 3799 * we will get kicked out later in zfs_zinactive(). 
3800 */ 3801 if (vn_has_cached_data(vp)) { 3802 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 3803 cr); 3804 } 3805 3806 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 3807 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 3808 3809 dmu_tx_hold_bonus(tx, zp->z_id); 3810 error = dmu_tx_assign(tx, TXG_WAIT); 3811 if (error) { 3812 dmu_tx_abort(tx); 3813 } else { 3814 dmu_buf_will_dirty(zp->z_dbuf, tx); 3815 mutex_enter(&zp->z_lock); 3816 zp->z_atime_dirty = 0; 3817 mutex_exit(&zp->z_lock); 3818 dmu_tx_commit(tx); 3819 } 3820 } 3821 3822 zfs_zinactive(zp); 3823 rw_exit(&zfsvfs->z_teardown_inactive_lock); 3824 } 3825 3826 /* 3827 * Bounds-check the seek operation. 3828 * 3829 * IN: vp - vnode seeking within 3830 * ooff - old file offset 3831 * noffp - pointer to new file offset 3832 * ct - caller context 3833 * 3834 * RETURN: 0 if success 3835 * EINVAL if new offset invalid 3836 */ 3837 /* ARGSUSED */ 3838 static int 3839 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 3840 caller_context_t *ct) 3841 { 3842 if (vp->v_type == VDIR) 3843 return (0); 3844 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 3845 } 3846 3847 /* 3848 * Pre-filter the generic locking function to trap attempts to place 3849 * a mandatory lock on a memory mapped file. 3850 */ 3851 static int 3852 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 3853 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 3854 { 3855 znode_t *zp = VTOZ(vp); 3856 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3857 int error; 3858 3859 ZFS_ENTER(zfsvfs); 3860 ZFS_VERIFY_ZP(zp); 3861 3862 /* 3863 * We are following the UFS semantics with respect to mapcnt 3864 * here: If we see that the file is mapped already, then we will 3865 * return an error, but we don't worry about races between this 3866 * function and zfs_map(). 
3867 */ 3868 if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { 3869 ZFS_EXIT(zfsvfs); 3870 return (EAGAIN); 3871 } 3872 error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); 3873 ZFS_EXIT(zfsvfs); 3874 return (error); 3875 } 3876 3877 /* 3878 * If we can't find a page in the cache, we will create a new page 3879 * and fill it with file data. For efficiency, we may try to fill 3880 * multiple pages at once (klustering) to fill up the supplied page 3881 * list. Note that the pages to be filled are held with an exclusive 3882 * lock to prevent access by other threads while they are being filled. 3883 */ 3884 static int 3885 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 3886 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 3887 { 3888 znode_t *zp = VTOZ(vp); 3889 page_t *pp, *cur_pp; 3890 objset_t *os = zp->z_zfsvfs->z_os; 3891 u_offset_t io_off, total; 3892 size_t io_len; 3893 int err; 3894 3895 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 3896 /* 3897 * We only have a single page, don't bother klustering 3898 */ 3899 io_off = off; 3900 io_len = PAGESIZE; 3901 pp = page_create_va(vp, io_off, io_len, 3902 PG_EXCL | PG_WAIT, seg, addr); 3903 } else { 3904 /* 3905 * Try to find enough pages to fill the page list 3906 */ 3907 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3908 &io_len, off, plsz, 0); 3909 } 3910 if (pp == NULL) { 3911 /* 3912 * The page already exists, nothing to do here. 3913 */ 3914 *pl = NULL; 3915 return (0); 3916 } 3917 3918 /* 3919 * Fill the pages in the kluster. 
3920 */ 3921 cur_pp = pp; 3922 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 3923 caddr_t va; 3924 3925 ASSERT3U(io_off, ==, cur_pp->p_offset); 3926 va = zfs_map_page(cur_pp, S_WRITE); 3927 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va); 3928 zfs_unmap_page(cur_pp, va); 3929 if (err) { 3930 /* On error, toss the entire kluster */ 3931 pvn_read_done(pp, B_ERROR); 3932 /* convert checksum errors into IO errors */ 3933 if (err == ECKSUM) 3934 err = EIO; 3935 return (err); 3936 } 3937 cur_pp = cur_pp->p_next; 3938 } 3939 3940 /* 3941 * Fill in the page list array from the kluster starting 3942 * from the desired offset `off'. 3943 * NOTE: the page list will always be null terminated. 3944 */ 3945 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3946 ASSERT(pl == NULL || (*pl)->p_offset == off); 3947 3948 return (0); 3949 } 3950 3951 /* 3952 * Return pointers to the pages for the file region [off, off + len] 3953 * in the pl array. If plsz is greater than len, this function may 3954 * also return page pointers from after the specified region 3955 * (i.e. the region [off, off + plsz]). These additional pages are 3956 * only returned if they are already in the cache, or were created as 3957 * part of a klustered read. 3958 * 3959 * IN: vp - vnode of file to get data from. 3960 * off - position in file to get data from. 3961 * len - amount of data to retrieve. 3962 * plsz - length of provided page list. 3963 * seg - segment to obtain pages for. 3964 * addr - virtual address of fault. 3965 * rw - mode of created pages. 3966 * cr - credentials of caller. 3967 * ct - caller context. 3968 * 3969 * OUT: protp - protection mode of created pages. 3970 * pl - list of pages created. 
3971 * 3972 * RETURN: 0 if success 3973 * error code if failure 3974 * 3975 * Timestamps: 3976 * vp - atime updated 3977 */ 3978 /* ARGSUSED */ 3979 static int 3980 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 3981 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3982 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 3983 { 3984 znode_t *zp = VTOZ(vp); 3985 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3986 page_t **pl0 = pl; 3987 int err = 0; 3988 3989 /* we do our own caching, faultahead is unnecessary */ 3990 if (pl == NULL) 3991 return (0); 3992 else if (len > plsz) 3993 len = plsz; 3994 else 3995 len = P2ROUNDUP(len, PAGESIZE); 3996 ASSERT(plsz >= len); 3997 3998 ZFS_ENTER(zfsvfs); 3999 ZFS_VERIFY_ZP(zp); 4000 4001 if (protp) 4002 *protp = PROT_ALL; 4003 4004 /* 4005 * Loop through the requested range [off, off + len) looking 4006 * for pages. If we don't find a page, we will need to create 4007 * a new page and fill it with data from the file. 4008 */ 4009 while (len > 0) { 4010 if (*pl = page_lookup(vp, off, SE_SHARED)) 4011 *(pl+1) = NULL; 4012 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 4013 goto out; 4014 while (*pl) { 4015 ASSERT3U((*pl)->p_offset, ==, off); 4016 off += PAGESIZE; 4017 addr += PAGESIZE; 4018 if (len > 0) { 4019 ASSERT3U(len, >=, PAGESIZE); 4020 len -= PAGESIZE; 4021 } 4022 ASSERT3U(plsz, >=, PAGESIZE); 4023 plsz -= PAGESIZE; 4024 pl++; 4025 } 4026 } 4027 4028 /* 4029 * Fill out the page array with any pages already in the cache. 4030 */ 4031 while (plsz > 0 && 4032 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 4033 off += PAGESIZE; 4034 plsz -= PAGESIZE; 4035 } 4036 out: 4037 if (err) { 4038 /* 4039 * Release any pages we have previously locked. 4040 */ 4041 while (pl > pl0) 4042 page_unlock(*--pl); 4043 } else { 4044 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4045 } 4046 4047 *pl = NULL; 4048 4049 ZFS_EXIT(zfsvfs); 4050 return (err); 4051 } 4052 4053 /* 4054 * Request a memory map for a section of a file. 
This code interacts 4055 * with common code and the VM system as follows: 4056 * 4057 * common code calls mmap(), which ends up in smmap_common() 4058 * 4059 * this calls VOP_MAP(), which takes you into (say) zfs 4060 * 4061 * zfs_map() calls as_map(), passing segvn_create() as the callback 4062 * 4063 * segvn_create() creates the new segment and calls VOP_ADDMAP() 4064 * 4065 * zfs_addmap() updates z_mapcnt 4066 */ 4067 /*ARGSUSED*/ 4068 static int 4069 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4070 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4071 caller_context_t *ct) 4072 { 4073 znode_t *zp = VTOZ(vp); 4074 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4075 segvn_crargs_t vn_a; 4076 int error; 4077 4078 ZFS_ENTER(zfsvfs); 4079 ZFS_VERIFY_ZP(zp); 4080 4081 if ((prot & PROT_WRITE) && 4082 (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | 4083 ZFS_APPENDONLY))) { 4084 ZFS_EXIT(zfsvfs); 4085 return (EPERM); 4086 } 4087 4088 if ((prot & (PROT_READ | PROT_EXEC)) && 4089 (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { 4090 ZFS_EXIT(zfsvfs); 4091 return (EACCES); 4092 } 4093 4094 if (vp->v_flag & VNOMAP) { 4095 ZFS_EXIT(zfsvfs); 4096 return (ENOSYS); 4097 } 4098 4099 if (off < 0 || len > MAXOFFSET_T - off) { 4100 ZFS_EXIT(zfsvfs); 4101 return (ENXIO); 4102 } 4103 4104 if (vp->v_type != VREG) { 4105 ZFS_EXIT(zfsvfs); 4106 return (ENODEV); 4107 } 4108 4109 /* 4110 * If file is locked, disallow mapping. 
4111 */ 4112 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { 4113 ZFS_EXIT(zfsvfs); 4114 return (EAGAIN); 4115 } 4116 4117 as_rangelock(as); 4118 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4119 if (error != 0) { 4120 as_rangeunlock(as); 4121 ZFS_EXIT(zfsvfs); 4122 return (error); 4123 } 4124 4125 vn_a.vp = vp; 4126 vn_a.offset = (u_offset_t)off; 4127 vn_a.type = flags & MAP_TYPE; 4128 vn_a.prot = prot; 4129 vn_a.maxprot = maxprot; 4130 vn_a.cred = cr; 4131 vn_a.amp = NULL; 4132 vn_a.flags = flags & ~MAP_TYPE; 4133 vn_a.szc = 0; 4134 vn_a.lgrp_mem_policy_flags = 0; 4135 4136 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4137 4138 as_rangeunlock(as); 4139 ZFS_EXIT(zfsvfs); 4140 return (error); 4141 } 4142 4143 /* ARGSUSED */ 4144 static int 4145 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4146 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4147 caller_context_t *ct) 4148 { 4149 uint64_t pages = btopr(len); 4150 4151 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 4152 return (0); 4153 } 4154 4155 /* 4156 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 4157 * more accurate mtime for the associated file. Since we don't have a way of 4158 * detecting when the data was actually modified, we have to resort to 4159 * heuristics. If an explicit msync() is done, then we mark the mtime when the 4160 * last page is pushed. The problem occurs when the msync() call is omitted, 4161 * which by far the most common case: 4162 * 4163 * open() 4164 * mmap() 4165 * <modify memory> 4166 * munmap() 4167 * close() 4168 * <time lapse> 4169 * putpage() via fsflush 4170 * 4171 * If we wait until fsflush to come along, we can have a modification time that 4172 * is some arbitrary point in the future. In order to prevent this in the 4173 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 4174 * torn down. 
4175 */ 4176 /* ARGSUSED */ 4177 static int 4178 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4179 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4180 caller_context_t *ct) 4181 { 4182 uint64_t pages = btopr(len); 4183 4184 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 4185 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 4186 4187 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 4188 vn_has_cached_data(vp)) 4189 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 4190 4191 return (0); 4192 } 4193 4194 /* 4195 * Free or allocate space in a file. Currently, this function only 4196 * supports the `F_FREESP' command. However, this command is somewhat 4197 * misnamed, as its functionality includes the ability to allocate as 4198 * well as free space. 4199 * 4200 * IN: vp - vnode of file to free data in. 4201 * cmd - action to take (only F_FREESP supported). 4202 * bfp - section of file to free/alloc. 4203 * flag - current file open mode flags. 4204 * offset - current file offset. 4205 * cr - credentials of caller [UNUSED]. 4206 * ct - caller context. 
4207 * 4208 * RETURN: 0 if success 4209 * error code if failure 4210 * 4211 * Timestamps: 4212 * vp - ctime|mtime updated 4213 */ 4214 /* ARGSUSED */ 4215 static int 4216 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 4217 offset_t offset, cred_t *cr, caller_context_t *ct) 4218 { 4219 znode_t *zp = VTOZ(vp); 4220 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4221 uint64_t off, len; 4222 int error; 4223 4224 ZFS_ENTER(zfsvfs); 4225 ZFS_VERIFY_ZP(zp); 4226 4227 if (cmd != F_FREESP) { 4228 ZFS_EXIT(zfsvfs); 4229 return (EINVAL); 4230 } 4231 4232 if (error = convoff(vp, bfp, 0, offset)) { 4233 ZFS_EXIT(zfsvfs); 4234 return (error); 4235 } 4236 4237 if (bfp->l_len < 0) { 4238 ZFS_EXIT(zfsvfs); 4239 return (EINVAL); 4240 } 4241 4242 off = bfp->l_start; 4243 len = bfp->l_len; /* 0 means from off to end of file */ 4244 4245 error = zfs_freesp(zp, off, len, flag, TRUE); 4246 4247 ZFS_EXIT(zfsvfs); 4248 return (error); 4249 } 4250 4251 /*ARGSUSED*/ 4252 static int 4253 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4254 { 4255 znode_t *zp = VTOZ(vp); 4256 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4257 uint32_t gen; 4258 uint64_t object = zp->z_id; 4259 zfid_short_t *zfid; 4260 int size, i; 4261 4262 ZFS_ENTER(zfsvfs); 4263 ZFS_VERIFY_ZP(zp); 4264 gen = (uint32_t)zp->z_gen; 4265 4266 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4267 if (fidp->fid_len < size) { 4268 fidp->fid_len = size; 4269 ZFS_EXIT(zfsvfs); 4270 return (ENOSPC); 4271 } 4272 4273 zfid = (zfid_short_t *)fidp; 4274 4275 zfid->zf_len = size; 4276 4277 for (i = 0; i < sizeof (zfid->zf_object); i++) 4278 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4279 4280 /* Must have a non-zero generation number to distinguish from .zfs */ 4281 if (gen == 0) 4282 gen = 1; 4283 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4284 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4285 4286 if (size == LONG_FID_LEN) { 4287 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4288 zfid_long_t *zlfid; 4289 4290 zlfid = (zfid_long_t *)fidp; 4291 4292 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4293 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4294 4295 /* XXX - this should be the generation number for the objset */ 4296 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4297 zlfid->zf_setgen[i] = 0; 4298 } 4299 4300 ZFS_EXIT(zfsvfs); 4301 return (0); 4302 } 4303 4304 static int 4305 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4306 caller_context_t *ct) 4307 { 4308 znode_t *zp, *xzp; 4309 zfsvfs_t *zfsvfs; 4310 zfs_dirlock_t *dl; 4311 int error; 4312 4313 switch (cmd) { 4314 case _PC_LINK_MAX: 4315 *valp = ULONG_MAX; 4316 return (0); 4317 4318 case _PC_FILESIZEBITS: 4319 *valp = 64; 4320 return (0); 4321 4322 case _PC_XATTR_EXISTS: 4323 zp = VTOZ(vp); 4324 zfsvfs = zp->z_zfsvfs; 4325 ZFS_ENTER(zfsvfs); 4326 ZFS_VERIFY_ZP(zp); 4327 *valp = 0; 4328 error = zfs_dirent_lock(&dl, zp, "", &xzp, 4329 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 4330 if (error == 0) { 4331 zfs_dirent_unlock(dl); 4332 if (!zfs_dirempty(xzp)) 4333 *valp = 1; 4334 VN_RELE(ZTOV(xzp)); 4335 } else if (error == ENOENT) { 4336 /* 4337 * If there aren't extended attributes, it's the 4338 * same as having zero of them. 
4339 */ 4340 error = 0; 4341 } 4342 ZFS_EXIT(zfsvfs); 4343 return (error); 4344 4345 case _PC_SATTR_ENABLED: 4346 case _PC_SATTR_EXISTS: 4347 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4348 (vp->v_type == VREG || vp->v_type == VDIR); 4349 return (0); 4350 4351 case _PC_ACL_ENABLED: 4352 *valp = _ACL_ACE_ENABLED; 4353 return (0); 4354 4355 case _PC_MIN_HOLE_SIZE: 4356 *valp = (ulong_t)SPA_MINBLOCKSIZE; 4357 return (0); 4358 4359 default: 4360 return (fs_pathconf(vp, cmd, valp, cr, ct)); 4361 } 4362 } 4363 4364 /*ARGSUSED*/ 4365 static int 4366 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4367 caller_context_t *ct) 4368 { 4369 znode_t *zp = VTOZ(vp); 4370 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4371 int error; 4372 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4373 4374 ZFS_ENTER(zfsvfs); 4375 ZFS_VERIFY_ZP(zp); 4376 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4377 ZFS_EXIT(zfsvfs); 4378 4379 return (error); 4380 } 4381 4382 /*ARGSUSED*/ 4383 static int 4384 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4385 caller_context_t *ct) 4386 { 4387 znode_t *zp = VTOZ(vp); 4388 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4389 int error; 4390 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4391 4392 ZFS_ENTER(zfsvfs); 4393 ZFS_VERIFY_ZP(zp); 4394 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4395 ZFS_EXIT(zfsvfs); 4396 return (error); 4397 } 4398 4399 /* 4400 * Predeclare these here so that the compiler assumes that 4401 * this is an "old style" function declaration that does 4402 * not include arguments => we won't get type mismatch errors 4403 * in the initializations that follow. 
4404 */ 4405 static int zfs_inval(); 4406 static int zfs_isdir(); 4407 4408 static int 4409 zfs_inval() 4410 { 4411 return (EINVAL); 4412 } 4413 4414 static int 4415 zfs_isdir() 4416 { 4417 return (EISDIR); 4418 } 4419 /* 4420 * Directory vnode operations template 4421 */ 4422 vnodeops_t *zfs_dvnodeops; 4423 const fs_operation_def_t zfs_dvnodeops_template[] = { 4424 VOPNAME_OPEN, { .vop_open = zfs_open }, 4425 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4426 VOPNAME_READ, { .error = zfs_isdir }, 4427 VOPNAME_WRITE, { .error = zfs_isdir }, 4428 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4429 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4430 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4431 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4432 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4433 VOPNAME_CREATE, { .vop_create = zfs_create }, 4434 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 4435 VOPNAME_LINK, { .vop_link = zfs_link }, 4436 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4437 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 4438 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 4439 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 4440 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 4441 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4442 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4443 VOPNAME_FID, { .vop_fid = zfs_fid }, 4444 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4445 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4446 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4447 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4448 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4449 NULL, NULL 4450 }; 4451 4452 /* 4453 * Regular file vnode operations template 4454 */ 4455 vnodeops_t *zfs_fvnodeops; 4456 const fs_operation_def_t zfs_fvnodeops_template[] = { 4457 VOPNAME_OPEN, { .vop_open = zfs_open }, 4458 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4459 VOPNAME_READ, { .vop_read = zfs_read }, 4460 VOPNAME_WRITE, { .vop_write = 
zfs_write }, 4461 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4462 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4463 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4464 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4465 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4466 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4467 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4468 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4469 VOPNAME_FID, { .vop_fid = zfs_fid }, 4470 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4471 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 4472 VOPNAME_SPACE, { .vop_space = zfs_space }, 4473 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 4474 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 4475 VOPNAME_MAP, { .vop_map = zfs_map }, 4476 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 4477 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 4478 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4479 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4480 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4481 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4482 NULL, NULL 4483 }; 4484 4485 /* 4486 * Symbolic link vnode operations template 4487 */ 4488 vnodeops_t *zfs_symvnodeops; 4489 const fs_operation_def_t zfs_symvnodeops_template[] = { 4490 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4491 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4492 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4493 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4494 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 4495 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4496 VOPNAME_FID, { .vop_fid = zfs_fid }, 4497 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4498 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4499 NULL, NULL 4500 }; 4501 4502 /* 4503 * special share hidden files vnode operations template 4504 */ 4505 vnodeops_t *zfs_sharevnodeops; 4506 const fs_operation_def_t zfs_sharevnodeops_template[] = { 4507 
VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4508 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4509 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4510 VOPNAME_FID, { .vop_fid = zfs_fid }, 4511 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4512 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4513 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4514 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4515 NULL, NULL 4516 }; 4517 4518 /* 4519 * Extended attribute directory vnode operations template 4520 * This template is identical to the directory vnodes 4521 * operation template except for restricted operations: 4522 * VOP_MKDIR() 4523 * VOP_SYMLINK() 4524 * Note that there are other restrictions embedded in: 4525 * zfs_create() - restrict type to VREG 4526 * zfs_link() - no links into/out of attribute space 4527 * zfs_rename() - no moves into/out of attribute space 4528 */ 4529 vnodeops_t *zfs_xdvnodeops; 4530 const fs_operation_def_t zfs_xdvnodeops_template[] = { 4531 VOPNAME_OPEN, { .vop_open = zfs_open }, 4532 VOPNAME_CLOSE, { .vop_close = zfs_close }, 4533 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 4534 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 4535 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 4536 VOPNAME_ACCESS, { .vop_access = zfs_access }, 4537 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 4538 VOPNAME_CREATE, { .vop_create = zfs_create }, 4539 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 4540 VOPNAME_LINK, { .vop_link = zfs_link }, 4541 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 4542 VOPNAME_MKDIR, { .error = zfs_inval }, 4543 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 4544 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 4545 VOPNAME_SYMLINK, { .error = zfs_inval }, 4546 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 4547 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4548 VOPNAME_FID, { .vop_fid = zfs_fid }, 4549 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 4550 VOPNAME_PATHCONF, { .vop_pathconf = 
zfs_pathconf }, 4551 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 4552 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 4553 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 4554 NULL, NULL 4555 }; 4556 4557 /* 4558 * Error vnode operations template 4559 */ 4560 vnodeops_t *zfs_evnodeops; 4561 const fs_operation_def_t zfs_evnodeops_template[] = { 4562 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 4563 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 4564 NULL, NULL 4565 }; 4566