/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>
#include <sys/zfs_events.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work. To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory. The example below illustrates the following Big Rules:
 *
 * (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
 *	can return EIO from the calling function.
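 *
 *	As an illustration only (zfs_frob() is a made-up name, not a
 *	real VOP in this file), the minimal shape of rule (1) is:
 *
 *	static int
 *	zfs_frob(vnode_t *vp)
 *	{
 *		znode_t *zp = VTOZ(vp);
 *		zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 *
 *		ZFS_ENTER(zfsvfs);	// returns EIO if unmounted
 *		ZFS_VERIFY_ZP(zp);	// returns EIO if znode is stale
 *		... do the work ...
 *		ZFS_EXIT(zfsvfs);
 *		return (0);
 *	}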
 *
 * (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory. Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes. Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 * (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 * (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system. The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again. On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 * (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks. This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 * (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 * (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	if (ZTOV(zp)->v_type == VREG && zp->z_new_content) {
		zp->z_new_content = 0;
		rw_enter(&rz_zev_rwlock, RW_READER);
		if (rz_zev_callbacks &&
		    rz_zev_callbacks->rz_zev_znode_close_after_update)
			rz_zev_callbacks->rz_zev_znode_close_after_update(zp);
		rw_exit(&rz_zev_rwlock);
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

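	/*
	 * dmu_offset_next() returns ESRCH once it has scanned past the
	 * last allocated block, so ESRCH (or an offset beyond the file
	 * size) is treated as the virtual hole at end-of-file below.
	 */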
	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (SET_ERROR(ENXIO));
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu. Faking out,
		 * necessary to avoid bfu errors.
		 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page. These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages. What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages. What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
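 *
 * A sketch of the loop below (descriptive only, no extra logic):
 *
 *	for each PAGESIZE span of the request:
 *		if (page_lookup() finds a cached page)
 *			uiomove() out of the mapped page;
 *		else
 *			dmu_read_uio() out of the DMU cache;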
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio(os, zp->z_id, uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
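	 * (A page fault taken while the data is being copied inside an
	 * assigned tx would stall the open txg for everyone; faulting
	 * the pages in now avoids that.)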
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics. We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks. Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
again:
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block. "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction. This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
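		 * Per Big Rule (4): hold everything this chunk may dirty
		 * (the SA for size/time updates plus the write range), then
		 * use a non-blocking assign, restarting at `again' if the
		 * assign returns ERESTART.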
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto again;
			}
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range. This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf(). Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done. If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
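		 *
		 * (Clearing the set-id bits when an unprivileged process
		 * writes is the traditional guard against a set-id binary
		 * being rewritten with different contents.)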
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
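		 * (A synchronous VN_RELE() here could drop the last hold
		 * and run zfs_zinactive(), which may need a new tx; see
		 * Big Rule (2) at the top of this file.)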
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf. We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
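 * Reads and writes on a device special file must go through specfs to
 * the driver rather than to the file's own data, so we trade our hold
 * on the original vnode for a held specfs vnode from specvp().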
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp	- returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error = 0;

	/* fast path */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
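	 * (For a directory, ACE_EXECUTE is the search/traverse permission.)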
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory. If the entry
 * already exists, truncate the file if permissible, else return
 * an error. Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
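		 * (zfs_freesp() assigns its own tx and takes range locks,
		 * so the dirent lock must be dropped first; see Big Rule
		 * (3) and the comment just below.)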
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode. So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (xzp)
			VN_RELE(ZTOV(xzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {

		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed. Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
		    acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(xzp->z_links, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &xzp->z_links, sizeof (xzp->z_links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		mutex_enter(&vp->v_lock);
		vp->v_count--;
		ASSERT0(vp->v_count);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now)
		VN_RELE(vp);
	if (xzp)
		VN_RELE(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided. Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry. If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 *	IN:	dvp	- vnode of directory to remove from.
 *		name	- name of directory to be removed.
 *		cwd	- vnode of current working directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	if (vp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
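		 * Offsets 0-3 are reserved for the synthetic entries
		 * handled above; anything larger came from
		 * zap_cursor_serialize() on a previous call, so iteration
		 * resumes statelessly from that position.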
2192 */ 2193 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2194 } 2195 2196 /* 2197 * Get space to change directory entries into fs independent format. 2198 */ 2199 iovp = uio->uio_iov; 2200 bytes_wanted = iovp->iov_len; 2201 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2202 bufsize = bytes_wanted; 2203 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2204 odp = (struct dirent64 *)outbuf; 2205 } else { 2206 bufsize = bytes_wanted; 2207 outbuf = NULL; 2208 odp = (struct dirent64 *)iovp->iov_base; 2209 } 2210 eodp = (struct edirent *)odp; 2211 2212 /* 2213 * If this VFS supports the system attribute view interface; and 2214 * we're looking at an extended attribute directory; and we care 2215 * about normalization conflicts on this vfs; then we must check 2216 * for normalization conflicts with the sysattr name space. 2217 */ 2218 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2219 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2220 (flags & V_RDDIR_ENTFLAGS); 2221 2222 /* 2223 * Transform to file-system independent format 2224 */ 2225 outcount = 0; 2226 while (outcount < bytes_wanted) { 2227 ino64_t objnum; 2228 ushort_t reclen; 2229 off64_t *next = NULL; 2230 2231 /* 2232 * Special case `.', `..', and `.zfs'. 2233 */ 2234 if (offset == 0) { 2235 (void) strcpy(zap.za_name, "."); 2236 zap.za_normalization_conflict = 0; 2237 objnum = zp->z_id; 2238 } else if (offset == 1) { 2239 (void) strcpy(zap.za_name, ".."); 2240 zap.za_normalization_conflict = 0; 2241 objnum = parent; 2242 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2243 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2244 zap.za_normalization_conflict = 0; 2245 objnum = ZFSCTL_INO_ROOT; 2246 } else { 2247 /* 2248 * Grab next entry. 2249 */ 2250 if (error = zap_cursor_retrieve(&zc, &zap)) { 2251 if ((*eofp = (error == ENOENT)) != 0) 2252 break; 2253 else 2254 goto update; 2255 } 2256 2257 if (zap.za_integer_length != 8 || 2258 zap.za_num_integers != 1) { 2259 cmn_err(CE_WARN, "zap_readdir: bad directory " 2260 "entry, obj = %lld, offset = %lld\n", 2261 (u_longlong_t)zp->z_id, 2262 (u_longlong_t)offset); 2263 error = SET_ERROR(ENXIO); 2264 goto update; 2265 } 2266 2267 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2268 /* 2269 * MacOS X can extract the object type here such as: 2270 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2271 */ 2272 2273 if (check_sysattrs && !zap.za_normalization_conflict) { 2274 zap.za_normalization_conflict = 2275 xattr_sysattr_casechk(zap.za_name); 2276 } 2277 } 2278 2279 if (flags & V_RDDIR_ACCFILTER) { 2280 /* 2281 * If we have no access at all, don't include 2282 * this entry in the returned information 2283 */ 2284 znode_t *ezp; 2285 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2286 goto skip_entry; 2287 if (!zfs_has_access(ezp, cr)) { 2288 VN_RELE(ZTOV(ezp)); 2289 goto skip_entry; 2290 } 2291 VN_RELE(ZTOV(ezp)); 2292 } 2293 2294 if (flags & V_RDDIR_ENTFLAGS) 2295 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2296 else 2297 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2298 2299 /* 2300 * Will this entry fit in the buffer? 2301 */ 2302 if (outcount + reclen > bufsize) { 2303 /* 2304 * Did we manage to fit anything in the buffer? 
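			 * If not even a single entry fit, the caller's
			 * buffer cannot hold this entry's record, so failing
			 * with EINVAL (below, while outcount is still 0) is
			 * the only way to avoid looping forever.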
2305 */ 2306 if (!outcount) { 2307 error = SET_ERROR(EINVAL); 2308 goto update; 2309 } 2310 break; 2311 } 2312 if (flags & V_RDDIR_ENTFLAGS) { 2313 /* 2314 * Add extended flag entry: 2315 */ 2316 eodp->ed_ino = objnum; 2317 eodp->ed_reclen = reclen; 2318 /* NOTE: ed_off is the offset for the *next* entry */ 2319 next = &(eodp->ed_off); 2320 eodp->ed_eflags = zap.za_normalization_conflict ? 2321 ED_CASE_CONFLICT : 0; 2322 (void) strncpy(eodp->ed_name, zap.za_name, 2323 EDIRENT_NAMELEN(reclen)); 2324 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2325 } else { 2326 /* 2327 * Add normal entry: 2328 */ 2329 odp->d_ino = objnum; 2330 odp->d_reclen = reclen; 2331 /* NOTE: d_off is the offset for the *next* entry */ 2332 next = &(odp->d_off); 2333 (void) strncpy(odp->d_name, zap.za_name, 2334 DIRENT64_NAMELEN(reclen)); 2335 odp = (dirent64_t *)((intptr_t)odp + reclen); 2336 } 2337 outcount += reclen; 2338 2339 ASSERT(outcount <= bufsize); 2340 2341 /* Prefetch znode */ 2342 if (prefetch) 2343 dmu_prefetch(os, objnum, 0, 0); 2344 2345 skip_entry: 2346 /* 2347 * Move to the next entry, fill in the previous offset. 2348 */ 2349 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2350 zap_cursor_advance(&zc); 2351 offset = zap_cursor_serialize(&zc); 2352 } else { 2353 offset += 1; 2354 } 2355 if (next) 2356 *next = offset; 2357 } 2358 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2359 2360 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2361 iovp->iov_base += outcount; 2362 iovp->iov_len -= outcount; 2363 uio->uio_resid -= outcount; 2364 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2365 /* 2366 * Reset the pointer. 2367 */ 2368 offset = uio->uio_loffset; 2369 } 2370 2371 update: 2372 zap_cursor_fini(&zc); 2373 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2374 kmem_free(outbuf, bufsize); 2375 2376 if (error == ENOENT) 2377 error = 0; 2378 2379 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2380 2381 uio->uio_loffset = offset; 2382 ZFS_EXIT(zfsvfs); 2383 return (error); 2384 } 2385 2386 ulong_t zfs_fsync_sync_cnt = 4; 2387 2388 static int 2389 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2390 { 2391 znode_t *zp = VTOZ(vp); 2392 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2393 2394 /* 2395 * Regardless of whether this is required for standards conformance, 2396 * this is the logical behavior when fsync() is called on a file with 2397 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2398 * going to be pushed out as part of the zil_commit(). 2399 */ 2400 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2401 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2402 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2403 2404 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2405 2406 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2407 ZFS_ENTER(zfsvfs); 2408 ZFS_VERIFY_ZP(zp); 2409 zil_commit(zfsvfs->z_log, zp->z_id); 2410 ZFS_EXIT(zfsvfs); 2411 } 2412 return (0); 2413 } 2414 2415 2416 /* 2417 * Get the requested file attributes and place them in the provided 2418 * vattr structure. 2419 * 2420 * IN: vp - vnode of file. 2421 * vap - va_mask identifies requested attributes. 2422 * If AT_XVATTR set, then optional attrs are requested 2423 * flags - ATTR_NOACLCHECK (CIFS server context) 2424 * cr - credentials of caller. 2425 * ct - caller context 2426 * 2427 * OUT: vap - attribute values. 2428 * 2429 * RETURN: 0 (always succeeds). 
2430 */ 2431 /* ARGSUSED */ 2432 static int 2433 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2434 caller_context_t *ct) 2435 { 2436 znode_t *zp = VTOZ(vp); 2437 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2438 int error = 0; 2439 uint64_t links; 2440 uint64_t mtime[2], ctime[2]; 2441 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2442 xoptattr_t *xoap = NULL; 2443 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2444 sa_bulk_attr_t bulk[2]; 2445 int count = 0; 2446 2447 ZFS_ENTER(zfsvfs); 2448 ZFS_VERIFY_ZP(zp); 2449 2450 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2451 2452 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2453 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2454 2455 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2456 ZFS_EXIT(zfsvfs); 2457 return (error); 2458 } 2459 2460 /* 2461 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2462 * Also, if we are the owner don't bother, since owner should 2463 * always be allowed to read basic attributes of file. 2464 */ 2465 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2466 (vap->va_uid != crgetuid(cr))) { 2467 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2468 skipaclchk, cr)) { 2469 ZFS_EXIT(zfsvfs); 2470 return (error); 2471 } 2472 } 2473 2474 /* 2475 * Return all attributes. It's cheaper to provide the answer 2476 * than to determine whether we were asked the question. 2477 */ 2478 2479 mutex_enter(&zp->z_lock); 2480 vap->va_type = vp->v_type; 2481 vap->va_mode = zp->z_mode & MODEMASK; 2482 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2483 vap->va_nodeid = zp->z_id; 2484 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2485 links = zp->z_links + 1; 2486 else 2487 links = zp->z_links; 2488 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2489 vap->va_size = zp->z_size; 2490 vap->va_rdev = vp->v_rdev; 2491 vap->va_seq = zp->z_seq; 2492 2493 /* 2494 * Add in any requested optional attributes and the create time. 2495 * Also set the corresponding bits in the returned attribute bitmap. 
2496 */ 2497 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2498 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2499 xoap->xoa_archive = 2500 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2501 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2502 } 2503 2504 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2505 xoap->xoa_readonly = 2506 ((zp->z_pflags & ZFS_READONLY) != 0); 2507 XVA_SET_RTN(xvap, XAT_READONLY); 2508 } 2509 2510 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2511 xoap->xoa_system = 2512 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2513 XVA_SET_RTN(xvap, XAT_SYSTEM); 2514 } 2515 2516 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2517 xoap->xoa_hidden = 2518 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2519 XVA_SET_RTN(xvap, XAT_HIDDEN); 2520 } 2521 2522 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2523 xoap->xoa_nounlink = 2524 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2525 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2526 } 2527 2528 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2529 xoap->xoa_immutable = 2530 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2531 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2532 } 2533 2534 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2535 xoap->xoa_appendonly = 2536 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2537 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2538 } 2539 2540 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2541 xoap->xoa_nodump = 2542 ((zp->z_pflags & ZFS_NODUMP) != 0); 2543 XVA_SET_RTN(xvap, XAT_NODUMP); 2544 } 2545 2546 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2547 xoap->xoa_opaque = 2548 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2549 XVA_SET_RTN(xvap, XAT_OPAQUE); 2550 } 2551 2552 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2553 xoap->xoa_av_quarantined = 2554 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2555 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2556 } 2557 2558 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2559 xoap->xoa_av_modified = 2560 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2561 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2562 } 2563 2564 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2565 vp->v_type == VREG) { 2566 zfs_sa_get_scanstamp(zp, xvap); 2567 } 2568 2569 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2570 uint64_t times[2]; 2571 2572 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 2573 times, sizeof (times)); 2574 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 2575 XVA_SET_RTN(xvap, XAT_CREATETIME); 2576 } 2577 2578 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2579 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2580 XVA_SET_RTN(xvap, XAT_REPARSE); 2581 } 2582 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2583 xoap->xoa_generation = zp->z_gen; 2584 XVA_SET_RTN(xvap, XAT_GEN); 2585 } 2586 2587 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2588 xoap->xoa_offline = 2589 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2590 XVA_SET_RTN(xvap, XAT_OFFLINE); 2591 } 2592 2593 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2594 xoap->xoa_sparse = 2595 ((zp->z_pflags & ZFS_SPARSE) != 0); 2596 XVA_SET_RTN(xvap, XAT_SPARSE); 2597 } 2598 } 2599 2600 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2601 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2602 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2603 2604 mutex_exit(&zp->z_lock); 2605 2606 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); 2607 2608 if (zp->z_blksz == 0) { 2609 /* 2610 * Block size hasn't been set; suggest maximal I/O transfers. 2611 */ 2612 vap->va_blksize = zfsvfs->z_max_blksz; 2613 } 2614 2615 ZFS_EXIT(zfsvfs); 2616 return (0); 2617 } 2618 2619 /* 2620 * Set the file attributes to the values contained in the 2621 * vattr structure. 2622 * 2623 * IN: vp - vnode of file to be modified. 
 *		vap	- new attribute values.
 *			  If AT_XVATTR set, then optional attrs are being set
 *		flags	- ATTR_UTIME set if non-default time values provided.
 *			- ATTR_NOACLCHECK (CIFS context only).
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	xvattr_t	tmpxvattr;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask = 0;
	int		trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_uid, new_gid;
	uint64_t	xattr_obj;
	uint64_t	mtime[2], ctime[2];
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err, err2;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp;
	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t	fuid_dirtied = B_FALSE;
	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
	int		count = 0, xattr_count = 0;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have an ephemeral uid/gid or an xvattr
	 * specified, the file system is at the proper version level.
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EISDIR));
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * On immutable files only the immutable bit and atime may be altered.
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Verify that the timestamps don't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32-bit syscalls can't
	 * handle times beyond 2038.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EOVERFLOW));
		}
	}

top:
	attrzp = NULL;
	aclp = NULL;

	/* Can this be moved to before the top label?
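	 * (The check reads only zfsvfs->z_vfs->vfs_flag, so it does not
	 * appear to depend on anything assigned inside the retry loop.)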
*/ 2742 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2743 ZFS_EXIT(zfsvfs); 2744 return (SET_ERROR(EROFS)); 2745 } 2746 2747 /* 2748 * First validate permissions 2749 */ 2750 2751 if (mask & AT_SIZE) { 2752 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2753 if (err) { 2754 ZFS_EXIT(zfsvfs); 2755 return (err); 2756 } 2757 /* 2758 * XXX - Note, we are not providing any open 2759 * mode flags here (like FNDELAY), so we may 2760 * block if there are locks present... this 2761 * should be addressed in openat(). 2762 */ 2763 /* XXX - would it be OK to generate a log record here? */ 2764 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2765 if (err) { 2766 ZFS_EXIT(zfsvfs); 2767 return (err); 2768 } 2769 } 2770 2771 if (mask & (AT_ATIME|AT_MTIME) || 2772 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2773 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2774 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2775 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2776 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2777 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2778 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2779 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2780 skipaclchk, cr); 2781 } 2782 2783 if (mask & (AT_UID|AT_GID)) { 2784 int idmask = (mask & (AT_UID|AT_GID)); 2785 int take_owner; 2786 int take_group; 2787 2788 /* 2789 * NOTE: even if a new mode is being set, 2790 * we may clear S_ISUID/S_ISGID bits. 2791 */ 2792 2793 if (!(mask & AT_MODE)) 2794 vap->va_mode = zp->z_mode; 2795 2796 /* 2797 * Take ownership or chgrp to group we are a member of 2798 */ 2799 2800 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2801 take_group = (mask & AT_GID) && 2802 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2803 2804 /* 2805 * If both AT_UID and AT_GID are set then take_owner and 2806 * take_group must both be set in order to allow taking 2807 * ownership. 2808 * 2809 * Otherwise, send the check through secpolicy_vnode_setattr() 2810 * 2811 */ 2812 2813 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2814 ((idmask == AT_UID) && take_owner) || 2815 ((idmask == AT_GID) && take_group)) { 2816 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2817 skipaclchk, cr) == 0) { 2818 /* 2819 * Remove setuid/setgid for non-privileged users 2820 */ 2821 secpolicy_setid_clear(vap, cr); 2822 trim_mask = (mask & (AT_UID|AT_GID)); 2823 } else { 2824 need_policy = TRUE; 2825 } 2826 } else { 2827 need_policy = TRUE; 2828 } 2829 } 2830 2831 mutex_enter(&zp->z_lock); 2832 oldva.va_mode = zp->z_mode; 2833 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2834 if (mask & AT_XVATTR) { 2835 /* 2836 * Update xvattr mask to include only those attributes 2837 * that are actually changing. 2838 * 2839 * the bits will be restored prior to actually setting 2840 * the attributes so the caller thinks they were set. 
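		 * For example: if XAT_APPENDONLY is requested but already
		 * matches the on-disk ZFS_APPENDONLY flag, the request bit
		 * is cleared here (and remembered in tmpxvattr) so that no
		 * policy check is needed, then re-set near the end of this
		 * function, before zfs_xvattr_set(), so the caller still
		 * sees it as set.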
2841 */ 2842 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2843 if (xoap->xoa_appendonly != 2844 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2845 need_policy = TRUE; 2846 } else { 2847 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2848 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 2849 } 2850 } 2851 2852 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2853 if (xoap->xoa_nounlink != 2854 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2855 need_policy = TRUE; 2856 } else { 2857 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2858 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 2859 } 2860 } 2861 2862 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2863 if (xoap->xoa_immutable != 2864 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2865 need_policy = TRUE; 2866 } else { 2867 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2868 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 2869 } 2870 } 2871 2872 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2873 if (xoap->xoa_nodump != 2874 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2875 need_policy = TRUE; 2876 } else { 2877 XVA_CLR_REQ(xvap, XAT_NODUMP); 2878 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 2879 } 2880 } 2881 2882 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2883 if (xoap->xoa_av_modified != 2884 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2885 need_policy = TRUE; 2886 } else { 2887 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2888 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 2889 } 2890 } 2891 2892 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2893 if ((vp->v_type != VREG && 2894 xoap->xoa_av_quarantined) || 2895 xoap->xoa_av_quarantined != 2896 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2897 need_policy = TRUE; 2898 } else { 2899 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2900 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 2901 } 2902 } 2903 2904 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2905 mutex_exit(&zp->z_lock); 2906 ZFS_EXIT(zfsvfs); 2907 return (SET_ERROR(EPERM)); 2908 } 2909 2910 if (need_policy == FALSE && 2911 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2912 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2913 need_policy = TRUE; 2914 } 2915 } 2916 2917 mutex_exit(&zp->z_lock); 2918 2919 if (mask & AT_MODE) { 2920 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2921 err = secpolicy_setid_setsticky_clear(vp, vap, 2922 &oldva, cr); 2923 if (err) { 2924 ZFS_EXIT(zfsvfs); 2925 return (err); 2926 } 2927 trim_mask |= AT_MODE; 2928 } else { 2929 need_policy = TRUE; 2930 } 2931 } 2932 2933 if (need_policy) { 2934 /* 2935 * If trim_mask is set then take ownership 2936 * has been granted or write_acl is present and user 2937 * has the ability to modify mode. In that case remove 2938 * UID|GID and or MODE from mask so that 2939 * secpolicy_vnode_setattr() doesn't revoke it. 
2940 */ 2941 2942 if (trim_mask) { 2943 saved_mask = vap->va_mask; 2944 vap->va_mask &= ~trim_mask; 2945 } 2946 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2947 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2948 if (err) { 2949 ZFS_EXIT(zfsvfs); 2950 return (err); 2951 } 2952 2953 if (trim_mask) 2954 vap->va_mask |= saved_mask; 2955 } 2956 2957 /* 2958 * secpolicy_vnode_setattr, or take ownership may have 2959 * changed va_mask 2960 */ 2961 mask = vap->va_mask; 2962 2963 if ((mask & (AT_UID | AT_GID))) { 2964 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2965 &xattr_obj, sizeof (xattr_obj)); 2966 2967 if (err == 0 && xattr_obj) { 2968 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 2969 if (err) 2970 goto out2; 2971 } 2972 if (mask & AT_UID) { 2973 new_uid = zfs_fuid_create(zfsvfs, 2974 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2975 if (new_uid != zp->z_uid && 2976 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 2977 if (attrzp) 2978 VN_RELE(ZTOV(attrzp)); 2979 err = SET_ERROR(EDQUOT); 2980 goto out2; 2981 } 2982 } 2983 2984 if (mask & AT_GID) { 2985 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 2986 cr, ZFS_GROUP, &fuidp); 2987 if (new_gid != zp->z_gid && 2988 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 2989 if (attrzp) 2990 VN_RELE(ZTOV(attrzp)); 2991 err = SET_ERROR(EDQUOT); 2992 goto out2; 2993 } 2994 } 2995 } 2996 tx = dmu_tx_create(zfsvfs->z_os); 2997 2998 if (mask & AT_MODE) { 2999 uint64_t pmode = zp->z_mode; 3000 uint64_t acl_obj; 3001 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3002 3003 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3004 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3005 err = SET_ERROR(EPERM); 3006 goto out; 3007 } 3008 3009 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3010 goto out; 3011 3012 mutex_enter(&zp->z_lock); 3013 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3014 /* 3015 * Are we upgrading ACL from old V0 format 3016 * to V1 format? 3017 */ 3018 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3019 zfs_znode_acl_version(zp) == 3020 ZFS_ACL_VERSION_INITIAL) { 3021 dmu_tx_hold_free(tx, acl_obj, 0, 3022 DMU_OBJECT_END); 3023 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3024 0, aclp->z_acl_bytes); 3025 } else { 3026 dmu_tx_hold_write(tx, acl_obj, 0, 3027 aclp->z_acl_bytes); 3028 } 3029 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3030 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3031 0, aclp->z_acl_bytes); 3032 } 3033 mutex_exit(&zp->z_lock); 3034 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3035 } else { 3036 if ((mask & AT_XVATTR) && 3037 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3038 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3039 else 3040 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3041 } 3042 3043 if (attrzp) { 3044 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3045 } 3046 3047 fuid_dirtied = zfsvfs->z_fuid_dirty; 3048 if (fuid_dirtied) 3049 zfs_fuid_txhold(zfsvfs, tx); 3050 3051 zfs_sa_upgrade_txholds(tx, zp); 3052 3053 err = dmu_tx_assign(tx, TXG_NOWAIT); 3054 if (err) { 3055 if (err == ERESTART) 3056 dmu_tx_wait(tx); 3057 goto out; 3058 } 3059 3060 count = 0; 3061 /* 3062 * Set each attribute requested. 3063 * We group settings according to the locks they need to acquire. 3064 * 3065 * Note: you cannot set ctime directly, although it will be 3066 * updated as a side-effect of calling this function. 
3067 */ 3068 3069 3070 if (mask & (AT_UID|AT_GID|AT_MODE)) 3071 mutex_enter(&zp->z_acl_lock); 3072 mutex_enter(&zp->z_lock); 3073 3074 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3075 &zp->z_pflags, sizeof (zp->z_pflags)); 3076 3077 if (attrzp) { 3078 if (mask & (AT_UID|AT_GID|AT_MODE)) 3079 mutex_enter(&attrzp->z_acl_lock); 3080 mutex_enter(&attrzp->z_lock); 3081 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3082 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3083 sizeof (attrzp->z_pflags)); 3084 } 3085 3086 if (mask & (AT_UID|AT_GID)) { 3087 3088 if (mask & AT_UID) { 3089 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3090 &new_uid, sizeof (new_uid)); 3091 zp->z_uid = new_uid; 3092 if (attrzp) { 3093 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3094 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3095 sizeof (new_uid)); 3096 attrzp->z_uid = new_uid; 3097 } 3098 } 3099 3100 if (mask & AT_GID) { 3101 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3102 NULL, &new_gid, sizeof (new_gid)); 3103 zp->z_gid = new_gid; 3104 if (attrzp) { 3105 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3106 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3107 sizeof (new_gid)); 3108 attrzp->z_gid = new_gid; 3109 } 3110 } 3111 if (!(mask & AT_MODE)) { 3112 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3113 NULL, &new_mode, sizeof (new_mode)); 3114 new_mode = zp->z_mode; 3115 } 3116 err = zfs_acl_chown_setattr(zp); 3117 ASSERT(err == 0); 3118 if (attrzp) { 3119 err = zfs_acl_chown_setattr(attrzp); 3120 ASSERT(err == 0); 3121 } 3122 } 3123 3124 if (mask & AT_MODE) { 3125 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3126 &new_mode, sizeof (new_mode)); 3127 zp->z_mode = new_mode; 3128 ASSERT3U((uintptr_t)aclp, !=, NULL); 3129 err = zfs_aclset_common(zp, aclp, cr, tx); 3130 ASSERT0(err); 3131 if (zp->z_acl_cached) 3132 zfs_acl_free(zp->z_acl_cached); 3133 zp->z_acl_cached = aclp; 3134 aclp = NULL; 3135 } 3136 3137 3138 if (mask & AT_ATIME) { 3139 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3140 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3141 &zp->z_atime, sizeof (zp->z_atime)); 3142 } 3143 3144 if (mask & AT_MTIME) { 3145 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3146 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3147 mtime, sizeof (mtime)); 3148 } 3149 3150 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3151 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3152 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3153 NULL, mtime, sizeof (mtime)); 3154 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3155 &ctime, sizeof (ctime)); 3156 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3157 B_TRUE); 3158 } else if (mask != 0) { 3159 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3160 &ctime, sizeof (ctime)); 3161 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3162 B_TRUE); 3163 if (attrzp) { 3164 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3165 SA_ZPL_CTIME(zfsvfs), NULL, 3166 &ctime, sizeof (ctime)); 3167 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3168 mtime, ctime, B_TRUE); 3169 } 3170 } 3171 /* 3172 * Do this after setting timestamps to prevent timestamp 3173 * update from toggling bit 3174 */ 3175 3176 if (xoap && (mask & AT_XVATTR)) { 3177 3178 /* 3179 * restore trimmed off masks 3180 * so that return masks can be set for caller. 
3181 */ 3182 3183 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3184 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3185 } 3186 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3187 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3188 } 3189 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3190 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3191 } 3192 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3193 XVA_SET_REQ(xvap, XAT_NODUMP); 3194 } 3195 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3196 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3197 } 3198 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3199 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3200 } 3201 3202 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3203 ASSERT(vp->v_type == VREG); 3204 3205 zfs_xvattr_set(zp, xvap, tx); 3206 } 3207 3208 if (fuid_dirtied) 3209 zfs_fuid_sync(zfsvfs, tx); 3210 3211 if (mask != 0) 3212 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3213 3214 mutex_exit(&zp->z_lock); 3215 if (mask & (AT_UID|AT_GID|AT_MODE)) 3216 mutex_exit(&zp->z_acl_lock); 3217 3218 if (attrzp) { 3219 if (mask & (AT_UID|AT_GID|AT_MODE)) 3220 mutex_exit(&attrzp->z_acl_lock); 3221 mutex_exit(&attrzp->z_lock); 3222 } 3223 out: 3224 if (err == 0 && attrzp) { 3225 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3226 xattr_count, tx); 3227 ASSERT(err2 == 0); 3228 } 3229 3230 if (attrzp) 3231 VN_RELE(ZTOV(attrzp)); 3232 3233 if (aclp) 3234 zfs_acl_free(aclp); 3235 3236 if (fuidp) { 3237 zfs_fuid_info_free(fuidp); 3238 fuidp = NULL; 3239 } 3240 3241 if (err) { 3242 dmu_tx_abort(tx); 3243 if (err == ERESTART) 3244 goto top; 3245 } else { 3246 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3247 dmu_tx_commit(tx); 3248 } 3249 3250 out2: 3251 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3252 zil_commit(zilog, 0); 3253 3254 ZFS_EXIT(zfsvfs); 3255 return (err); 3256 } 3257 3258 typedef struct zfs_zlock { 3259 krwlock_t *zl_rwlock; /* lock we acquired */ 3260 znode_t *zl_znode; /* znode we held */ 3261 struct zfs_zlock *zl_next; /* next in list */ 3262 } zfs_zlock_t; 3263 3264 /* 3265 * Drop locks and release vnodes that were held by zfs_rename_lock(). 3266 */ 3267 static void 3268 zfs_rename_unlock(zfs_zlock_t **zlpp) 3269 { 3270 zfs_zlock_t *zl; 3271 3272 while ((zl = *zlpp) != NULL) { 3273 if (zl->zl_znode != NULL) 3274 VN_RELE(ZTOV(zl->zl_znode)); 3275 rw_exit(zl->zl_rwlock); 3276 *zlpp = zl->zl_next; 3277 kmem_free(zl, sizeof (*zl)); 3278 } 3279 } 3280 3281 /* 3282 * Search back through the directory tree, using the ".." entries. 3283 * Lock each directory in the chain to prevent concurrent renames. 3284 * Fail any attempt to move a directory into one of its own descendants. 3285 * XXX - z_parent_lock can overlap with map or grow locks 3286 */ 3287 static int 3288 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3289 { 3290 zfs_zlock_t *zl; 3291 znode_t *zp = tdzp; 3292 uint64_t rootid = zp->z_zfsvfs->z_root; 3293 uint64_t oidp = zp->z_id; 3294 krwlock_t *rwlp = &szp->z_parent_lock; 3295 krw_t rw = RW_WRITER; 3296 3297 /* 3298 * First pass write-locks szp and compares to zp->z_id. 3299 * Later passes read-lock zp and compare to zp->z_parent. 3300 */ 3301 do { 3302 if (!rw_tryenter(rwlp, rw)) { 3303 /* 3304 * Another thread is renaming in this path. 3305 * Note that if we are a WRITER, we don't have any 3306 * parent_locks held yet. 
3307 */ 3308 if (rw == RW_READER && zp->z_id > szp->z_id) { 3309 /* 3310 * Drop our locks and restart 3311 */ 3312 zfs_rename_unlock(&zl); 3313 *zlpp = NULL; 3314 zp = tdzp; 3315 oidp = zp->z_id; 3316 rwlp = &szp->z_parent_lock; 3317 rw = RW_WRITER; 3318 continue; 3319 } else { 3320 /* 3321 * Wait for other thread to drop its locks 3322 */ 3323 rw_enter(rwlp, rw); 3324 } 3325 } 3326 3327 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3328 zl->zl_rwlock = rwlp; 3329 zl->zl_znode = NULL; 3330 zl->zl_next = *zlpp; 3331 *zlpp = zl; 3332 3333 if (oidp == szp->z_id) /* We're a descendant of szp */ 3334 return (SET_ERROR(EINVAL)); 3335 3336 if (oidp == rootid) /* We've hit the top */ 3337 return (0); 3338 3339 if (rw == RW_READER) { /* i.e. not the first pass */ 3340 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3341 if (error) 3342 return (error); 3343 zl->zl_znode = zp; 3344 } 3345 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3346 &oidp, sizeof (oidp)); 3347 rwlp = &zp->z_parent_lock; 3348 rw = RW_READER; 3349 3350 } while (zp->z_id != sdzp->z_id); 3351 3352 return (0); 3353 } 3354 3355 /* 3356 * Move an entry from the provided source directory to the target 3357 * directory. Change the entry name as indicated. 3358 * 3359 * IN: sdvp - Source directory containing the "old entry". 3360 * snm - Old entry name. 3361 * tdvp - Target directory to contain the "new entry". 3362 * tnm - New entry name. 3363 * cr - credentials of caller. 3364 * ct - caller context 3365 * flags - case flags 3366 * 3367 * RETURN: 0 on success, error code on failure. 3368 * 3369 * Timestamps: 3370 * sdvp,tdvp - ctime|mtime updated 3371 */ 3372 /*ARGSUSED*/ 3373 static int 3374 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3375 caller_context_t *ct, int flags) 3376 { 3377 znode_t *tdzp, *szp, *tzp; 3378 znode_t *sdzp = VTOZ(sdvp); 3379 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3380 zilog_t *zilog; 3381 vnode_t *realvp; 3382 zfs_dirlock_t *sdl, *tdl; 3383 dmu_tx_t *tx; 3384 zfs_zlock_t *zl; 3385 int cmp, serr, terr; 3386 int error = 0; 3387 int zflg = 0; 3388 boolean_t waited = B_FALSE; 3389 3390 ZFS_ENTER(zfsvfs); 3391 ZFS_VERIFY_ZP(sdzp); 3392 zilog = zfsvfs->z_log; 3393 3394 /* 3395 * Make sure we have the real vp for the target directory. 3396 */ 3397 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3398 tdvp = realvp; 3399 3400 tdzp = VTOZ(tdvp); 3401 ZFS_VERIFY_ZP(tdzp); 3402 3403 /* 3404 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3405 * ctldir appear to have the same v_vfsp. 3406 */ 3407 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3408 ZFS_EXIT(zfsvfs); 3409 return (SET_ERROR(EXDEV)); 3410 } 3411 3412 if (zfsvfs->z_utf8 && u8_validate(tnm, 3413 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3414 ZFS_EXIT(zfsvfs); 3415 return (SET_ERROR(EILSEQ)); 3416 } 3417 3418 if (flags & FIGNORECASE) 3419 zflg |= ZCILOOK; 3420 3421 top: 3422 szp = NULL; 3423 tzp = NULL; 3424 zl = NULL; 3425 3426 /* 3427 * This is to prevent the creation of links into attribute space 3428 * by renaming a linked file into/outof an attribute directory. 3429 * See the comment in zfs_link() for why this is considered bad. 3430 */ 3431 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3432 ZFS_EXIT(zfsvfs); 3433 return (SET_ERROR(EINVAL)); 3434 } 3435 3436 /* 3437 * Lock source and target directory entries. To prevent deadlock, 3438 * a lock ordering must be defined. 
We lock the directory with 3439 * the smallest object id first, or if it's a tie, the one with 3440 * the lexically first name. 3441 */ 3442 if (sdzp->z_id < tdzp->z_id) { 3443 cmp = -1; 3444 } else if (sdzp->z_id > tdzp->z_id) { 3445 cmp = 1; 3446 } else { 3447 /* 3448 * First compare the two name arguments without 3449 * considering any case folding. 3450 */ 3451 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3452 3453 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3454 ASSERT(error == 0 || !zfsvfs->z_utf8); 3455 if (cmp == 0) { 3456 /* 3457 * POSIX: "If the old argument and the new argument 3458 * both refer to links to the same existing file, 3459 * the rename() function shall return successfully 3460 * and perform no other action." 3461 */ 3462 ZFS_EXIT(zfsvfs); 3463 return (0); 3464 } 3465 /* 3466 * If the file system is case-folding, then we may 3467 * have some more checking to do. A case-folding file 3468 * system is either supporting mixed case sensitivity 3469 * access or is completely case-insensitive. Note 3470 * that the file system is always case preserving. 3471 * 3472 * In mixed sensitivity mode case sensitive behavior 3473 * is the default. FIGNORECASE must be used to 3474 * explicitly request case insensitive behavior. 3475 * 3476 * If the source and target names provided differ only 3477 * by case (e.g., a request to rename 'tim' to 'Tim'), 3478 * we will treat this as a special case in the 3479 * case-insensitive mode: as long as the source name 3480 * is an exact match, we will allow this to proceed as 3481 * a name-change request. 3482 */ 3483 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3484 (zfsvfs->z_case == ZFS_CASE_MIXED && 3485 flags & FIGNORECASE)) && 3486 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3487 &error) == 0) { 3488 /* 3489 * case preserving rename request, require exact 3490 * name matches 3491 */ 3492 zflg |= ZCIEXACT; 3493 zflg &= ~ZCILOOK; 3494 } 3495 } 3496 3497 /* 3498 * If the source and destination directories are the same, we should 3499 * grab the z_name_lock of that directory only once. 3500 */ 3501 if (sdzp == tdzp) { 3502 zflg |= ZHAVELOCK; 3503 rw_enter(&sdzp->z_name_lock, RW_READER); 3504 } 3505 3506 if (cmp < 0) { 3507 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3508 ZEXISTS | zflg, NULL, NULL); 3509 terr = zfs_dirent_lock(&tdl, 3510 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3511 } else { 3512 terr = zfs_dirent_lock(&tdl, 3513 tdzp, tnm, &tzp, zflg, NULL, NULL); 3514 serr = zfs_dirent_lock(&sdl, 3515 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3516 NULL, NULL); 3517 } 3518 3519 if (serr) { 3520 /* 3521 * Source entry invalid or not there. 3522 */ 3523 if (!terr) { 3524 zfs_dirent_unlock(tdl); 3525 if (tzp) 3526 VN_RELE(ZTOV(tzp)); 3527 } 3528 3529 if (sdzp == tdzp) 3530 rw_exit(&sdzp->z_name_lock); 3531 3532 if (strcmp(snm, "..") == 0) 3533 serr = SET_ERROR(EINVAL); 3534 ZFS_EXIT(zfsvfs); 3535 return (serr); 3536 } 3537 if (terr) { 3538 zfs_dirent_unlock(sdl); 3539 VN_RELE(ZTOV(szp)); 3540 3541 if (sdzp == tdzp) 3542 rw_exit(&sdzp->z_name_lock); 3543 3544 if (strcmp(tnm, "..") == 0) 3545 terr = SET_ERROR(EINVAL); 3546 ZFS_EXIT(zfsvfs); 3547 return (terr); 3548 } 3549 3550 /* 3551 * Must have write access at the source to remove the old entry 3552 * and write access at the target to create the new entry. 3553 * Note that if target and source are the same, this can be 3554 * done in a single check. 
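	 * zfs_zaccess_rename() below bundles all of these checks into a
	 * single call.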
3555 */ 3556 3557 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3558 goto out; 3559 3560 if (ZTOV(szp)->v_type == VDIR) { 3561 /* 3562 * Check to make sure rename is valid. 3563 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3564 */ 3565 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3566 goto out; 3567 } 3568 3569 /* 3570 * Does target exist? 3571 */ 3572 if (tzp) { 3573 /* 3574 * Source and target must be the same type. 3575 */ 3576 if (ZTOV(szp)->v_type == VDIR) { 3577 if (ZTOV(tzp)->v_type != VDIR) { 3578 error = SET_ERROR(ENOTDIR); 3579 goto out; 3580 } 3581 } else { 3582 if (ZTOV(tzp)->v_type == VDIR) { 3583 error = SET_ERROR(EISDIR); 3584 goto out; 3585 } 3586 } 3587 /* 3588 * POSIX dictates that when the source and target 3589 * entries refer to the same file object, rename 3590 * must do nothing and exit without error. 3591 */ 3592 if (szp->z_id == tzp->z_id) { 3593 error = 0; 3594 goto out; 3595 } 3596 } 3597 3598 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 3599 if (tzp) 3600 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3601 3602 /* 3603 * notify the target directory if it is not the same 3604 * as source directory. 3605 */ 3606 if (tdvp != sdvp) { 3607 vnevent_rename_dest_dir(tdvp, ct); 3608 } 3609 3610 tx = dmu_tx_create(zfsvfs->z_os); 3611 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3612 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3613 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3614 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3615 if (sdzp != tdzp) { 3616 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3617 zfs_sa_upgrade_txholds(tx, tdzp); 3618 } 3619 if (tzp) { 3620 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3621 zfs_sa_upgrade_txholds(tx, tzp); 3622 } 3623 3624 zfs_sa_upgrade_txholds(tx, szp); 3625 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3626 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 3627 if (error) { 3628 if (zl != NULL) 3629 zfs_rename_unlock(&zl); 3630 zfs_dirent_unlock(sdl); 3631 zfs_dirent_unlock(tdl); 3632 3633 if (sdzp == tdzp) 3634 rw_exit(&sdzp->z_name_lock); 3635 3636 VN_RELE(ZTOV(szp)); 3637 if (tzp) 3638 VN_RELE(ZTOV(tzp)); 3639 if (error == ERESTART) { 3640 waited = B_TRUE; 3641 dmu_tx_wait(tx); 3642 dmu_tx_abort(tx); 3643 goto top; 3644 } 3645 dmu_tx_abort(tx); 3646 ZFS_EXIT(zfsvfs); 3647 return (error); 3648 } 3649 3650 if (tzp) /* Attempt to remove the existing target */ 3651 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 3652 3653 if (error == 0) { 3654 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3655 if (error == 0) { 3656 szp->z_pflags |= ZFS_AV_MODIFIED; 3657 3658 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3659 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3660 ASSERT0(error); 3661 3662 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3663 if (error == 0) { 3664 zfs_log_rename(zilog, tx, TX_RENAME | 3665 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 3666 sdl->dl_name, tdzp, tdl->dl_name, szp); 3667 3668 /* 3669 * Update path information for the target vnode 3670 */ 3671 vn_renamepath(tdvp, ZTOV(szp), tnm, 3672 strlen(tnm)); 3673 } else { 3674 /* 3675 * At this point, we have successfully created 3676 * the target name, but have failed to remove 3677 * the source name. Since the create was done 3678 * with the ZRENAMING flag, there are 3679 * complications; for one, the link count is 3680 * wrong. The easiest way to deal with this 3681 * is to remove the newly created target, and 3682 * return the original error. 
This must 3683 * succeed; fortunately, it is very unlikely to 3684 * fail, since we just created it. 3685 */ 3686 VERIFY3U(zfs_link_destroy(tdl, szp, tx, 3687 ZRENAMING, NULL), ==, 0); 3688 } 3689 } 3690 } 3691 3692 dmu_tx_commit(tx); 3693 out: 3694 if (zl != NULL) 3695 zfs_rename_unlock(&zl); 3696 3697 zfs_dirent_unlock(sdl); 3698 zfs_dirent_unlock(tdl); 3699 3700 if (sdzp == tdzp) 3701 rw_exit(&sdzp->z_name_lock); 3702 3703 3704 VN_RELE(ZTOV(szp)); 3705 if (tzp) 3706 VN_RELE(ZTOV(tzp)); 3707 3708 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3709 zil_commit(zilog, 0); 3710 3711 ZFS_EXIT(zfsvfs); 3712 return (error); 3713 } 3714 3715 /* 3716 * Insert the indicated symbolic reference entry into the directory. 3717 * 3718 * IN: dvp - Directory to contain new symbolic link. 3719 * link - Name for new symlink entry. 3720 * vap - Attributes of new entry. 3721 * cr - credentials of caller. 3722 * ct - caller context 3723 * flags - case flags 3724 * 3725 * RETURN: 0 on success, error code on failure. 3726 * 3727 * Timestamps: 3728 * dvp - ctime|mtime updated 3729 */ 3730 /*ARGSUSED*/ 3731 static int 3732 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, 3733 caller_context_t *ct, int flags) 3734 { 3735 znode_t *zp, *dzp = VTOZ(dvp); 3736 zfs_dirlock_t *dl; 3737 dmu_tx_t *tx; 3738 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3739 zilog_t *zilog; 3740 uint64_t len = strlen(link); 3741 int error; 3742 int zflg = ZNEW; 3743 zfs_acl_ids_t acl_ids; 3744 boolean_t fuid_dirtied; 3745 uint64_t txtype = TX_SYMLINK; 3746 boolean_t waited = B_FALSE; 3747 3748 ASSERT(vap->va_type == VLNK); 3749 3750 ZFS_ENTER(zfsvfs); 3751 ZFS_VERIFY_ZP(dzp); 3752 zilog = zfsvfs->z_log; 3753 3754 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3755 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3756 ZFS_EXIT(zfsvfs); 3757 return (SET_ERROR(EILSEQ)); 3758 } 3759 if (flags & FIGNORECASE) 3760 zflg |= ZCILOOK; 3761 3762 if (len > MAXPATHLEN) { 3763 ZFS_EXIT(zfsvfs); 3764 return (SET_ERROR(ENAMETOOLONG)); 3765 } 3766 3767 if ((error = zfs_acl_ids_create(dzp, 0, 3768 vap, cr, NULL, &acl_ids)) != 0) { 3769 ZFS_EXIT(zfsvfs); 3770 return (error); 3771 } 3772 top: 3773 /* 3774 * Attempt to lock directory; fail if entry already exists. 3775 */ 3776 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3777 if (error) { 3778 zfs_acl_ids_free(&acl_ids); 3779 ZFS_EXIT(zfsvfs); 3780 return (error); 3781 } 3782 3783 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3784 zfs_acl_ids_free(&acl_ids); 3785 zfs_dirent_unlock(dl); 3786 ZFS_EXIT(zfsvfs); 3787 return (error); 3788 } 3789 3790 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 3791 zfs_acl_ids_free(&acl_ids); 3792 zfs_dirent_unlock(dl); 3793 ZFS_EXIT(zfsvfs); 3794 return (SET_ERROR(EDQUOT)); 3795 } 3796 tx = dmu_tx_create(zfsvfs->z_os); 3797 fuid_dirtied = zfsvfs->z_fuid_dirty; 3798 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3799 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3800 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3801 ZFS_SA_BASE_ATTR_SIZE + len); 3802 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3803 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3804 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3805 acl_ids.z_aclp->z_acl_bytes); 3806 } 3807 if (fuid_dirtied) 3808 zfs_fuid_txhold(zfsvfs, tx); 3809 error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * For version 4 ZPL datasets the symlink will be an SA attribute.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);
	mutex_exit(&zp->z_lock);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 * IN:	vp	- vnode of symbolic link.
 *	uio	- structure to contain the link path.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- structure containing the link path.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 * IN:	tdvp	- Directory to contain new entry.
 *	svp	- vnode of new entry.
 *	name	- name of new entry.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/*
	 * POSIX dictates that we return EPERM here.
3953 * Better choices include ENOTSUP or EISDIR. 3954 */ 3955 if (svp->v_type == VDIR) { 3956 ZFS_EXIT(zfsvfs); 3957 return (SET_ERROR(EPERM)); 3958 } 3959 3960 szp = VTOZ(svp); 3961 ZFS_VERIFY_ZP(szp); 3962 3963 /* 3964 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3965 * ctldir appear to have the same v_vfsp. 3966 */ 3967 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 3968 ZFS_EXIT(zfsvfs); 3969 return (SET_ERROR(EXDEV)); 3970 } 3971 3972 /* Prevent links to .zfs/shares files */ 3973 3974 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3975 &parent, sizeof (uint64_t))) != 0) { 3976 ZFS_EXIT(zfsvfs); 3977 return (error); 3978 } 3979 if (parent == zfsvfs->z_shares_dir) { 3980 ZFS_EXIT(zfsvfs); 3981 return (SET_ERROR(EPERM)); 3982 } 3983 3984 if (zfsvfs->z_utf8 && u8_validate(name, 3985 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3986 ZFS_EXIT(zfsvfs); 3987 return (SET_ERROR(EILSEQ)); 3988 } 3989 if (flags & FIGNORECASE) 3990 zf |= ZCILOOK; 3991 3992 /* 3993 * We do not support links between attributes and non-attributes 3994 * because of the potential security risk of creating links 3995 * into "normal" file space in order to circumvent restrictions 3996 * imposed in attribute space. 3997 */ 3998 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 3999 ZFS_EXIT(zfsvfs); 4000 return (SET_ERROR(EINVAL)); 4001 } 4002 4003 4004 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4005 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 4006 ZFS_EXIT(zfsvfs); 4007 return (SET_ERROR(EPERM)); 4008 } 4009 4010 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4011 ZFS_EXIT(zfsvfs); 4012 return (error); 4013 } 4014 4015 top: 4016 /* 4017 * Attempt to lock directory; fail if entry already exists. 4018 */ 4019 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4020 if (error) { 4021 ZFS_EXIT(zfsvfs); 4022 return (error); 4023 } 4024 4025 tx = dmu_tx_create(zfsvfs->z_os); 4026 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4027 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4028 zfs_sa_upgrade_txholds(tx, szp); 4029 zfs_sa_upgrade_txholds(tx, dzp); 4030 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4031 if (error) { 4032 zfs_dirent_unlock(dl); 4033 if (error == ERESTART) { 4034 waited = B_TRUE; 4035 dmu_tx_wait(tx); 4036 dmu_tx_abort(tx); 4037 goto top; 4038 } 4039 dmu_tx_abort(tx); 4040 ZFS_EXIT(zfsvfs); 4041 return (error); 4042 } 4043 4044 error = zfs_link_create(dl, szp, tx, 0); 4045 4046 if (error == 0) { 4047 uint64_t txtype = TX_LINK; 4048 if (flags & FIGNORECASE) 4049 txtype |= TX_CI; 4050 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4051 } 4052 4053 dmu_tx_commit(tx); 4054 4055 zfs_dirent_unlock(dl); 4056 4057 if (error == 0) { 4058 vnevent_link(svp, ct); 4059 } 4060 4061 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4062 zil_commit(zilog, 0); 4063 4064 ZFS_EXIT(zfsvfs); 4065 return (error); 4066 } 4067 4068 /* 4069 * zfs_null_putapage() is used when the file system has been force 4070 * unmounted. It just drops the pages. 4071 */ 4072 /* ARGSUSED */ 4073 static int 4074 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4075 size_t *lenp, int flags, cred_t *cr) 4076 { 4077 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4078 return (0); 4079 } 4080 4081 /* 4082 * Push a page out to disk, klustering if possible. 4083 * 4084 * IN: vp - file to push page to. 4085 * pp - page to push. 4086 * flags - additional flags. 4087 * cr - credentials of caller. 
4088 * 4089 * OUT: offp - start of range pushed. 4090 * lenp - len of range pushed. 4091 * 4092 * RETURN: 0 on success, error code on failure. 4093 * 4094 * NOTE: callers must have locked the page to be pushed. On 4095 * exit, the page (and all other pages in the kluster) must be 4096 * unlocked. 4097 */ 4098 /* ARGSUSED */ 4099 static int 4100 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4101 size_t *lenp, int flags, cred_t *cr) 4102 { 4103 znode_t *zp = VTOZ(vp); 4104 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4105 dmu_tx_t *tx; 4106 u_offset_t off, koff; 4107 size_t len, klen; 4108 int err; 4109 4110 off = pp->p_offset; 4111 len = PAGESIZE; 4112 /* 4113 * If our blocksize is bigger than the page size, try to kluster 4114 * multiple pages so that we write a full block (thus avoiding 4115 * a read-modify-write). 4116 */ 4117 if (off < zp->z_size && zp->z_blksz > PAGESIZE) { 4118 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 4119 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 4120 ASSERT(koff <= zp->z_size); 4121 if (koff + klen > zp->z_size) 4122 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); 4123 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 4124 } 4125 ASSERT3U(btop(len), ==, btopr(len)); 4126 4127 /* 4128 * Can't push pages past end-of-file. 4129 */ 4130 if (off >= zp->z_size) { 4131 /* ignore all pages */ 4132 err = 0; 4133 goto out; 4134 } else if (off + len > zp->z_size) { 4135 int npages = btopr(zp->z_size - off); 4136 page_t *trunc; 4137 4138 page_list_break(&pp, &trunc, npages); 4139 /* ignore pages past end of file */ 4140 if (trunc) 4141 pvn_write_done(trunc, flags); 4142 len = zp->z_size - off; 4143 } 4144 4145 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 4146 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 4147 err = SET_ERROR(EDQUOT); 4148 goto out; 4149 } 4150 top: 4151 tx = dmu_tx_create(zfsvfs->z_os); 4152 dmu_tx_hold_write(tx, zp->z_id, off, len); 4153 4154 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4155 zfs_sa_upgrade_txholds(tx, zp); 4156 err = dmu_tx_assign(tx, TXG_NOWAIT); 4157 if (err != 0) { 4158 if (err == ERESTART) { 4159 dmu_tx_wait(tx); 4160 dmu_tx_abort(tx); 4161 goto top; 4162 } 4163 dmu_tx_abort(tx); 4164 goto out; 4165 } 4166 4167 if (zp->z_blksz <= PAGESIZE) { 4168 caddr_t va = zfs_map_page(pp, S_READ); 4169 ASSERT3U(len, <=, PAGESIZE); 4170 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 4171 zfs_unmap_page(pp, va); 4172 } else { 4173 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 4174 } 4175 4176 if (err == 0) { 4177 uint64_t mtime[2], ctime[2]; 4178 sa_bulk_attr_t bulk[3]; 4179 int count = 0; 4180 4181 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4182 &mtime, 16); 4183 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4184 &ctime, 16); 4185 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4186 &zp->z_pflags, 8); 4187 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4188 B_TRUE); 4189 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4190 } 4191 dmu_tx_commit(tx); 4192 4193 out: 4194 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 4195 if (offp) 4196 *offp = off; 4197 if (lenp) 4198 *lenp = len; 4199 4200 return (err); 4201 } 4202 4203 /* 4204 * Copy the portion of the file indicated from pages into the file. 4205 * The pages are stored in a page list attached to the files vnode. 4206 * 4207 * IN: vp - vnode of file to push page data to. 4208 * off - position in file to put data. 4209 * len - amount of data to write. 
 *		flags	- flags to control the operation.
 *		cr	- credentials of caller.
 *		ct	- caller context.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * There's nothing to do if no data is cached.
	 */
	if (!vn_has_cached_data(vp)) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggressive locking, which can
	 * impact simultaneous read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
		 */
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);

	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			if (err)
				error = err;
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		vp->v_count = 0;
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.
If this fails 4342 * we will get kicked out later in zfs_zinactive(). 4343 */ 4344 if (vn_has_cached_data(vp)) { 4345 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 4346 cr); 4347 } 4348 4349 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4350 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4351 4352 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4353 zfs_sa_upgrade_txholds(tx, zp); 4354 error = dmu_tx_assign(tx, TXG_WAIT); 4355 if (error) { 4356 dmu_tx_abort(tx); 4357 } else { 4358 mutex_enter(&zp->z_lock); 4359 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4360 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4361 zp->z_atime_dirty = 0; 4362 mutex_exit(&zp->z_lock); 4363 dmu_tx_commit(tx); 4364 } 4365 } 4366 4367 zfs_zinactive(zp); 4368 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4369 } 4370 4371 /* 4372 * Bounds-check the seek operation. 4373 * 4374 * IN: vp - vnode seeking within 4375 * ooff - old file offset 4376 * noffp - pointer to new file offset 4377 * ct - caller context 4378 * 4379 * RETURN: 0 on success, EINVAL if new offset invalid. 4380 */ 4381 /* ARGSUSED */ 4382 static int 4383 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 4384 caller_context_t *ct) 4385 { 4386 if (vp->v_type == VDIR) 4387 return (0); 4388 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4389 } 4390 4391 /* 4392 * Pre-filter the generic locking function to trap attempts to place 4393 * a mandatory lock on a memory mapped file. 4394 */ 4395 static int 4396 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 4397 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 4398 { 4399 znode_t *zp = VTOZ(vp); 4400 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4401 4402 ZFS_ENTER(zfsvfs); 4403 ZFS_VERIFY_ZP(zp); 4404 4405 /* 4406 * We are following the UFS semantics with respect to mapcnt 4407 * here: If we see that the file is mapped already, then we will 4408 * return an error, but we don't worry about races between this 4409 * function and zfs_map(). 4410 */ 4411 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { 4412 ZFS_EXIT(zfsvfs); 4413 return (SET_ERROR(EAGAIN)); 4414 } 4415 ZFS_EXIT(zfsvfs); 4416 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4417 } 4418 4419 /* 4420 * If we can't find a page in the cache, we will create a new page 4421 * and fill it with file data. For efficiency, we may try to fill 4422 * multiple pages at once (klustering) to fill up the supplied page 4423 * list. Note that the pages to be filled are held with an exclusive 4424 * lock to prevent access by other threads while they are being filled. 4425 */ 4426 static int 4427 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 4428 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 4429 { 4430 znode_t *zp = VTOZ(vp); 4431 page_t *pp, *cur_pp; 4432 objset_t *os = zp->z_zfsvfs->z_os; 4433 u_offset_t io_off, total; 4434 size_t io_len; 4435 int err; 4436 4437 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 4438 /* 4439 * We only have a single page, don't bother klustering 4440 */ 4441 io_off = off; 4442 io_len = PAGESIZE; 4443 pp = page_create_va(vp, io_off, io_len, 4444 PG_EXCL | PG_WAIT, seg, addr); 4445 } else { 4446 /* 4447 * Try to find enough pages to fill the page list 4448 */ 4449 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4450 &io_len, off, plsz, 0); 4451 } 4452 if (pp == NULL) { 4453 /* 4454 * The page already exists, nothing to do here. 4455 */ 4456 *pl = NULL; 4457 return (0); 4458 } 4459 4460 /* 4461 * Fill the pages in the kluster. 
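 * Each page is mapped and read from the DMU one PAGESIZE chunk at a time; an error on any page throws away the entire kluster via pvn_read_done(B_ERROR).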
4462 */ 4463 cur_pp = pp; 4464 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4465 caddr_t va; 4466 4467 ASSERT3U(io_off, ==, cur_pp->p_offset); 4468 va = zfs_map_page(cur_pp, S_WRITE); 4469 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4470 DMU_READ_PREFETCH); 4471 zfs_unmap_page(cur_pp, va); 4472 if (err) { 4473 /* On error, toss the entire kluster */ 4474 pvn_read_done(pp, B_ERROR); 4475 /* convert checksum errors into IO errors */ 4476 if (err == ECKSUM) 4477 err = SET_ERROR(EIO); 4478 return (err); 4479 } 4480 cur_pp = cur_pp->p_next; 4481 } 4482 4483 /* 4484 * Fill in the page list array from the kluster starting 4485 * from the desired offset `off'. 4486 * NOTE: the page list will always be null terminated. 4487 */ 4488 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4489 ASSERT(pl == NULL || (*pl)->p_offset == off); 4490 4491 return (0); 4492 } 4493 4494 /* 4495 * Return pointers to the pages for the file region [off, off + len] 4496 * in the pl array. If plsz is greater than len, this function may 4497 * also return page pointers from after the specified region 4498 * (i.e. the region [off, off + plsz]). These additional pages are 4499 * only returned if they are already in the cache, or were created as 4500 * part of a klustered read. 4501 * 4502 * IN: vp - vnode of file to get data from. 4503 * off - position in file to get data from. 4504 * len - amount of data to retrieve. 4505 * plsz - length of provided page list. 4506 * seg - segment to obtain pages for. 4507 * addr - virtual address of fault. 4508 * rw - mode of created pages. 4509 * cr - credentials of caller. 4510 * ct - caller context. 4511 * 4512 * OUT: protp - protection mode of created pages. 4513 * pl - list of pages created. 4514 * 4515 * RETURN: 0 on success, error code on failure. 4516 * 4517 * Timestamps: 4518 * vp - atime updated 4519 */ 4520 /* ARGSUSED */ 4521 static int 4522 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4523 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4524 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4525 { 4526 znode_t *zp = VTOZ(vp); 4527 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4528 page_t **pl0 = pl; 4529 int err = 0; 4530 4531 /* we do our own caching, faultahead is unnecessary */ 4532 if (pl == NULL) 4533 return (0); 4534 else if (len > plsz) 4535 len = plsz; 4536 else 4537 len = P2ROUNDUP(len, PAGESIZE); 4538 ASSERT(plsz >= len); 4539 4540 ZFS_ENTER(zfsvfs); 4541 ZFS_VERIFY_ZP(zp); 4542 4543 if (protp) 4544 *protp = PROT_ALL; 4545 4546 /* 4547 * Loop through the requested range [off, off + len) looking 4548 * for pages. If we don't find a page, we will need to create 4549 * a new page and fill it with data from the file. 4550 */ 4551 while (len > 0) { 4552 if (*pl = page_lookup(vp, off, SE_SHARED)) 4553 *(pl+1) = NULL; 4554 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 4555 goto out; 4556 while (*pl) { 4557 ASSERT3U((*pl)->p_offset, ==, off); 4558 off += PAGESIZE; 4559 addr += PAGESIZE; 4560 if (len > 0) { 4561 ASSERT3U(len, >=, PAGESIZE); 4562 len -= PAGESIZE; 4563 } 4564 ASSERT3U(plsz, >=, PAGESIZE); 4565 plsz -= PAGESIZE; 4566 pl++; 4567 } 4568 } 4569 4570 /* 4571 * Fill out the page array with any pages already in the cache. 4572 */ 4573 while (plsz > 0 && 4574 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 4575 off += PAGESIZE; 4576 plsz -= PAGESIZE; 4577 } 4578 out: 4579 if (err) { 4580 /* 4581 * Release any pages we have previously locked. 
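 * Walking pl back toward pl0 unlocks every page that page_lookup() or zfs_fillpage() handed back before the error.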
4582 */ 4583 while (pl > pl0) 4584 page_unlock(*--pl); 4585 } else { 4586 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4587 } 4588 4589 *pl = NULL; 4590 4591 ZFS_EXIT(zfsvfs); 4592 return (err); 4593 } 4594 4595 /* 4596 * Request a memory map for a section of a file. This code interacts 4597 * with common code and the VM system as follows: 4598 * 4599 * - common code calls mmap(), which ends up in smmap_common() 4600 * - this calls VOP_MAP(), which takes you into (say) zfs 4601 * - zfs_map() calls as_map(), passing segvn_create() as the callback 4602 * - segvn_create() creates the new segment and calls VOP_ADDMAP() 4603 * - zfs_addmap() updates z_mapcnt 4604 */ 4605 /*ARGSUSED*/ 4606 static int 4607 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4608 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4609 caller_context_t *ct) 4610 { 4611 znode_t *zp = VTOZ(vp); 4612 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4613 segvn_crargs_t vn_a; 4614 int error; 4615 4616 ZFS_ENTER(zfsvfs); 4617 ZFS_VERIFY_ZP(zp); 4618 4619 if ((prot & PROT_WRITE) && (zp->z_pflags & 4620 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4621 ZFS_EXIT(zfsvfs); 4622 return (SET_ERROR(EPERM)); 4623 } 4624 4625 if ((prot & (PROT_READ | PROT_EXEC)) && 4626 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4627 ZFS_EXIT(zfsvfs); 4628 return (SET_ERROR(EACCES)); 4629 } 4630 4631 if (vp->v_flag & VNOMAP) { 4632 ZFS_EXIT(zfsvfs); 4633 return (SET_ERROR(ENOSYS)); 4634 } 4635 4636 if (off < 0 || len > MAXOFFSET_T - off) { 4637 ZFS_EXIT(zfsvfs); 4638 return (SET_ERROR(ENXIO)); 4639 } 4640 4641 if (vp->v_type != VREG) { 4642 ZFS_EXIT(zfsvfs); 4643 return (SET_ERROR(ENODEV)); 4644 } 4645 4646 /* 4647 * If the file is locked, disallow mapping. 4648 */ 4649 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { 4650 ZFS_EXIT(zfsvfs); 4651 return (SET_ERROR(EAGAIN)); 4652 } 4653 4654 as_rangelock(as); 4655 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4656 if (error != 0) { 4657 as_rangeunlock(as); 4658 ZFS_EXIT(zfsvfs); 4659 return (error); 4660 } 4661 4662 vn_a.vp = vp; 4663 vn_a.offset = (u_offset_t)off; 4664 vn_a.type = flags & MAP_TYPE; 4665 vn_a.prot = prot; 4666 vn_a.maxprot = maxprot; 4667 vn_a.cred = cr; 4668 vn_a.amp = NULL; 4669 vn_a.flags = flags & ~MAP_TYPE; 4670 vn_a.szc = 0; 4671 vn_a.lgrp_mem_policy_flags = 0; 4672 4673 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4674 4675 as_rangeunlock(as); 4676 ZFS_EXIT(zfsvfs); 4677 return (error); 4678 } 4679 4680 /* ARGSUSED */ 4681 static int 4682 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4683 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4684 caller_context_t *ct) 4685 { 4686 uint64_t pages = btopr(len); 4687 4688 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 4689 return (0); 4690 } 4691 4692 /* 4693 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 4694 * more accurate mtime for the associated file. Since we don't have a way of 4695 * detecting when the data was actually modified, we have to resort to 4696 * heuristics. If an explicit msync() is done, then we mark the mtime when the 4697 * last page is pushed. The problem occurs when the msync() call is omitted, 4698 * which is by far the most common case: 4699 * 4700 * open() 4701 * mmap() 4702 * <modify memory> 4703 * munmap() 4704 * close() 4705 * <time lapse> 4706 * putpage() via fsflush 4707 * 4708 * If we wait for fsflush to come along, we can have a modification time that 4709 * is some arbitrary point in the future.
In order to prevent this in the 4710 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 4711 * torn down. 4712 */ 4713 /* ARGSUSED */ 4714 static int 4715 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4716 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4717 caller_context_t *ct) 4718 { 4719 uint64_t pages = btopr(len); 4720 4721 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 4722 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 4723 4724 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 4725 vn_has_cached_data(vp)) 4726 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 4727 4728 return (0); 4729 } 4730 4731 /* 4732 * Free or allocate space in a file. Currently, this function only 4733 * supports the `F_FREESP' command. However, this command is somewhat 4734 * misnamed, as its functionality includes the ability to allocate as 4735 * well as free space. 4736 * 4737 * IN: vp - vnode of file to free data in. 4738 * cmd - action to take (only F_FREESP supported). 4739 * bfp - section of file to free/alloc. 4740 * flag - current file open mode flags. 4741 * offset - current file offset. 4742 * cr - credentials of caller [UNUSED]. 4743 * ct - caller context. 4744 * 4745 * RETURN: 0 on success, error code on failure. 4746 * 4747 * Timestamps: 4748 * vp - ctime|mtime updated 4749 */ 4750 /* ARGSUSED */ 4751 static int 4752 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 4753 offset_t offset, cred_t *cr, caller_context_t *ct) 4754 { 4755 znode_t *zp = VTOZ(vp); 4756 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4757 uint64_t off, len; 4758 int error; 4759 4760 ZFS_ENTER(zfsvfs); 4761 ZFS_VERIFY_ZP(zp); 4762 4763 if (cmd != F_FREESP) { 4764 ZFS_EXIT(zfsvfs); 4765 return (SET_ERROR(EINVAL)); 4766 } 4767 4768 if (error = convoff(vp, bfp, 0, offset)) { 4769 ZFS_EXIT(zfsvfs); 4770 return (error); 4771 } 4772 4773 if (bfp->l_len < 0) { 4774 ZFS_EXIT(zfsvfs); 4775 return (SET_ERROR(EINVAL)); 4776 } 4777 4778 off = bfp->l_start; 4779 len = bfp->l_len; /* 0 means from off to end of file */ 4780 4781 error = zfs_freesp(zp, off, len, flag, TRUE); 4782 4783 ZFS_EXIT(zfsvfs); 4784 return (error); 4785 } 4786 4787 /*ARGSUSED*/ 4788 static int 4789 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4790 { 4791 znode_t *zp = VTOZ(vp); 4792 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4793 uint32_t gen; 4794 uint64_t gen64; 4795 uint64_t object = zp->z_id; 4796 zfid_short_t *zfid; 4797 int size, i, error; 4798 4799 ZFS_ENTER(zfsvfs); 4800 ZFS_VERIFY_ZP(zp); 4801 4802 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4803 &gen64, sizeof (uint64_t))) != 0) { 4804 ZFS_EXIT(zfsvfs); 4805 return (error); 4806 } 4807 4808 gen = (uint32_t)gen64; 4809 4810 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4811 if (fidp->fid_len < size) { 4812 fidp->fid_len = size; 4813 ZFS_EXIT(zfsvfs); 4814 return (SET_ERROR(ENOSPC)); 4815 } 4816 4817 zfid = (zfid_short_t *)fidp; 4818 4819 zfid->zf_len = size; 4820 4821 for (i = 0; i < sizeof (zfid->zf_object); i++) 4822 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4823 4824 /* Must have a non-zero generation number to distinguish from .zfs */ 4825 if (gen == 0) 4826 gen = 1; 4827 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4828 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4829 4830 if (size == LONG_FID_LEN) { 4831 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4832 zfid_long_t *zlfid; 4833 4834 zlfid = (zfid_long_t *)fidp; 4835 4836 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4837 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4838 4839 /* XXX - this should be the generation number for the objset */ 4840 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4841 zlfid->zf_setgen[i] = 0; 4842 } 4843 4844 ZFS_EXIT(zfsvfs); 4845 return (0); 4846 } 4847 4848 static int 4849 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4850 caller_context_t *ct) 4851 { 4852 znode_t *zp, *xzp; 4853 zfsvfs_t *zfsvfs; 4854 zfs_dirlock_t *dl; 4855 int error; 4856 4857 switch (cmd) { 4858 case _PC_LINK_MAX: 4859 *valp = ULONG_MAX; 4860 return (0); 4861 4862 case _PC_FILESIZEBITS: 4863 *valp = 64; 4864 return (0); 4865 4866 case _PC_XATTR_EXISTS: 4867 zp = VTOZ(vp); 4868 zfsvfs = zp->z_zfsvfs; 4869 ZFS_ENTER(zfsvfs); 4870 ZFS_VERIFY_ZP(zp); 4871 *valp = 0; 4872 error = zfs_dirent_lock(&dl, zp, "", &xzp, 4873 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 4874 if (error == 0) { 4875 zfs_dirent_unlock(dl); 4876 if (!zfs_dirempty(xzp)) 4877 *valp = 1; 4878 VN_RELE(ZTOV(xzp)); 4879 } else if (error == ENOENT) { 4880 /* 4881 * If there aren't extended attributes, it's the 4882 * same as having zero of them. 4883 */ 4884 error = 0; 4885 } 4886 ZFS_EXIT(zfsvfs); 4887 return (error); 4888 4889 case _PC_SATTR_ENABLED: 4890 case _PC_SATTR_EXISTS: 4891 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4892 (vp->v_type == VREG || vp->v_type == VDIR); 4893 return (0); 4894 4895 case _PC_ACCESS_FILTERING: 4896 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 4897 vp->v_type == VDIR; 4898 return (0); 4899 4900 case _PC_ACL_ENABLED: 4901 *valp = _ACL_ACE_ENABLED; 4902 return (0); 4903 4904 case _PC_MIN_HOLE_SIZE: 4905 *valp = (ulong_t)SPA_MINBLOCKSIZE; 4906 return (0); 4907 4908 case _PC_TIMESTAMP_RESOLUTION: 4909 /* nanosecond timestamp resolution */ 4910 *valp = 1L; 4911 return (0); 4912 4913 default: 4914 return (fs_pathconf(vp, cmd, valp, cr, ct)); 4915 } 4916 } 4917 4918 /*ARGSUSED*/ 4919 static int 4920 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4921 caller_context_t *ct) 4922 { 4923 znode_t *zp = VTOZ(vp); 4924 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4925 int error; 4926 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4927 4928 ZFS_ENTER(zfsvfs); 4929 ZFS_VERIFY_ZP(zp); 4930 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4931 ZFS_EXIT(zfsvfs); 4932 4933 return (error); 4934 } 4935 4936 /*ARGSUSED*/ 4937 static int 4938 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4939 caller_context_t *ct) 4940 { 4941 znode_t *zp = VTOZ(vp); 4942 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4943 int error; 4944 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 4945 zilog_t *zilog = zfsvfs->z_log; 4946 4947 ZFS_ENTER(zfsvfs); 4948 ZFS_VERIFY_ZP(zp); 4949 4950 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4951 4952 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4953 zil_commit(zilog, 0); 4954 4955 ZFS_EXIT(zfsvfs); 4956 return (error); 4957 } 4958 4959 /* 4960 * The smallest read for which we will consider loaning out an arcbuf. 4961 * This must be a power of 2. 4962 */ 4963 int zcr_blksz_min = (1 << 10); /* 1K */ 4964 /* 4965 * If set to less than the file block size, allow loaning out of an 4966 * arcbuf for a partial block read. This must be a power of 2. 4967 */ 4968 int zcr_blksz_max = (1 << 17); /* 128K */ 4969 4970 /*ARGSUSED*/ 4971 static int 4972 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, 4973 caller_context_t *ct) 4974 { 4975 znode_t *zp = VTOZ(vp); 4976 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4977 int max_blksz = zfsvfs->z_max_blksz; 4978 uio_t *uio = &xuio->xu_uio; 4979 ssize_t size = uio->uio_resid; 4980 offset_t offset = uio->uio_loffset; 4981 int blksz; 4982 int fullblk, i; 4983 arc_buf_t *abuf; 4984 ssize_t maxsize; 4985 int preamble, postamble; 4986 4987 if (xuio->xu_type != UIOTYPE_ZEROCOPY) 4988 return (SET_ERROR(EINVAL)); 4989 4990 ZFS_ENTER(zfsvfs); 4991 ZFS_VERIFY_ZP(zp); 4992 switch (ioflag) { 4993 case UIO_WRITE: 4994 /* 4995 * Loan out an arc_buf for write if the write size is bigger than 4996 * max_blksz, and the file's block size is also max_blksz. 4997 */ 4998 blksz = max_blksz; 4999 if (size < blksz || zp->z_blksz != blksz) { 5000 ZFS_EXIT(zfsvfs); 5001 return (SET_ERROR(EINVAL)); 5002 } 5003 /* 5004 * Caller requests buffers for write before knowing where the 5005 * write offset might be (e.g. NFS TCP write). 5006 */ 5007 if (offset == -1) { 5008 preamble = 0; 5009 } else { 5010 preamble = P2PHASE(offset, blksz); 5011 if (preamble) { 5012 preamble = blksz - preamble; 5013 size -= preamble; 5014 } 5015 } 5016 5017 postamble = P2PHASE(size, blksz); 5018 size -= postamble; 5019 5020 fullblk = size / blksz; 5021 (void) dmu_xuio_init(xuio, 5022 (preamble != 0) + fullblk + (postamble != 0)); 5023 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5024 int, postamble, int, 5025 (preamble != 0) + fullblk + (postamble != 0)); 5026 5027 /* 5028 * Have to fix iov base/len for partial buffers. They 5029 * currently represent full arc_buf's. E.g. (hypothetical * numbers) blksz = 128K, offset = 10K and size = 300K yield * preamble = 118K, fullblk = 1 and postamble = 54K, i.e. three * loaned buffers. 5030 */ 5031 if (preamble) { 5032 /* data begins in the middle of the arc_buf */ 5033 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5034 blksz); 5035 ASSERT(abuf); 5036 (void) dmu_xuio_add(xuio, abuf, 5037 blksz - preamble, preamble); 5038 } 5039 5040 for (i = 0; i < fullblk; i++) { 5041 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5042 blksz); 5043 ASSERT(abuf); 5044 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5045 } 5046 5047 if (postamble) { 5048 /* data ends in the middle of the arc_buf */ 5049 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5050 blksz); 5051 ASSERT(abuf); 5052 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5053 } 5054 break; 5055 case UIO_READ: 5056 /* 5057 * Loan out an arc_buf for read if the read size is larger than 5058 * the current file block size. Block alignment is not 5059 * considered. A partial arc_buf will be loaned out for the read.
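 * The loaned block size is clamped to [zcr_blksz_min, zcr_blksz_max] below and may not exceed the dataset's maximum block size.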
5060 */ 5061 blksz = zp->z_blksz; 5062 if (blksz < zcr_blksz_min) 5063 blksz = zcr_blksz_min; 5064 if (blksz > zcr_blksz_max) 5065 blksz = zcr_blksz_max; 5066 /* avoid potential complexity of dealing with it */ 5067 if (blksz > max_blksz) { 5068 ZFS_EXIT(zfsvfs); 5069 return (SET_ERROR(EINVAL)); 5070 } 5071 5072 maxsize = zp->z_size - uio->uio_loffset; 5073 if (size > maxsize) 5074 size = maxsize; 5075 5076 if (size < blksz || vn_has_cached_data(vp)) { 5077 ZFS_EXIT(zfsvfs); 5078 return (SET_ERROR(EINVAL)); 5079 } 5080 break; 5081 default: 5082 ZFS_EXIT(zfsvfs); 5083 return (SET_ERROR(EINVAL)); 5084 } 5085 5086 uio->uio_extflg = UIO_XUIO; 5087 XUIO_XUZC_RW(xuio) = ioflag; 5088 ZFS_EXIT(zfsvfs); 5089 return (0); 5090 } 5091 5092 /*ARGSUSED*/ 5093 static int 5094 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5095 { 5096 int i; 5097 arc_buf_t *abuf; 5098 int ioflag = XUIO_XUZC_RW(xuio); 5099 5100 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5101 5102 i = dmu_xuio_cnt(xuio); 5103 while (i-- > 0) { 5104 abuf = dmu_xuio_arcbuf(xuio, i); 5105 /* 5106 * if abuf == NULL, it must be a write buffer 5107 * that has been returned in zfs_write(). 5108 */ 5109 if (abuf) 5110 dmu_return_arcbuf(abuf); 5111 ASSERT(abuf || ioflag == UIO_WRITE); 5112 } 5113 5114 dmu_xuio_fini(xuio); 5115 return (0); 5116 } 5117 5118 /* 5119 * Predeclare these here so that the compiler assumes that 5120 * this is an "old style" function declaration that does 5121 * not include arguments => we won't get type mismatch errors 5122 * in the initializations that follow. 5123 */ 5124 static int zfs_inval(); 5125 static int zfs_isdir(); 5126 5127 static int 5128 zfs_inval() 5129 { 5130 return (SET_ERROR(EINVAL)); 5131 } 5132 5133 static int 5134 zfs_isdir() 5135 { 5136 return (SET_ERROR(EISDIR)); 5137 } 5138 /* 5139 * Directory vnode operations template 5140 */ 5141 vnodeops_t *zfs_dvnodeops; 5142 const fs_operation_def_t zfs_dvnodeops_template[] = { 5143 VOPNAME_OPEN, { .vop_open = zfs_open }, 5144 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5145 VOPNAME_READ, { .error = zfs_isdir }, 5146 VOPNAME_WRITE, { .error = zfs_isdir }, 5147 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5148 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5149 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5150 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5151 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5152 VOPNAME_CREATE, { .vop_create = zfs_create }, 5153 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5154 VOPNAME_LINK, { .vop_link = zfs_link }, 5155 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5156 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 5157 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5158 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5159 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 5160 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5161 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5162 VOPNAME_FID, { .vop_fid = zfs_fid }, 5163 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5164 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5165 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5166 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5167 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5168 NULL, NULL 5169 }; 5170 5171 /* 5172 * Regular file vnode operations template 5173 */ 5174 vnodeops_t *zfs_fvnodeops; 5175 const fs_operation_def_t zfs_fvnodeops_template[] = { 5176 VOPNAME_OPEN, { .vop_open = zfs_open }, 5177 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5178 VOPNAME_READ, { .vop_read = 
zfs_read }, 5179 VOPNAME_WRITE, { .vop_write = zfs_write }, 5180 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5181 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5182 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5183 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5184 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5185 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5186 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5187 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5188 VOPNAME_FID, { .vop_fid = zfs_fid }, 5189 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5190 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 5191 VOPNAME_SPACE, { .vop_space = zfs_space }, 5192 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 5193 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 5194 VOPNAME_MAP, { .vop_map = zfs_map }, 5195 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 5196 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 5197 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5198 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5199 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5200 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5201 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, 5202 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, 5203 NULL, NULL 5204 }; 5205 5206 /* 5207 * Symbolic link vnode operations template 5208 */ 5209 vnodeops_t *zfs_symvnodeops; 5210 const fs_operation_def_t zfs_symvnodeops_template[] = { 5211 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5212 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5213 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5214 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5215 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 5216 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5217 VOPNAME_FID, { .vop_fid = zfs_fid }, 5218 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5219 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5220 NULL, NULL 5221 }; 5222 5223 /* 5224 * special share hidden files vnode operations template 5225 */ 5226 vnodeops_t *zfs_sharevnodeops; 5227 const fs_operation_def_t zfs_sharevnodeops_template[] = { 5228 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5229 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5230 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5231 VOPNAME_FID, { .vop_fid = zfs_fid }, 5232 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5233 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5234 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5235 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5236 NULL, NULL 5237 }; 5238 5239 /* 5240 * Extended attribute directory vnode operations template 5241 * 5242 * This template is identical to the directory vnodes 5243 * operation template except for restricted operations: 5244 * VOP_MKDIR() 5245 * VOP_SYMLINK() 5246 * 5247 * Note that there are other restrictions embedded in: 5248 * zfs_create() - restrict type to VREG 5249 * zfs_link() - no links into/out of attribute space 5250 * zfs_rename() - no moves into/out of attribute space 5251 */ 5252 vnodeops_t *zfs_xdvnodeops; 5253 const fs_operation_def_t zfs_xdvnodeops_template[] = { 5254 VOPNAME_OPEN, { .vop_open = zfs_open }, 5255 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5256 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5257 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5258 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5259 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5260 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5261 
VOPNAME_CREATE, { .vop_create = zfs_create }, 5262 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5263 VOPNAME_LINK, { .vop_link = zfs_link }, 5264 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5265 VOPNAME_MKDIR, { .error = zfs_inval }, 5266 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5267 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5268 VOPNAME_SYMLINK, { .error = zfs_inval }, 5269 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5270 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5271 VOPNAME_FID, { .vop_fid = zfs_fid }, 5272 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5273 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5274 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5275 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5276 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5277 NULL, NULL 5278 }; 5279 5280 /* 5281 * Error vnode operations template 5282 */ 5283 vnodeops_t *zfs_evnodeops; 5284 const fs_operation_def_t zfs_evnodeops_template[] = { 5285 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5286 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5287 NULL, NULL 5288 }; 5289