1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 /* Portions Copyright 2007 Jeremy Teo */ 28 /* Portions Copyright 2010 Robert Milkowski */ 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/time.h> 33 #include <sys/systm.h> 34 #include <sys/sysmacros.h> 35 #include <sys/resource.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/file.h> 40 #include <sys/stat.h> 41 #include <sys/kmem.h> 42 #include <sys/taskq.h> 43 #include <sys/uio.h> 44 #include <sys/vmsystm.h> 45 #include <sys/atomic.h> 46 #include <sys/vm.h> 47 #include <vm/seg_vn.h> 48 #include <vm/pvn.h> 49 #include <vm/as.h> 50 #include <vm/kpm.h> 51 #include <vm/seg_kpm.h> 52 #include <sys/mman.h> 53 #include <sys/pathname.h> 54 #include <sys/cmn_err.h> 55 #include <sys/errno.h> 56 #include <sys/unistd.h> 57 #include <sys/zfs_dir.h> 58 #include <sys/zfs_acl.h> 59 #include <sys/zfs_ioctl.h> 60 #include <sys/fs/zfs.h> 61 #include <sys/dmu.h> 62 #include <sys/dmu_objset.h> 63 #include <sys/spa.h> 64 #include <sys/txg.h> 65 #include <sys/dbuf.h> 66 #include <sys/zap.h> 67 #include <sys/sa.h> 68 #include <sys/dirent.h> 69 #include <sys/policy.h> 70 #include <sys/sunddi.h> 71 #include <sys/filio.h> 72 #include <sys/sid.h> 73 #include "fs/fs_subr.h" 74 #include <sys/zfs_ctldir.h> 75 #include <sys/zfs_fuid.h> 76 #include <sys/zfs_sa.h> 77 #include <sys/dnlc.h> 78 #include <sys/zfs_rlock.h> 79 #include <sys/extdirent.h> 80 #include <sys/kidmap.h> 81 #include <sys/cred.h> 82 #include <sys/attr.h> 83 #include <sys/zfs_events.h> 84 #include <sys/fs/zev.h> 85 86 /* 87 * Programming rules. 88 * 89 * Each vnode op performs some logical unit of work. To do this, the ZPL must 90 * properly lock its in-core state, create a DMU transaction, do the work, 91 * record this work in the intent log (ZIL), commit the DMU transaction, 92 * and wait for the intent log to commit if it is a synchronous operation. 93 * Moreover, the vnode ops must work in both normal and log replay context. 94 * The ordering of events is important to avoid deadlocks and references 95 * to freed memory. The example below illustrates the following Big Rules: 96 * 97 * (1) A check must be made in each zfs thread for a mounted file system. 98 * This is done avoiding races using ZFS_ENTER(zfsvfs). 99 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 100 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 101 * can return EIO from the calling function. 102 * 103 * (2) VN_RELE() should always be the last thing except for zil_commit() 104 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 105 * First, if it's the last reference, the vnode/znode 106 * can be freed, so the zp may point to freed memory. Second, the last 107 * reference will call zfs_zinactive(), which may induce a lot of work -- 108 * pushing cached pages (which acquires range locks) and syncing out 109 * cached atime changes. Third, zfs_zinactive() may require a new tx, 110 * which could deadlock the system if you were already holding one. 111 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 112 * 113 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 114 * as they can span dmu_tx_assign() calls. 115 * 116 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 117 * dmu_tx_assign(). This is critical because we don't want to block 118 * while holding locks. 119 * 120 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 121 * reduces lock contention and CPU usage when we must wait (note that if 122 * throughput is constrained by the storage, nearly every transaction 123 * must wait). 124 * 125 * Note, in particular, that if a lock is sometimes acquired before 126 * the tx assigns, and sometimes after (e.g. z_lock), then failing 127 * to use a non-blocking assign can deadlock the system. The scenario: 128 * 129 * Thread A has grabbed a lock before calling dmu_tx_assign(). 130 * Thread B is in an already-assigned tx, and blocks for this lock. 131 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 132 * forever, because the previous txg can't quiesce until B's tx commits. 133 * 134 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 135 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 136 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, 137 * to indicate that this operation has already called dmu_tx_wait(). 138 * This will ensure that we don't retry forever, waiting a short bit 139 * each time. 140 * 141 * (5) If the operation succeeded, generate the intent log entry for it 142 * before dropping locks. This ensures that the ordering of events 143 * in the intent log matches the order in which they actually occurred. 144 * During ZIL replay the zfs_log_* functions will update the sequence 145 * number to indicate the zil transaction has replayed. 146 * 147 * (6) At the end of each vnode op, the DMU tx must always commit, 148 * regardless of whether there were any errors. 149 * 150 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 151 * to ensure that synchronous semantics are provided when necessary. 152 * 153 * In general, this is how things should be ordered in each vnode op: 154 * 155 * ZFS_ENTER(zfsvfs); // exit if unmounted 156 * top: 157 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 158 * rw_enter(...); // grab any other locks you need 159 * tx = dmu_tx_create(...); // get DMU tx 160 * dmu_tx_hold_*(); // hold each object you might modify 161 * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 162 * if (error) { 163 * rw_exit(...); // drop locks 164 * zfs_dirent_unlock(dl); // unlock directory entry 165 * VN_RELE(...); // release held vnodes 166 * if (error == ERESTART) { 167 * waited = B_TRUE; 168 * dmu_tx_wait(tx); 169 * dmu_tx_abort(tx); 170 * goto top; 171 * } 172 * dmu_tx_abort(tx); // abort DMU tx 173 * ZFS_EXIT(zfsvfs); // finished in zfs 174 * return (error); // really out of space 175 * } 176 * error = do_real_work(); // do whatever this VOP does 177 * if (error == 0) 178 * zfs_log_*(...); // on success, make ZIL entry 179 * dmu_tx_commit(tx); // commit DMU tx -- error or not 180 * rw_exit(...); // drop locks 181 * zfs_dirent_unlock(dl); // unlock directory entry 182 * VN_RELE(...); // release held vnodes 183 * zil_commit(zilog, foid); // synchronous when necessary 184 * ZFS_EXIT(zfsvfs); // finished in zfs 185 * return (error); // done, report error 186 */ 187 188 /* ARGSUSED */ 189 static int 190 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 191 { 192 znode_t *zp = VTOZ(*vpp); 193 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 194 195 ZFS_ENTER(zfsvfs); 196 ZFS_VERIFY_ZP(zp); 197 198 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 199 ((flag & FAPPEND) == 0)) { 200 ZFS_EXIT(zfsvfs); 201 return (SET_ERROR(EPERM)); 202 } 203 204 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 205 ZTOV(zp)->v_type == VREG && 206 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { 207 if (fs_vscan(*vpp, cr, 0) != 0) { 208 ZFS_EXIT(zfsvfs); 209 return (SET_ERROR(EACCES)); 210 } 211 } 212 213 /* Keep a count of the synchronous opens in the znode */ 214 if (flag & (FSYNC | FDSYNC)) 215 atomic_inc_32(&zp->z_sync_cnt); 216 217 ZFS_EXIT(zfsvfs); 218 return (0); 219 } 220 221 /* ARGSUSED */ 222 static int 223 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 224 caller_context_t *ct) 225 { 226 znode_t *zp = VTOZ(vp); 227 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 228 229 /* 230 * Clean up any locks held by this process on the vp. 231 */ 232 cleanlocks(vp, ddi_get_pid(), 0); 233 cleanshares(vp, ddi_get_pid()); 234 235 ZFS_ENTER(zfsvfs); 236 ZFS_VERIFY_ZP(zp); 237 238 /* Decrement the synchronous opens in the znode */ 239 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 240 atomic_dec_32(&zp->z_sync_cnt); 241 242 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 243 ZTOV(zp)->v_type == VREG && 244 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) 245 VERIFY(fs_vscan(vp, cr, 1) == 0); 246 247 if (ZTOV(zp)->v_type == VREG && zp->z_new_content) { 248 zp->z_new_content = 0; 249 rw_enter(&rz_zev_rwlock, RW_READER); 250 if (rz_zev_callbacks && 251 rz_zev_callbacks->rz_zev_znode_close_after_update) 252 rz_zev_callbacks->rz_zev_znode_close_after_update(zp); 253 rw_exit(&rz_zev_rwlock); 254 } 255 256 ZFS_EXIT(zfsvfs); 257 return (0); 258 } 259 260 /* 261 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 262 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 263 */ 264 static int 265 zfs_holey(vnode_t *vp, int cmd, offset_t *off) 266 { 267 znode_t *zp = VTOZ(vp); 268 uint64_t noff = (uint64_t)*off; /* new offset */ 269 uint64_t file_sz; 270 int error; 271 boolean_t hole; 272 273 file_sz = zp->z_size; 274 if (noff >= file_sz) { 275 return (SET_ERROR(ENXIO)); 276 } 277 278 if (cmd == _FIO_SEEK_HOLE) 279 hole = B_TRUE; 280 else 281 hole = B_FALSE; 282 283 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 284 285 /* end of file? */ 286 if ((error == ESRCH) || (noff > file_sz)) { 287 /* 288 * Handle the virtual hole at the end of file. 289 */ 290 if (hole) { 291 *off = file_sz; 292 return (0); 293 } 294 return (SET_ERROR(ENXIO)); 295 } 296 297 if (noff < *off) 298 return (error); 299 *off = noff; 300 return (error); 301 } 302 303 /* ARGSUSED */ 304 static int 305 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, 306 int *rvalp, caller_context_t *ct) 307 { 308 offset_t off; 309 int error; 310 zfsvfs_t *zfsvfs; 311 znode_t *zp; 312 313 switch (com) { 314 case _FIOFFS: 315 return (zfs_sync(vp->v_vfsp, 0, cred)); 316 317 /* 318 * The following two ioctls are used by bfu. Faking out, 319 * necessary to avoid bfu errors. 320 */ 321 case _FIOGDIO: 322 case _FIOSDIO: 323 return (0); 324 325 case _FIO_SEEK_DATA: 326 case _FIO_SEEK_HOLE: 327 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 328 return (SET_ERROR(EFAULT)); 329 330 zp = VTOZ(vp); 331 zfsvfs = zp->z_zfsvfs; 332 ZFS_ENTER(zfsvfs); 333 ZFS_VERIFY_ZP(zp); 334 335 /* offset parameter is in/out */ 336 error = zfs_holey(vp, com, &off); 337 ZFS_EXIT(zfsvfs); 338 if (error) 339 return (error); 340 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 341 return (SET_ERROR(EFAULT)); 342 return (0); 343 } 344 return (SET_ERROR(ENOTTY)); 345 } 346 347 /* 348 * Utility functions to map and unmap a single physical page. These 349 * are used to manage the mappable copies of ZFS file data, and therefore 350 * do not update ref/mod bits. 351 */ 352 caddr_t 353 zfs_map_page(page_t *pp, enum seg_rw rw) 354 { 355 if (kpm_enable) 356 return (hat_kpm_mapin(pp, 0)); 357 ASSERT(rw == S_READ || rw == S_WRITE); 358 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0), 359 (caddr_t)-1)); 360 } 361 362 void 363 zfs_unmap_page(page_t *pp, caddr_t addr) 364 { 365 if (kpm_enable) { 366 hat_kpm_mapout(pp, 0, addr); 367 } else { 368 ppmapout(addr); 369 } 370 } 371 372 /* 373 * When a file is memory mapped, we must keep the IO data synchronized 374 * between the DMU cache and the memory mapped pages. What this means: 375 * 376 * On Write: If we find a memory mapped page, we write to *both* 377 * the page and the dmu buffer. 378 */ 379 static void 380 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) 381 { 382 int64_t off; 383 384 off = start & PAGEOFFSET; 385 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 386 page_t *pp; 387 uint64_t nbytes = MIN(PAGESIZE - off, len); 388 389 if (pp = page_lookup(vp, start, SE_SHARED)) { 390 caddr_t va; 391 392 va = zfs_map_page(pp, S_WRITE); 393 (void) dmu_read(os, oid, start+off, nbytes, va+off, 394 DMU_READ_PREFETCH); 395 zfs_unmap_page(pp, va); 396 page_unlock(pp); 397 } 398 len -= nbytes; 399 off = 0; 400 } 401 } 402 403 /* 404 * When a file is memory mapped, we must keep the IO data synchronized 405 * between the DMU cache and the memory mapped pages. What this means: 406 * 407 * On Read: We "read" preferentially from memory mapped pages, 408 * else we default from the dmu buffer. 409 * 410 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 411 * the file is memory mapped. 412 */ 413 static int 414 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 415 { 416 znode_t *zp = VTOZ(vp); 417 objset_t *os = zp->z_zfsvfs->z_os; 418 int64_t start, off; 419 int len = nbytes; 420 int error = 0; 421 422 start = uio->uio_loffset; 423 off = start & PAGEOFFSET; 424 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 425 page_t *pp; 426 uint64_t bytes = MIN(PAGESIZE - off, len); 427 428 if (pp = page_lookup(vp, start, SE_SHARED)) { 429 caddr_t va; 430 431 va = zfs_map_page(pp, S_READ); 432 error = uiomove(va + off, bytes, UIO_READ, uio); 433 zfs_unmap_page(pp, va); 434 page_unlock(pp); 435 } else { 436 error = dmu_read_uio(os, zp->z_id, uio, bytes); 437 } 438 len -= bytes; 439 off = 0; 440 if (error) 441 break; 442 } 443 return (error); 444 } 445 446 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 447 448 /* 449 * Read bytes from specified file into supplied buffer. 450 * 451 * IN: vp - vnode of file to be read from. 452 * uio - structure supplying read location, range info, 453 * and return buffer. 454 * ioflag - SYNC flags; used to provide FRSYNC semantics. 455 * cr - credentials of caller. 456 * ct - caller context 457 * 458 * OUT: uio - updated offset and range, buffer filled. 459 * 460 * RETURN: 0 on success, error code on failure. 461 * 462 * Side Effects: 463 * vp - atime updated if byte count > 0 464 */ 465 /* ARGSUSED */ 466 static int 467 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 468 { 469 znode_t *zp = VTOZ(vp); 470 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 471 objset_t *os; 472 ssize_t n, nbytes; 473 int error = 0; 474 rl_t *rl; 475 xuio_t *xuio = NULL; 476 477 ZFS_ENTER(zfsvfs); 478 ZFS_VERIFY_ZP(zp); 479 os = zfsvfs->z_os; 480 481 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 482 ZFS_EXIT(zfsvfs); 483 return (SET_ERROR(EACCES)); 484 } 485 486 /* 487 * Validate file offset 488 */ 489 if (uio->uio_loffset < (offset_t)0) { 490 ZFS_EXIT(zfsvfs); 491 return (SET_ERROR(EINVAL)); 492 } 493 494 /* 495 * Fasttrack empty reads 496 */ 497 if (uio->uio_resid == 0) { 498 ZFS_EXIT(zfsvfs); 499 return (0); 500 } 501 502 /* 503 * Check for mandatory locks 504 */ 505 if (MANDMODE(zp->z_mode)) { 506 if (error = chklock(vp, FREAD, 507 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 508 ZFS_EXIT(zfsvfs); 509 return (error); 510 } 511 } 512 513 /* 514 * If we're in FRSYNC mode, sync out this znode before reading it. 515 */ 516 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 517 zil_commit(zfsvfs->z_log, zp->z_id); 518 519 /* 520 * Lock the range against changes. 521 */ 522 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 523 524 /* 525 * If we are reading past end-of-file we can skip 526 * to the end; but we might still need to set atime. 527 */ 528 if (uio->uio_loffset >= zp->z_size) { 529 error = 0; 530 goto out; 531 } 532 533 ASSERT(uio->uio_loffset < zp->z_size); 534 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); 535 536 if ((uio->uio_extflg == UIO_XUIO) && 537 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { 538 int nblk; 539 int blksz = zp->z_blksz; 540 uint64_t offset = uio->uio_loffset; 541 542 xuio = (xuio_t *)uio; 543 if ((ISP2(blksz))) { 544 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, 545 blksz)) / blksz; 546 } else { 547 ASSERT(offset + n <= blksz); 548 nblk = 1; 549 } 550 (void) dmu_xuio_init(xuio, nblk); 551 552 if (vn_has_cached_data(vp)) { 553 /* 554 * For simplicity, we always allocate a full buffer 555 * even if we only expect to read a portion of a block. 556 */ 557 while (--nblk >= 0) { 558 (void) dmu_xuio_add(xuio, 559 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 560 blksz), 0, blksz); 561 } 562 } 563 } 564 565 while (n > 0) { 566 nbytes = MIN(n, zfs_read_chunk_size - 567 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 568 569 if (vn_has_cached_data(vp)) 570 error = mappedread(vp, nbytes, uio); 571 else 572 error = dmu_read_uio(os, zp->z_id, uio, nbytes); 573 if (error) { 574 /* convert checksum errors into IO errors */ 575 if (error == ECKSUM) 576 error = SET_ERROR(EIO); 577 break; 578 } 579 580 n -= nbytes; 581 } 582 out: 583 zfs_range_unlock(rl); 584 585 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 586 ZFS_EXIT(zfsvfs); 587 return (error); 588 } 589 590 /* 591 * Write the bytes to a file. 592 * 593 * IN: vp - vnode of file to be written to. 594 * uio - structure supplying write location, range info, 595 * and data buffer. 596 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is 597 * set if in append mode. 598 * cr - credentials of caller. 599 * ct - caller context (NFS/CIFS fem monitor only) 600 * 601 * OUT: uio - updated offset and range. 602 * 603 * RETURN: 0 on success, error code on failure. 604 * 605 * Timestamps: 606 * vp - ctime|mtime updated if byte count > 0 607 */ 608 609 /* ARGSUSED */ 610 static int 611 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 612 { 613 znode_t *zp = VTOZ(vp); 614 rlim64_t limit = uio->uio_llimit; 615 ssize_t start_resid = uio->uio_resid; 616 ssize_t tx_bytes; 617 uint64_t end_size; 618 dmu_tx_t *tx; 619 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 620 zilog_t *zilog; 621 offset_t woff; 622 ssize_t n, nbytes; 623 rl_t *rl; 624 int max_blksz = zfsvfs->z_max_blksz; 625 int error = 0; 626 arc_buf_t *abuf; 627 iovec_t *aiov = NULL; 628 xuio_t *xuio = NULL; 629 int i_iov = 0; 630 int iovcnt = uio->uio_iovcnt; 631 iovec_t *iovp = uio->uio_iov; 632 int write_eof; 633 int count = 0; 634 sa_bulk_attr_t bulk[4]; 635 uint64_t mtime[2], ctime[2]; 636 ssize_t lock_off, lock_len; 637 638 /* 639 * Fasttrack empty write 640 */ 641 n = start_resid; 642 if (n == 0) 643 return (0); 644 645 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 646 limit = MAXOFFSET_T; 647 648 ZFS_ENTER(zfsvfs); 649 ZFS_VERIFY_ZP(zp); 650 651 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 652 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 653 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 654 &zp->z_size, 8); 655 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 656 &zp->z_pflags, 8); 657 658 /* 659 * If immutable or not appending then return EPERM 660 */ 661 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 662 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 663 (uio->uio_loffset < zp->z_size))) { 664 ZFS_EXIT(zfsvfs); 665 return (SET_ERROR(EPERM)); 666 } 667 668 zilog = zfsvfs->z_log; 669 670 /* 671 * Validate file offset 672 */ 673 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 674 if (woff < 0) { 675 ZFS_EXIT(zfsvfs); 676 return (SET_ERROR(EINVAL)); 677 } 678 679 /* 680 * Check for mandatory locks before calling zfs_range_lock() 681 * in order to prevent a deadlock with locks set via fcntl(). 682 */ 683 if (MANDMODE((mode_t)zp->z_mode) && 684 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 685 ZFS_EXIT(zfsvfs); 686 return (error); 687 } 688 689 /* 690 * Pre-fault the pages to ensure slow (eg NFS) pages 691 * don't hold up txg. 692 * Skip this if uio contains loaned arc_buf. 693 */ 694 if ((uio->uio_extflg == UIO_XUIO) && 695 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 696 xuio = (xuio_t *)uio; 697 else 698 uio_prefaultpages(MIN(n, max_blksz), uio); 699 700 /* 701 * If in append mode, set the io offset pointer to eof. 702 */ 703 if (ioflag & FAPPEND) { 704 /* 705 * Obtain an appending range lock to guarantee file append 706 * semantics. We reset the write offset once we have the lock. 707 */ 708 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 709 woff = rl->r_off; 710 if (rl->r_len == UINT64_MAX) { 711 /* 712 * We overlocked the file because this write will cause 713 * the file block size to increase. 714 * Note that zp_size cannot change with this lock held. 715 */ 716 woff = zp->z_size; 717 } 718 uio->uio_loffset = woff; 719 } else { 720 /* 721 * Note that if the file block size will change as a result of 722 * this write, then this range lock will lock the entire file 723 * so that we can re-write the block safely. 724 */ 725 726 /* 727 * If in zev mode, lock offsets are quantized to 1MB chunks 728 * so that we can calculate level 1 checksums later on. 729 */ 730 if (rz_zev_active()) { 731 /* start of this megabyte */ 732 lock_off = P2ALIGN(woff, ZEV_L1_SIZE); 733 /* full megabytes */ 734 lock_len = n + (woff - lock_off); 735 lock_len = P2ROUNDUP(lock_len, ZEV_L1_SIZE); 736 } else { 737 lock_off = woff; 738 lock_len = n; 739 } 740 741 rl = zfs_range_lock(zp, lock_off, lock_len, RL_WRITER); 742 } 743 744 if (woff >= limit) { 745 zfs_range_unlock(rl); 746 ZFS_EXIT(zfsvfs); 747 return (SET_ERROR(EFBIG)); 748 } 749 750 if ((woff + n) > limit || woff > (limit - n)) 751 n = limit - woff; 752 753 /* Will this write extend the file length? */ 754 write_eof = (woff + n > zp->z_size); 755 756 end_size = MAX(zp->z_size, woff + n); 757 758 /* 759 * Write the file in reasonable size chunks. Each chunk is written 760 * in a separate transaction; this keeps the intent log records small 761 * and allows us to do more fine-grained space accounting. 762 */ 763 while (n > 0) { 764 abuf = NULL; 765 woff = uio->uio_loffset; 766 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 767 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 768 if (abuf != NULL) 769 dmu_return_arcbuf(abuf); 770 error = SET_ERROR(EDQUOT); 771 break; 772 } 773 774 if (xuio && abuf == NULL) { 775 ASSERT(i_iov < iovcnt); 776 aiov = &iovp[i_iov]; 777 abuf = dmu_xuio_arcbuf(xuio, i_iov); 778 dmu_xuio_clear(xuio, i_iov); 779 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 780 iovec_t *, aiov, arc_buf_t *, abuf); 781 ASSERT((aiov->iov_base == abuf->b_data) || 782 ((char *)aiov->iov_base - (char *)abuf->b_data + 783 aiov->iov_len == arc_buf_size(abuf))); 784 i_iov++; 785 } else if (abuf == NULL && n >= max_blksz && 786 woff >= zp->z_size && 787 P2PHASE(woff, max_blksz) == 0 && 788 zp->z_blksz == max_blksz) { 789 /* 790 * This write covers a full block. "Borrow" a buffer 791 * from the dmu so that we can fill it before we enter 792 * a transaction. This avoids the possibility of 793 * holding up the transaction if the data copy hangs 794 * up on a pagefault (e.g., from an NFS server mapping). 795 */ 796 size_t cbytes; 797 798 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 799 max_blksz); 800 ASSERT(abuf != NULL); 801 ASSERT(arc_buf_size(abuf) == max_blksz); 802 if (error = uiocopy(abuf->b_data, max_blksz, 803 UIO_WRITE, uio, &cbytes)) { 804 dmu_return_arcbuf(abuf); 805 break; 806 } 807 ASSERT(cbytes == max_blksz); 808 } 809 810 /* 811 * Start a transaction. 812 */ 813 tx = dmu_tx_create(zfsvfs->z_os); 814 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 815 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 816 zfs_sa_upgrade_txholds(tx, zp); 817 error = dmu_tx_assign(tx, TXG_WAIT); 818 if (error) { 819 dmu_tx_abort(tx); 820 if (abuf != NULL) 821 dmu_return_arcbuf(abuf); 822 break; 823 } 824 825 /* 826 * If zfs_range_lock() over-locked we grow the blocksize 827 * and then reduce the lock range. This will only happen 828 * on the first iteration since zfs_range_reduce() will 829 * shrink down r_len to the appropriate size. 830 */ 831 if (rl->r_len == UINT64_MAX) { 832 uint64_t new_blksz; 833 834 if (zp->z_blksz > max_blksz) { 835 ASSERT(!ISP2(zp->z_blksz)); 836 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 837 } else { 838 new_blksz = MIN(end_size, max_blksz); 839 } 840 zfs_grow_blocksize(zp, new_blksz, tx); 841 zfs_range_reduce(rl, woff, n); 842 } 843 844 /* 845 * XXX - should we really limit each write to z_max_blksz? 846 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 847 */ 848 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 849 850 if (abuf == NULL) { 851 tx_bytes = uio->uio_resid; 852 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 853 uio, nbytes, tx); 854 tx_bytes -= uio->uio_resid; 855 } else { 856 tx_bytes = nbytes; 857 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 858 /* 859 * If this is not a full block write, but we are 860 * extending the file past EOF and this data starts 861 * block-aligned, use assign_arcbuf(). Otherwise, 862 * write via dmu_write(). 863 */ 864 if (tx_bytes < max_blksz && (!write_eof || 865 aiov->iov_base != abuf->b_data)) { 866 ASSERT(xuio); 867 dmu_write(zfsvfs->z_os, zp->z_id, woff, 868 aiov->iov_len, aiov->iov_base, tx); 869 dmu_return_arcbuf(abuf); 870 xuio_stat_wbuf_copied(); 871 } else { 872 ASSERT(xuio || tx_bytes == max_blksz); 873 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 874 woff, abuf, tx); 875 } 876 ASSERT(tx_bytes <= uio->uio_resid); 877 uioskip(uio, tx_bytes); 878 } 879 if (tx_bytes && vn_has_cached_data(vp)) { 880 update_pages(vp, woff, 881 tx_bytes, zfsvfs->z_os, zp->z_id); 882 } 883 884 /* 885 * If we made no progress, we're done. If we made even 886 * partial progress, update the znode and ZIL accordingly. 887 */ 888 if (tx_bytes == 0) { 889 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 890 (void *)&zp->z_size, sizeof (uint64_t), tx); 891 dmu_tx_commit(tx); 892 ASSERT(error != 0); 893 break; 894 } 895 896 /* 897 * Clear Set-UID/Set-GID bits on successful write if not 898 * privileged and at least one of the excute bits is set. 899 * 900 * It would be nice to to this after all writes have 901 * been done, but that would still expose the ISUID/ISGID 902 * to another app after the partial write is committed. 903 * 904 * Note: we don't call zfs_fuid_map_id() here because 905 * user 0 is not an ephemeral uid. 906 */ 907 mutex_enter(&zp->z_acl_lock); 908 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 909 (S_IXUSR >> 6))) != 0 && 910 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 911 secpolicy_vnode_setid_retain(cr, 912 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 913 uint64_t newmode; 914 zp->z_mode &= ~(S_ISUID | S_ISGID); 915 newmode = zp->z_mode; 916 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 917 (void *)&newmode, sizeof (uint64_t), tx); 918 } 919 mutex_exit(&zp->z_acl_lock); 920 921 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 922 B_TRUE); 923 924 /* 925 * Update the file size (zp_size) if it has changed; 926 * account for possible concurrent updates. 927 */ 928 while ((end_size = zp->z_size) < uio->uio_loffset) { 929 (void) atomic_cas_64(&zp->z_size, end_size, 930 uio->uio_loffset); 931 ASSERT(error == 0); 932 } 933 /* 934 * If we are replaying and eof is non zero then force 935 * the file size to the specified eof. Note, there's no 936 * concurrency during replay. 937 */ 938 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 939 zp->z_size = zfsvfs->z_replay_eof; 940 941 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 942 943 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 944 dmu_tx_commit(tx); 945 946 if (error != 0) 947 break; 948 ASSERT(tx_bytes == nbytes); 949 n -= nbytes; 950 951 if (!xuio && n > 0) 952 uio_prefaultpages(MIN(n, max_blksz), uio); 953 } 954 955 zfs_range_unlock(rl); 956 957 /* 958 * If we're in replay mode, or we made no progress, return error. 959 * Otherwise, it's at least a partial write, so it's successful. 960 */ 961 if (zfsvfs->z_replay || uio->uio_resid == start_resid) { 962 ZFS_EXIT(zfsvfs); 963 return (error); 964 } 965 966 if (ioflag & (FSYNC | FDSYNC) || 967 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 968 zil_commit(zilog, zp->z_id); 969 970 ZFS_EXIT(zfsvfs); 971 return (0); 972 } 973 974 void 975 zfs_get_done(zgd_t *zgd, int error) 976 { 977 znode_t *zp = zgd->zgd_private; 978 objset_t *os = zp->z_zfsvfs->z_os; 979 980 if (zgd->zgd_db) 981 dmu_buf_rele(zgd->zgd_db, zgd); 982 983 zfs_range_unlock(zgd->zgd_rl); 984 985 /* 986 * Release the vnode asynchronously as we currently have the 987 * txg stopped from syncing. 988 */ 989 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 990 991 if (error == 0 && zgd->zgd_bp) 992 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 993 994 kmem_free(zgd, sizeof (zgd_t)); 995 } 996 997 #ifdef DEBUG 998 static int zil_fault_io = 0; 999 #endif 1000 1001 /* 1002 * Get data to generate a TX_WRITE intent log record. 1003 */ 1004 int 1005 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 1006 { 1007 zfsvfs_t *zfsvfs = arg; 1008 objset_t *os = zfsvfs->z_os; 1009 znode_t *zp; 1010 uint64_t object = lr->lr_foid; 1011 uint64_t offset = lr->lr_offset; 1012 uint64_t size = lr->lr_length; 1013 blkptr_t *bp = &lr->lr_blkptr; 1014 dmu_buf_t *db; 1015 zgd_t *zgd; 1016 int error = 0; 1017 1018 ASSERT(zio != NULL); 1019 ASSERT(size != 0); 1020 1021 /* 1022 * Nothing to do if the file has been removed 1023 */ 1024 if (zfs_zget(zfsvfs, object, &zp) != 0) 1025 return (SET_ERROR(ENOENT)); 1026 if (zp->z_unlinked) { 1027 /* 1028 * Release the vnode asynchronously as we currently have the 1029 * txg stopped from syncing. 1030 */ 1031 VN_RELE_ASYNC(ZTOV(zp), 1032 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1033 return (SET_ERROR(ENOENT)); 1034 } 1035 1036 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1037 zgd->zgd_zilog = zfsvfs->z_log; 1038 zgd->zgd_private = zp; 1039 1040 /* 1041 * Write records come in two flavors: immediate and indirect. 1042 * For small writes it's cheaper to store the data with the 1043 * log record (immediate); for large writes it's cheaper to 1044 * sync the data and get a pointer to it (indirect) so that 1045 * we don't have to write the data twice. 1046 */ 1047 if (buf != NULL) { /* immediate write */ 1048 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); 1049 /* test for truncation needs to be done while range locked */ 1050 if (offset >= zp->z_size) { 1051 error = SET_ERROR(ENOENT); 1052 } else { 1053 error = dmu_read(os, object, offset, size, buf, 1054 DMU_READ_NO_PREFETCH); 1055 } 1056 ASSERT(error == 0 || error == ENOENT); 1057 } else { /* indirect write */ 1058 /* 1059 * Have to lock the whole block to ensure when it's 1060 * written out and it's checksum is being calculated 1061 * that no one can change the data. We need to re-check 1062 * blocksize after we get the lock in case it's changed! 1063 */ 1064 for (;;) { 1065 uint64_t blkoff; 1066 size = zp->z_blksz; 1067 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; 1068 offset -= blkoff; 1069 zgd->zgd_rl = zfs_range_lock(zp, offset, size, 1070 RL_READER); 1071 if (zp->z_blksz == size) 1072 break; 1073 offset += blkoff; 1074 zfs_range_unlock(zgd->zgd_rl); 1075 } 1076 /* test for truncation needs to be done while range locked */ 1077 if (lr->lr_offset >= zp->z_size) 1078 error = SET_ERROR(ENOENT); 1079 #ifdef DEBUG 1080 if (zil_fault_io) { 1081 error = SET_ERROR(EIO); 1082 zil_fault_io = 0; 1083 } 1084 #endif 1085 if (error == 0) 1086 error = dmu_buf_hold(os, object, offset, zgd, &db, 1087 DMU_READ_NO_PREFETCH); 1088 1089 if (error == 0) { 1090 blkptr_t *obp = dmu_buf_get_blkptr(db); 1091 if (obp) { 1092 ASSERT(BP_IS_HOLE(bp)); 1093 *bp = *obp; 1094 } 1095 1096 zgd->zgd_db = db; 1097 zgd->zgd_bp = bp; 1098 1099 ASSERT(db->db_offset == offset); 1100 ASSERT(db->db_size == size); 1101 1102 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1103 zfs_get_done, zgd); 1104 ASSERT(error || lr->lr_length <= zp->z_blksz); 1105 1106 /* 1107 * On success, we need to wait for the write I/O 1108 * initiated by dmu_sync() to complete before we can 1109 * release this dbuf. We will finish everything up 1110 * in the zfs_get_done() callback. 1111 */ 1112 if (error == 0) 1113 return (0); 1114 1115 if (error == EALREADY) { 1116 lr->lr_common.lrc_txtype = TX_WRITE2; 1117 error = 0; 1118 } 1119 } 1120 } 1121 1122 zfs_get_done(zgd, error); 1123 1124 return (error); 1125 } 1126 1127 /*ARGSUSED*/ 1128 static int 1129 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1130 caller_context_t *ct) 1131 { 1132 znode_t *zp = VTOZ(vp); 1133 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1134 int error; 1135 1136 ZFS_ENTER(zfsvfs); 1137 ZFS_VERIFY_ZP(zp); 1138 1139 if (flag & V_ACE_MASK) 1140 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1141 else 1142 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1143 1144 ZFS_EXIT(zfsvfs); 1145 return (error); 1146 } 1147 1148 /* 1149 * If vnode is for a device return a specfs vnode instead. 1150 */ 1151 static int 1152 specvp_check(vnode_t **vpp, cred_t *cr) 1153 { 1154 int error = 0; 1155 1156 if (IS_DEVVP(*vpp)) { 1157 struct vnode *svp; 1158 1159 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1160 VN_RELE(*vpp); 1161 if (svp == NULL) 1162 error = SET_ERROR(ENOSYS); 1163 *vpp = svp; 1164 } 1165 return (error); 1166 } 1167 1168 1169 /* 1170 * Lookup an entry in a directory, or an extended attribute directory. 1171 * If it exists, return a held vnode reference for it. 1172 * 1173 * IN: dvp - vnode of directory to search. 1174 * nm - name of entry to lookup. 1175 * pnp - full pathname to lookup [UNUSED]. 1176 * flags - LOOKUP_XATTR set if looking for an attribute. 1177 * rdir - root directory vnode [UNUSED]. 1178 * cr - credentials of caller. 1179 * ct - caller context 1180 * direntflags - directory lookup flags 1181 * realpnp - returned pathname. 1182 * 1183 * OUT: vpp - vnode of located entry, NULL if not found. 1184 * 1185 * RETURN: 0 on success, error code on failure. 1186 * 1187 * Timestamps: 1188 * NA 1189 */ 1190 /* ARGSUSED */ 1191 static int 1192 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 1193 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1194 int *direntflags, pathname_t *realpnp) 1195 { 1196 znode_t *zdp = VTOZ(dvp); 1197 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1198 int error = 0; 1199 1200 /* fast path */ 1201 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 1202 1203 if (dvp->v_type != VDIR) { 1204 return (SET_ERROR(ENOTDIR)); 1205 } else if (zdp->z_sa_hdl == NULL) { 1206 return (SET_ERROR(EIO)); 1207 } 1208 1209 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 1210 error = zfs_fastaccesschk_execute(zdp, cr); 1211 if (!error) { 1212 *vpp = dvp; 1213 VN_HOLD(*vpp); 1214 return (0); 1215 } 1216 return (error); 1217 } else { 1218 vnode_t *tvp = dnlc_lookup(dvp, nm); 1219 1220 if (tvp) { 1221 error = zfs_fastaccesschk_execute(zdp, cr); 1222 if (error) { 1223 VN_RELE(tvp); 1224 return (error); 1225 } 1226 if (tvp == DNLC_NO_VNODE) { 1227 VN_RELE(tvp); 1228 return (SET_ERROR(ENOENT)); 1229 } else { 1230 *vpp = tvp; 1231 return (specvp_check(vpp, cr)); 1232 } 1233 } 1234 } 1235 } 1236 1237 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1238 1239 ZFS_ENTER(zfsvfs); 1240 ZFS_VERIFY_ZP(zdp); 1241 1242 *vpp = NULL; 1243 1244 if (flags & LOOKUP_XATTR) { 1245 /* 1246 * If the xattr property is off, refuse the lookup request. 1247 */ 1248 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1249 ZFS_EXIT(zfsvfs); 1250 return (SET_ERROR(EINVAL)); 1251 } 1252 1253 /* 1254 * We don't allow recursive attributes.. 1255 * Maybe someday we will. 1256 */ 1257 if (zdp->z_pflags & ZFS_XATTR) { 1258 ZFS_EXIT(zfsvfs); 1259 return (SET_ERROR(EINVAL)); 1260 } 1261 1262 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1263 ZFS_EXIT(zfsvfs); 1264 return (error); 1265 } 1266 1267 /* 1268 * Do we have permission to get into attribute directory? 1269 */ 1270 1271 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1272 B_FALSE, cr)) { 1273 VN_RELE(*vpp); 1274 *vpp = NULL; 1275 } 1276 1277 ZFS_EXIT(zfsvfs); 1278 return (error); 1279 } 1280 1281 if (dvp->v_type != VDIR) { 1282 ZFS_EXIT(zfsvfs); 1283 return (SET_ERROR(ENOTDIR)); 1284 } 1285 1286 /* 1287 * Check accessibility of directory. 1288 */ 1289 1290 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1291 ZFS_EXIT(zfsvfs); 1292 return (error); 1293 } 1294 1295 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1296 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1297 ZFS_EXIT(zfsvfs); 1298 return (SET_ERROR(EILSEQ)); 1299 } 1300 1301 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1302 if (error == 0) 1303 error = specvp_check(vpp, cr); 1304 1305 ZFS_EXIT(zfsvfs); 1306 return (error); 1307 } 1308 1309 /* 1310 * Attempt to create a new entry in a directory. If the entry 1311 * already exists, truncate the file if permissible, else return 1312 * an error. Return the vp of the created or trunc'd file. 1313 * 1314 * IN: dvp - vnode of directory to put new file entry in. 1315 * name - name of new file entry. 1316 * vap - attributes of new file. 1317 * excl - flag indicating exclusive or non-exclusive mode. 1318 * mode - mode to open file with. 1319 * cr - credentials of caller. 1320 * flag - large file flag [UNUSED]. 1321 * ct - caller context 1322 * vsecp - ACL to be set 1323 * 1324 * OUT: vpp - vnode of created or trunc'd entry. 1325 * 1326 * RETURN: 0 on success, error code on failure. 1327 * 1328 * Timestamps: 1329 * dvp - ctime|mtime updated if new entry created 1330 * vp - ctime|mtime always, atime if new 1331 */ 1332 1333 /* ARGSUSED */ 1334 static int 1335 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1336 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 1337 vsecattr_t *vsecp) 1338 { 1339 znode_t *zp, *dzp = VTOZ(dvp); 1340 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1341 zilog_t *zilog; 1342 objset_t *os; 1343 zfs_dirlock_t *dl; 1344 dmu_tx_t *tx; 1345 int error; 1346 ksid_t *ksid; 1347 uid_t uid; 1348 gid_t gid = crgetgid(cr); 1349 zfs_acl_ids_t acl_ids; 1350 boolean_t fuid_dirtied; 1351 boolean_t have_acl = B_FALSE; 1352 boolean_t waited = B_FALSE; 1353 1354 /* 1355 * If we have an ephemeral id, ACL, or XVATTR then 1356 * make sure file system is at proper version 1357 */ 1358 1359 ksid = crgetsid(cr, KSID_OWNER); 1360 if (ksid) 1361 uid = ksid_getid(ksid); 1362 else 1363 uid = crgetuid(cr); 1364 1365 if (zfsvfs->z_use_fuids == B_FALSE && 1366 (vsecp || (vap->va_mask & AT_XVATTR) || 1367 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1368 return (SET_ERROR(EINVAL)); 1369 1370 ZFS_ENTER(zfsvfs); 1371 ZFS_VERIFY_ZP(dzp); 1372 os = zfsvfs->z_os; 1373 zilog = zfsvfs->z_log; 1374 1375 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1376 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1377 ZFS_EXIT(zfsvfs); 1378 return (SET_ERROR(EILSEQ)); 1379 } 1380 1381 if (vap->va_mask & AT_XVATTR) { 1382 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1383 crgetuid(cr), cr, vap->va_type)) != 0) { 1384 ZFS_EXIT(zfsvfs); 1385 return (error); 1386 } 1387 } 1388 top: 1389 *vpp = NULL; 1390 1391 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) 1392 vap->va_mode &= ~VSVTX; 1393 1394 if (*name == '\0') { 1395 /* 1396 * Null component name refers to the directory itself. 1397 */ 1398 VN_HOLD(dvp); 1399 zp = dzp; 1400 dl = NULL; 1401 error = 0; 1402 } else { 1403 /* possible VN_HOLD(zp) */ 1404 int zflg = 0; 1405 1406 if (flag & FIGNORECASE) 1407 zflg |= ZCILOOK; 1408 1409 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1410 NULL, NULL); 1411 if (error) { 1412 if (have_acl) 1413 zfs_acl_ids_free(&acl_ids); 1414 if (strcmp(name, "..") == 0) 1415 error = SET_ERROR(EISDIR); 1416 ZFS_EXIT(zfsvfs); 1417 return (error); 1418 } 1419 } 1420 1421 if (zp == NULL) { 1422 uint64_t txtype; 1423 1424 /* 1425 * Create a new file object and update the directory 1426 * to reference it. 1427 */ 1428 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1429 if (have_acl) 1430 zfs_acl_ids_free(&acl_ids); 1431 goto out; 1432 } 1433 1434 /* 1435 * We only support the creation of regular files in 1436 * extended attribute directories. 1437 */ 1438 1439 if ((dzp->z_pflags & ZFS_XATTR) && 1440 (vap->va_type != VREG)) { 1441 if (have_acl) 1442 zfs_acl_ids_free(&acl_ids); 1443 error = SET_ERROR(EINVAL); 1444 goto out; 1445 } 1446 1447 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 1448 cr, vsecp, &acl_ids)) != 0) 1449 goto out; 1450 have_acl = B_TRUE; 1451 1452 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 1453 zfs_acl_ids_free(&acl_ids); 1454 error = SET_ERROR(EDQUOT); 1455 goto out; 1456 } 1457 1458 tx = dmu_tx_create(os); 1459 1460 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1461 ZFS_SA_BASE_ATTR_SIZE); 1462 1463 fuid_dirtied = zfsvfs->z_fuid_dirty; 1464 if (fuid_dirtied) 1465 zfs_fuid_txhold(zfsvfs, tx); 1466 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1467 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 1468 if (!zfsvfs->z_use_sa && 1469 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1470 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1471 0, acl_ids.z_aclp->z_acl_bytes); 1472 } 1473 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 1474 if (error) { 1475 zfs_dirent_unlock(dl); 1476 if (error == ERESTART) { 1477 waited = B_TRUE; 1478 dmu_tx_wait(tx); 1479 dmu_tx_abort(tx); 1480 goto top; 1481 } 1482 zfs_acl_ids_free(&acl_ids); 1483 dmu_tx_abort(tx); 1484 ZFS_EXIT(zfsvfs); 1485 return (error); 1486 } 1487 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1488 1489 if (fuid_dirtied) 1490 zfs_fuid_sync(zfsvfs, tx); 1491 1492 (void) zfs_link_create(dl, zp, tx, ZNEW); 1493 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1494 if (flag & FIGNORECASE) 1495 txtype |= TX_CI; 1496 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1497 vsecp, acl_ids.z_fuidp, vap); 1498 zfs_acl_ids_free(&acl_ids); 1499 dmu_tx_commit(tx); 1500 } else { 1501 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1502 1503 if (have_acl) 1504 zfs_acl_ids_free(&acl_ids); 1505 have_acl = B_FALSE; 1506 1507 /* 1508 * A directory entry already exists for this name. 1509 */ 1510 /* 1511 * Can't truncate an existing file if in exclusive mode. 1512 */ 1513 if (excl == EXCL) { 1514 error = SET_ERROR(EEXIST); 1515 goto out; 1516 } 1517 /* 1518 * Can't open a directory for writing. 1519 */ 1520 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1521 error = SET_ERROR(EISDIR); 1522 goto out; 1523 } 1524 /* 1525 * Verify requested access to file. 1526 */ 1527 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1528 goto out; 1529 } 1530 1531 mutex_enter(&dzp->z_lock); 1532 dzp->z_seq++; 1533 mutex_exit(&dzp->z_lock); 1534 1535 /* 1536 * Truncate regular files if requested. 1537 */ 1538 if ((ZTOV(zp)->v_type == VREG) && 1539 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1540 /* we can't hold any locks when calling zfs_freesp() */ 1541 zfs_dirent_unlock(dl); 1542 dl = NULL; 1543 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1544 if (error == 0) { 1545 vnevent_create(ZTOV(zp), ct); 1546 } 1547 } 1548 } 1549 out: 1550 1551 if (dl) 1552 zfs_dirent_unlock(dl); 1553 1554 if (error) { 1555 if (zp) 1556 VN_RELE(ZTOV(zp)); 1557 } else { 1558 *vpp = ZTOV(zp); 1559 error = specvp_check(vpp, cr); 1560 } 1561 1562 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1563 zil_commit(zilog, 0); 1564 1565 ZFS_EXIT(zfsvfs); 1566 return (error); 1567 } 1568 1569 /* 1570 * Remove an entry from a directory. 1571 * 1572 * IN: dvp - vnode of directory to remove entry from. 1573 * name - name of entry to remove. 1574 * cr - credentials of caller. 1575 * ct - caller context 1576 * flags - case flags 1577 * 1578 * RETURN: 0 on success, error code on failure. 1579 * 1580 * Timestamps: 1581 * dvp - ctime|mtime 1582 * vp - ctime (if nlink > 0) 1583 */ 1584 1585 uint64_t null_xattr = 0; 1586 1587 /*ARGSUSED*/ 1588 static int 1589 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1590 int flags) 1591 { 1592 znode_t *zp, *dzp = VTOZ(dvp); 1593 znode_t *xzp; 1594 vnode_t *vp; 1595 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1596 zilog_t *zilog; 1597 uint64_t acl_obj, xattr_obj; 1598 uint64_t xattr_obj_unlinked = 0; 1599 uint64_t obj = 0; 1600 zfs_dirlock_t *dl; 1601 dmu_tx_t *tx; 1602 boolean_t may_delete_now, delete_now = FALSE; 1603 boolean_t unlinked, toobig = FALSE; 1604 uint64_t txtype; 1605 pathname_t *realnmp = NULL; 1606 pathname_t realnm; 1607 int error; 1608 int zflg = ZEXISTS; 1609 boolean_t waited = B_FALSE; 1610 1611 ZFS_ENTER(zfsvfs); 1612 ZFS_VERIFY_ZP(dzp); 1613 zilog = zfsvfs->z_log; 1614 1615 if (flags & FIGNORECASE) { 1616 zflg |= ZCILOOK; 1617 pn_alloc(&realnm); 1618 realnmp = &realnm; 1619 } 1620 1621 top: 1622 xattr_obj = 0; 1623 xzp = NULL; 1624 /* 1625 * Attempt to lock directory; fail if entry doesn't exist. 1626 */ 1627 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1628 NULL, realnmp)) { 1629 if (realnmp) 1630 pn_free(realnmp); 1631 ZFS_EXIT(zfsvfs); 1632 return (error); 1633 } 1634 1635 vp = ZTOV(zp); 1636 1637 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1638 goto out; 1639 } 1640 1641 /* 1642 * Need to use rmdir for removing directories. 1643 */ 1644 if (vp->v_type == VDIR) { 1645 error = SET_ERROR(EPERM); 1646 goto out; 1647 } 1648 1649 vnevent_remove(vp, dvp, name, ct); 1650 1651 if (realnmp) 1652 dnlc_remove(dvp, realnmp->pn_buf); 1653 else 1654 dnlc_remove(dvp, name); 1655 1656 mutex_enter(&vp->v_lock); 1657 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); 1658 mutex_exit(&vp->v_lock); 1659 1660 /* 1661 * We may delete the znode now, or we may put it in the unlinked set; 1662 * it depends on whether we're the last link, and on whether there are 1663 * other holds on the vnode. So we dmu_tx_hold() the right things to 1664 * allow for either case. 1665 */ 1666 obj = zp->z_id; 1667 tx = dmu_tx_create(zfsvfs->z_os); 1668 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1669 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1670 zfs_sa_upgrade_txholds(tx, zp); 1671 zfs_sa_upgrade_txholds(tx, dzp); 1672 if (may_delete_now) { 1673 toobig = 1674 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 1675 /* if the file is too big, only hold_free a token amount */ 1676 dmu_tx_hold_free(tx, zp->z_id, 0, 1677 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1678 } 1679 1680 /* are there any extended attributes? */ 1681 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1682 &xattr_obj, sizeof (xattr_obj)); 1683 if (error == 0 && xattr_obj) { 1684 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1685 ASSERT0(error); 1686 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1687 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1688 } 1689 1690 mutex_enter(&zp->z_lock); 1691 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1692 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1693 mutex_exit(&zp->z_lock); 1694 1695 /* charge as an update -- would be nice not to charge at all */ 1696 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1697 1698 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 1699 if (error) { 1700 zfs_dirent_unlock(dl); 1701 VN_RELE(vp); 1702 if (xzp) 1703 VN_RELE(ZTOV(xzp)); 1704 if (error == ERESTART) { 1705 waited = B_TRUE; 1706 dmu_tx_wait(tx); 1707 dmu_tx_abort(tx); 1708 goto top; 1709 } 1710 if (realnmp) 1711 pn_free(realnmp); 1712 dmu_tx_abort(tx); 1713 ZFS_EXIT(zfsvfs); 1714 return (error); 1715 } 1716 1717 /* 1718 * Remove the directory entry. 1719 */ 1720 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1721 1722 if (error) { 1723 dmu_tx_commit(tx); 1724 goto out; 1725 } 1726 1727 if (unlinked) { 1728 1729 /* 1730 * Hold z_lock so that we can make sure that the ACL obj 1731 * hasn't changed. Could have been deleted due to 1732 * zfs_sa_upgrade(). 1733 */ 1734 mutex_enter(&zp->z_lock); 1735 mutex_enter(&vp->v_lock); 1736 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1737 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1738 delete_now = may_delete_now && !toobig && 1739 vp->v_count == 1 && !vn_has_cached_data(vp) && 1740 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == 1741 acl_obj; 1742 mutex_exit(&vp->v_lock); 1743 } 1744 1745 txtype = TX_REMOVE; 1746 if (flags & FIGNORECASE) 1747 txtype |= TX_CI; 1748 rw_enter(&rz_zev_rwlock, RW_READER); 1749 if (rz_zev_callbacks && rz_zev_callbacks->rz_zev_znode_remove) 1750 rz_zev_callbacks->rz_zev_znode_remove(dzp, zp, tx, 1751 name, txtype); 1752 rw_exit(&rz_zev_rwlock); 1753 1754 if (delete_now) { 1755 if (xattr_obj_unlinked) { 1756 ASSERT3U(xzp->z_links, ==, 2); 1757 mutex_enter(&xzp->z_lock); 1758 xzp->z_unlinked = 1; 1759 xzp->z_links = 0; 1760 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1761 &xzp->z_links, sizeof (xzp->z_links), tx); 1762 ASSERT3U(error, ==, 0); 1763 mutex_exit(&xzp->z_lock); 1764 zfs_unlinked_add(xzp, tx); 1765 1766 if (zp->z_is_sa) 1767 error = sa_remove(zp->z_sa_hdl, 1768 SA_ZPL_XATTR(zfsvfs), tx); 1769 else 1770 error = sa_update(zp->z_sa_hdl, 1771 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1772 sizeof (uint64_t), tx); 1773 ASSERT0(error); 1774 } 1775 mutex_enter(&vp->v_lock); 1776 vp->v_count--; 1777 ASSERT0(vp->v_count); 1778 mutex_exit(&vp->v_lock); 1779 mutex_exit(&zp->z_lock); 1780 zfs_znode_delete(zp, tx); 1781 } else if (unlinked) { 1782 mutex_exit(&zp->z_lock); 1783 zfs_unlinked_add(zp, tx); 1784 } 1785 1786 txtype = TX_REMOVE; 1787 if (flags & FIGNORECASE) 1788 txtype |= TX_CI; 1789 zfs_log_remove(zilog, tx, txtype, dzp, name, obj); 1790 1791 dmu_tx_commit(tx); 1792 out: 1793 if (realnmp) 1794 pn_free(realnmp); 1795 1796 zfs_dirent_unlock(dl); 1797 1798 if (!delete_now) 1799 VN_RELE(vp); 1800 if (xzp) 1801 VN_RELE(ZTOV(xzp)); 1802 1803 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1804 zil_commit(zilog, 0); 1805 1806 ZFS_EXIT(zfsvfs); 1807 return (error); 1808 } 1809 1810 /* 1811 * Create a new directory and insert it into dvp using the name 1812 * provided. Return a pointer to the inserted directory. 1813 * 1814 * IN: dvp - vnode of directory to add subdir to. 1815 * dirname - name of new directory. 1816 * vap - attributes of new directory. 1817 * cr - credentials of caller. 1818 * ct - caller context 1819 * flags - case flags 1820 * vsecp - ACL to be set 1821 * 1822 * OUT: vpp - vnode of created directory. 1823 * 1824 * RETURN: 0 on success, error code on failure. 1825 * 1826 * Timestamps: 1827 * dvp - ctime|mtime updated 1828 * vp - ctime|mtime|atime updated 1829 */ 1830 /*ARGSUSED*/ 1831 static int 1832 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 1833 caller_context_t *ct, int flags, vsecattr_t *vsecp) 1834 { 1835 znode_t *zp, *dzp = VTOZ(dvp); 1836 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1837 zilog_t *zilog; 1838 zfs_dirlock_t *dl; 1839 uint64_t txtype; 1840 dmu_tx_t *tx; 1841 int error; 1842 int zf = ZNEW; 1843 ksid_t *ksid; 1844 uid_t uid; 1845 gid_t gid = crgetgid(cr); 1846 zfs_acl_ids_t acl_ids; 1847 boolean_t fuid_dirtied; 1848 boolean_t waited = B_FALSE; 1849 1850 ASSERT(vap->va_type == VDIR); 1851 1852 /* 1853 * If we have an ephemeral id, ACL, or XVATTR then 1854 * make sure file system is at proper version 1855 */ 1856 1857 ksid = crgetsid(cr, KSID_OWNER); 1858 if (ksid) 1859 uid = ksid_getid(ksid); 1860 else 1861 uid = crgetuid(cr); 1862 if (zfsvfs->z_use_fuids == B_FALSE && 1863 (vsecp || (vap->va_mask & AT_XVATTR) || 1864 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1865 return (SET_ERROR(EINVAL)); 1866 1867 ZFS_ENTER(zfsvfs); 1868 ZFS_VERIFY_ZP(dzp); 1869 zilog = zfsvfs->z_log; 1870 1871 if (dzp->z_pflags & ZFS_XATTR) { 1872 ZFS_EXIT(zfsvfs); 1873 return (SET_ERROR(EINVAL)); 1874 } 1875 1876 if (zfsvfs->z_utf8 && u8_validate(dirname, 1877 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1878 ZFS_EXIT(zfsvfs); 1879 return (SET_ERROR(EILSEQ)); 1880 } 1881 if (flags & FIGNORECASE) 1882 zf |= ZCILOOK; 1883 1884 if (vap->va_mask & AT_XVATTR) { 1885 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1886 crgetuid(cr), cr, vap->va_type)) != 0) { 1887 ZFS_EXIT(zfsvfs); 1888 return (error); 1889 } 1890 } 1891 1892 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1893 vsecp, &acl_ids)) != 0) { 1894 ZFS_EXIT(zfsvfs); 1895 return (error); 1896 } 1897 /* 1898 * First make sure the new directory doesn't exist. 1899 * 1900 * Existence is checked first to make sure we don't return 1901 * EACCES instead of EEXIST which can cause some applications 1902 * to fail. 1903 */ 1904 top: 1905 *vpp = NULL; 1906 1907 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1908 NULL, NULL)) { 1909 zfs_acl_ids_free(&acl_ids); 1910 ZFS_EXIT(zfsvfs); 1911 return (error); 1912 } 1913 1914 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 1915 zfs_acl_ids_free(&acl_ids); 1916 zfs_dirent_unlock(dl); 1917 ZFS_EXIT(zfsvfs); 1918 return (error); 1919 } 1920 1921 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 1922 zfs_acl_ids_free(&acl_ids); 1923 zfs_dirent_unlock(dl); 1924 ZFS_EXIT(zfsvfs); 1925 return (SET_ERROR(EDQUOT)); 1926 } 1927 1928 /* 1929 * Add a new entry to the directory. 1930 */ 1931 tx = dmu_tx_create(zfsvfs->z_os); 1932 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1933 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1934 fuid_dirtied = zfsvfs->z_fuid_dirty; 1935 if (fuid_dirtied) 1936 zfs_fuid_txhold(zfsvfs, tx); 1937 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1938 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1939 acl_ids.z_aclp->z_acl_bytes); 1940 } 1941 1942 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1943 ZFS_SA_BASE_ATTR_SIZE); 1944 1945 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 1946 if (error) { 1947 zfs_dirent_unlock(dl); 1948 if (error == ERESTART) { 1949 waited = B_TRUE; 1950 dmu_tx_wait(tx); 1951 dmu_tx_abort(tx); 1952 goto top; 1953 } 1954 zfs_acl_ids_free(&acl_ids); 1955 dmu_tx_abort(tx); 1956 ZFS_EXIT(zfsvfs); 1957 return (error); 1958 } 1959 1960 /* 1961 * Create new node. 1962 */ 1963 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1964 1965 if (fuid_dirtied) 1966 zfs_fuid_sync(zfsvfs, tx); 1967 1968 /* 1969 * Now put new name in parent dir. 1970 */ 1971 (void) zfs_link_create(dl, zp, tx, ZNEW); 1972 1973 *vpp = ZTOV(zp); 1974 1975 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1976 if (flags & FIGNORECASE) 1977 txtype |= TX_CI; 1978 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1979 acl_ids.z_fuidp, vap); 1980 1981 zfs_acl_ids_free(&acl_ids); 1982 1983 dmu_tx_commit(tx); 1984 1985 zfs_dirent_unlock(dl); 1986 1987 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1988 zil_commit(zilog, 0); 1989 1990 ZFS_EXIT(zfsvfs); 1991 return (0); 1992 } 1993 1994 /* 1995 * Remove a directory subdir entry. If the current working 1996 * directory is the same as the subdir to be removed, the 1997 * remove will fail. 1998 * 1999 * IN: dvp - vnode of directory to remove from. 2000 * name - name of directory to be removed. 2001 * cwd - vnode of current working directory. 2002 * cr - credentials of caller. 2003 * ct - caller context 2004 * flags - case flags 2005 * 2006 * RETURN: 0 on success, error code on failure. 2007 * 2008 * Timestamps: 2009 * dvp - ctime|mtime updated 2010 */ 2011 /*ARGSUSED*/ 2012 static int 2013 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 2014 caller_context_t *ct, int flags) 2015 { 2016 znode_t *dzp = VTOZ(dvp); 2017 znode_t *zp; 2018 vnode_t *vp; 2019 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2020 zilog_t *zilog; 2021 zfs_dirlock_t *dl; 2022 dmu_tx_t *tx; 2023 int error; 2024 int zflg = ZEXISTS; 2025 boolean_t waited = B_FALSE; 2026 2027 ZFS_ENTER(zfsvfs); 2028 ZFS_VERIFY_ZP(dzp); 2029 zilog = zfsvfs->z_log; 2030 2031 if (flags & FIGNORECASE) 2032 zflg |= ZCILOOK; 2033 top: 2034 zp = NULL; 2035 2036 /* 2037 * Attempt to lock directory; fail if entry doesn't exist. 2038 */ 2039 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 2040 NULL, NULL)) { 2041 ZFS_EXIT(zfsvfs); 2042 return (error); 2043 } 2044 2045 vp = ZTOV(zp); 2046 2047 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2048 goto out; 2049 } 2050 2051 if (vp->v_type != VDIR) { 2052 error = SET_ERROR(ENOTDIR); 2053 goto out; 2054 } 2055 2056 if (vp == cwd) { 2057 error = SET_ERROR(EINVAL); 2058 goto out; 2059 } 2060 2061 vnevent_rmdir(vp, dvp, name, ct); 2062 2063 /* 2064 * Grab a lock on the directory to make sure that noone is 2065 * trying to add (or lookup) entries while we are removing it. 2066 */ 2067 rw_enter(&zp->z_name_lock, RW_WRITER); 2068 2069 /* 2070 * Grab a lock on the parent pointer to make sure we play well 2071 * with the treewalk and directory rename code. 2072 */ 2073 rw_enter(&zp->z_parent_lock, RW_WRITER); 2074 2075 tx = dmu_tx_create(zfsvfs->z_os); 2076 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2077 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2078 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2079 zfs_sa_upgrade_txholds(tx, zp); 2080 zfs_sa_upgrade_txholds(tx, dzp); 2081 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 2082 if (error) { 2083 rw_exit(&zp->z_parent_lock); 2084 rw_exit(&zp->z_name_lock); 2085 zfs_dirent_unlock(dl); 2086 VN_RELE(vp); 2087 if (error == ERESTART) { 2088 waited = B_TRUE; 2089 dmu_tx_wait(tx); 2090 dmu_tx_abort(tx); 2091 goto top; 2092 } 2093 dmu_tx_abort(tx); 2094 ZFS_EXIT(zfsvfs); 2095 return (error); 2096 } 2097 2098 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 2099 2100 if (error == 0) { 2101 uint64_t txtype = TX_RMDIR; 2102 if (flags & FIGNORECASE) 2103 txtype |= TX_CI; 2104 2105 rw_enter(&rz_zev_rwlock, RW_READER); 2106 if (rz_zev_callbacks && rz_zev_callbacks->rz_zev_znode_remove) 2107 rz_zev_callbacks->rz_zev_znode_remove(dzp, zp, tx, 2108 name, txtype); 2109 rw_exit(&rz_zev_rwlock); 2110 2111 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); 2112 } 2113 2114 dmu_tx_commit(tx); 2115 2116 rw_exit(&zp->z_parent_lock); 2117 rw_exit(&zp->z_name_lock); 2118 out: 2119 zfs_dirent_unlock(dl); 2120 2121 VN_RELE(vp); 2122 2123 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2124 zil_commit(zilog, 0); 2125 2126 ZFS_EXIT(zfsvfs); 2127 return (error); 2128 } 2129 2130 /* 2131 * Read as many directory entries as will fit into the provided 2132 * buffer from the given directory cursor position (specified in 2133 * the uio structure). 2134 * 2135 * IN: vp - vnode of directory to read. 2136 * uio - structure supplying read location, range info, 2137 * and return buffer. 2138 * cr - credentials of caller. 2139 * ct - caller context 2140 * flags - case flags 2141 * 2142 * OUT: uio - updated offset and range, buffer filled. 2143 * eofp - set to true if end-of-file detected. 2144 * 2145 * RETURN: 0 on success, error code on failure. 2146 * 2147 * Timestamps: 2148 * vp - atime updated 2149 * 2150 * Note that the low 4 bits of the cookie returned by zap is always zero. 2151 * This allows us to use the low range for "special" directory entries: 2152 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2153 * we use the offset 2 for the '.zfs' directory. 2154 */ 2155 /* ARGSUSED */ 2156 static int 2157 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, 2158 caller_context_t *ct, int flags) 2159 { 2160 znode_t *zp = VTOZ(vp); 2161 iovec_t *iovp; 2162 edirent_t *eodp; 2163 dirent64_t *odp; 2164 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2165 objset_t *os; 2166 caddr_t outbuf; 2167 size_t bufsize; 2168 zap_cursor_t zc; 2169 zap_attribute_t zap; 2170 uint_t bytes_wanted; 2171 uint64_t offset; /* must be unsigned; checks for < 1 */ 2172 uint64_t parent; 2173 int local_eof; 2174 int outcount; 2175 int error; 2176 uint8_t prefetch; 2177 boolean_t check_sysattrs; 2178 2179 ZFS_ENTER(zfsvfs); 2180 ZFS_VERIFY_ZP(zp); 2181 2182 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2183 &parent, sizeof (parent))) != 0) { 2184 ZFS_EXIT(zfsvfs); 2185 return (error); 2186 } 2187 2188 /* 2189 * If we are not given an eof variable, 2190 * use a local one. 2191 */ 2192 if (eofp == NULL) 2193 eofp = &local_eof; 2194 2195 /* 2196 * Check for valid iov_len. 2197 */ 2198 if (uio->uio_iov->iov_len <= 0) { 2199 ZFS_EXIT(zfsvfs); 2200 return (SET_ERROR(EINVAL)); 2201 } 2202 2203 /* 2204 * Quit if directory has been removed (posix) 2205 */ 2206 if ((*eofp = zp->z_unlinked) != 0) { 2207 ZFS_EXIT(zfsvfs); 2208 return (0); 2209 } 2210 2211 error = 0; 2212 os = zfsvfs->z_os; 2213 offset = uio->uio_loffset; 2214 prefetch = zp->z_zn_prefetch; 2215 2216 /* 2217 * Initialize the iterator cursor. 2218 */ 2219 if (offset <= 3) { 2220 /* 2221 * Start iteration from the beginning of the directory. 2222 */ 2223 zap_cursor_init(&zc, os, zp->z_id); 2224 } else { 2225 /* 2226 * The offset is a serialized cursor. 2227 */ 2228 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2229 } 2230 2231 /* 2232 * Get space to change directory entries into fs independent format. 2233 */ 2234 iovp = uio->uio_iov; 2235 bytes_wanted = iovp->iov_len; 2236 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2237 bufsize = bytes_wanted; 2238 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2239 odp = (struct dirent64 *)outbuf; 2240 } else { 2241 bufsize = bytes_wanted; 2242 outbuf = NULL; 2243 odp = (struct dirent64 *)iovp->iov_base; 2244 } 2245 eodp = (struct edirent *)odp; 2246 2247 /* 2248 * If this VFS supports the system attribute view interface; and 2249 * we're looking at an extended attribute directory; and we care 2250 * about normalization conflicts on this vfs; then we must check 2251 * for normalization conflicts with the sysattr name space. 2252 */ 2253 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2254 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2255 (flags & V_RDDIR_ENTFLAGS); 2256 2257 /* 2258 * Transform to file-system independent format 2259 */ 2260 outcount = 0; 2261 while (outcount < bytes_wanted) { 2262 ino64_t objnum; 2263 ushort_t reclen; 2264 off64_t *next = NULL; 2265 2266 /* 2267 * Special case `.', `..', and `.zfs'. 2268 */ 2269 if (offset == 0) { 2270 (void) strcpy(zap.za_name, "."); 2271 zap.za_normalization_conflict = 0; 2272 objnum = zp->z_id; 2273 } else if (offset == 1) { 2274 (void) strcpy(zap.za_name, ".."); 2275 zap.za_normalization_conflict = 0; 2276 objnum = parent; 2277 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2278 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2279 zap.za_normalization_conflict = 0; 2280 objnum = ZFSCTL_INO_ROOT; 2281 } else { 2282 /* 2283 * Grab next entry. 2284 */ 2285 if (error = zap_cursor_retrieve(&zc, &zap)) { 2286 if ((*eofp = (error == ENOENT)) != 0) 2287 break; 2288 else 2289 goto update; 2290 } 2291 2292 if (zap.za_integer_length != 8 || 2293 zap.za_num_integers != 1) { 2294 cmn_err(CE_WARN, "zap_readdir: bad directory " 2295 "entry, obj = %lld, offset = %lld\n", 2296 (u_longlong_t)zp->z_id, 2297 (u_longlong_t)offset); 2298 error = SET_ERROR(ENXIO); 2299 goto update; 2300 } 2301 2302 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2303 /* 2304 * MacOS X can extract the object type here such as: 2305 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2306 */ 2307 2308 if (check_sysattrs && !zap.za_normalization_conflict) { 2309 zap.za_normalization_conflict = 2310 xattr_sysattr_casechk(zap.za_name); 2311 } 2312 } 2313 2314 if (flags & V_RDDIR_ACCFILTER) { 2315 /* 2316 * If we have no access at all, don't include 2317 * this entry in the returned information 2318 */ 2319 znode_t *ezp; 2320 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2321 goto skip_entry; 2322 if (!zfs_has_access(ezp, cr)) { 2323 VN_RELE(ZTOV(ezp)); 2324 goto skip_entry; 2325 } 2326 VN_RELE(ZTOV(ezp)); 2327 } 2328 2329 if (flags & V_RDDIR_ENTFLAGS) 2330 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2331 else 2332 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2333 2334 /* 2335 * Will this entry fit in the buffer? 2336 */ 2337 if (outcount + reclen > bufsize) { 2338 /* 2339 * Did we manage to fit anything in the buffer? 2340 */ 2341 if (!outcount) { 2342 error = SET_ERROR(EINVAL); 2343 goto update; 2344 } 2345 break; 2346 } 2347 if (flags & V_RDDIR_ENTFLAGS) { 2348 /* 2349 * Add extended flag entry: 2350 */ 2351 eodp->ed_ino = objnum; 2352 eodp->ed_reclen = reclen; 2353 /* NOTE: ed_off is the offset for the *next* entry */ 2354 next = &(eodp->ed_off); 2355 eodp->ed_eflags = zap.za_normalization_conflict ? 2356 ED_CASE_CONFLICT : 0; 2357 (void) strncpy(eodp->ed_name, zap.za_name, 2358 EDIRENT_NAMELEN(reclen)); 2359 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2360 } else { 2361 /* 2362 * Add normal entry: 2363 */ 2364 odp->d_ino = objnum; 2365 odp->d_reclen = reclen; 2366 /* NOTE: d_off is the offset for the *next* entry */ 2367 next = &(odp->d_off); 2368 (void) strncpy(odp->d_name, zap.za_name, 2369 DIRENT64_NAMELEN(reclen)); 2370 odp = (dirent64_t *)((intptr_t)odp + reclen); 2371 } 2372 outcount += reclen; 2373 2374 ASSERT(outcount <= bufsize); 2375 2376 /* Prefetch znode */ 2377 if (prefetch) 2378 dmu_prefetch(os, objnum, 0, 0); 2379 2380 skip_entry: 2381 /* 2382 * Move to the next entry, fill in the previous offset. 2383 */ 2384 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2385 zap_cursor_advance(&zc); 2386 offset = zap_cursor_serialize(&zc); 2387 } else { 2388 offset += 1; 2389 } 2390 if (next) 2391 *next = offset; 2392 } 2393 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2394 2395 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2396 iovp->iov_base += outcount; 2397 iovp->iov_len -= outcount; 2398 uio->uio_resid -= outcount; 2399 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2400 /* 2401 * Reset the pointer. 2402 */ 2403 offset = uio->uio_loffset; 2404 } 2405 2406 update: 2407 zap_cursor_fini(&zc); 2408 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2409 kmem_free(outbuf, bufsize); 2410 2411 if (error == ENOENT) 2412 error = 0; 2413 2414 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2415 2416 uio->uio_loffset = offset; 2417 ZFS_EXIT(zfsvfs); 2418 return (error); 2419 } 2420 2421 ulong_t zfs_fsync_sync_cnt = 4; 2422 2423 static int 2424 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2425 { 2426 znode_t *zp = VTOZ(vp); 2427 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2428 2429 /* 2430 * Regardless of whether this is required for standards conformance, 2431 * this is the logical behavior when fsync() is called on a file with 2432 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2433 * going to be pushed out as part of the zil_commit(). 2434 */ 2435 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2436 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2437 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2438 2439 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2440 2441 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2442 ZFS_ENTER(zfsvfs); 2443 ZFS_VERIFY_ZP(zp); 2444 zil_commit(zfsvfs->z_log, zp->z_id); 2445 ZFS_EXIT(zfsvfs); 2446 } 2447 return (0); 2448 } 2449 2450 2451 /* 2452 * Get the requested file attributes and place them in the provided 2453 * vattr structure. 2454 * 2455 * IN: vp - vnode of file. 2456 * vap - va_mask identifies requested attributes. 2457 * If AT_XVATTR set, then optional attrs are requested 2458 * flags - ATTR_NOACLCHECK (CIFS server context) 2459 * cr - credentials of caller. 2460 * ct - caller context 2461 * 2462 * OUT: vap - attribute values. 2463 * 2464 * RETURN: 0 (always succeeds). 2465 */ 2466 /* ARGSUSED */ 2467 static int 2468 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2469 caller_context_t *ct) 2470 { 2471 znode_t *zp = VTOZ(vp); 2472 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2473 int error = 0; 2474 uint64_t links; 2475 uint64_t mtime[2], ctime[2]; 2476 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2477 xoptattr_t *xoap = NULL; 2478 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2479 sa_bulk_attr_t bulk[2]; 2480 int count = 0; 2481 2482 ZFS_ENTER(zfsvfs); 2483 ZFS_VERIFY_ZP(zp); 2484 2485 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2486 2487 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2488 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2489 2490 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2491 ZFS_EXIT(zfsvfs); 2492 return (error); 2493 } 2494 2495 /* 2496 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2497 * Also, if we are the owner don't bother, since owner should 2498 * always be allowed to read basic attributes of file. 2499 */ 2500 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2501 (vap->va_uid != crgetuid(cr))) { 2502 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2503 skipaclchk, cr)) { 2504 ZFS_EXIT(zfsvfs); 2505 return (error); 2506 } 2507 } 2508 2509 /* 2510 * Return all attributes. It's cheaper to provide the answer 2511 * than to determine whether we were asked the question. 2512 */ 2513 2514 mutex_enter(&zp->z_lock); 2515 vap->va_type = vp->v_type; 2516 vap->va_mode = zp->z_mode & MODEMASK; 2517 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2518 vap->va_nodeid = zp->z_id; 2519 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2520 links = zp->z_links + 1; 2521 else 2522 links = zp->z_links; 2523 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2524 vap->va_size = zp->z_size; 2525 vap->va_rdev = vp->v_rdev; 2526 vap->va_seq = zp->z_seq; 2527 2528 /* 2529 * Add in any requested optional attributes and the create time. 2530 * Also set the corresponding bits in the returned attribute bitmap. 2531 */ 2532 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2533 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2534 xoap->xoa_archive = 2535 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2536 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2537 } 2538 2539 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2540 xoap->xoa_readonly = 2541 ((zp->z_pflags & ZFS_READONLY) != 0); 2542 XVA_SET_RTN(xvap, XAT_READONLY); 2543 } 2544 2545 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2546 xoap->xoa_system = 2547 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2548 XVA_SET_RTN(xvap, XAT_SYSTEM); 2549 } 2550 2551 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2552 xoap->xoa_hidden = 2553 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2554 XVA_SET_RTN(xvap, XAT_HIDDEN); 2555 } 2556 2557 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2558 xoap->xoa_nounlink = 2559 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2560 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2561 } 2562 2563 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2564 xoap->xoa_immutable = 2565 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2566 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2567 } 2568 2569 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2570 xoap->xoa_appendonly = 2571 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2572 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2573 } 2574 2575 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2576 xoap->xoa_nodump = 2577 ((zp->z_pflags & ZFS_NODUMP) != 0); 2578 XVA_SET_RTN(xvap, XAT_NODUMP); 2579 } 2580 2581 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2582 xoap->xoa_opaque = 2583 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2584 XVA_SET_RTN(xvap, XAT_OPAQUE); 2585 } 2586 2587 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2588 xoap->xoa_av_quarantined = 2589 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2590 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2591 } 2592 2593 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2594 xoap->xoa_av_modified = 2595 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2596 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2597 } 2598 2599 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2600 vp->v_type == VREG) { 2601 zfs_sa_get_scanstamp(zp, xvap); 2602 } 2603 2604 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2605 uint64_t times[2]; 2606 2607 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 2608 times, sizeof (times)); 2609 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 2610 XVA_SET_RTN(xvap, XAT_CREATETIME); 2611 } 2612 2613 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2614 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2615 XVA_SET_RTN(xvap, XAT_REPARSE); 2616 } 2617 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2618 xoap->xoa_generation = zp->z_gen; 2619 XVA_SET_RTN(xvap, XAT_GEN); 2620 } 2621 2622 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2623 xoap->xoa_offline = 2624 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2625 XVA_SET_RTN(xvap, XAT_OFFLINE); 2626 } 2627 2628 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2629 xoap->xoa_sparse = 2630 ((zp->z_pflags & ZFS_SPARSE) != 0); 2631 XVA_SET_RTN(xvap, XAT_SPARSE); 2632 } 2633 } 2634 2635 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 2636 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 2637 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 2638 2639 mutex_exit(&zp->z_lock); 2640 2641 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); 2642 2643 if (zp->z_blksz == 0) { 2644 /* 2645 * Block size hasn't been set; suggest maximal I/O transfers. 2646 */ 2647 vap->va_blksize = zfsvfs->z_max_blksz; 2648 } 2649 2650 ZFS_EXIT(zfsvfs); 2651 return (0); 2652 } 2653 2654 /* 2655 * Set the file attributes to the values contained in the 2656 * vattr structure. 2657 * 2658 * IN: vp - vnode of file to be modified. 2659 * vap - new attribute values. 2660 * If AT_XVATTR set, then optional attrs are being set 2661 * flags - ATTR_UTIME set if non-default time values provided. 2662 * - ATTR_NOACLCHECK (CIFS context only). 2663 * cr - credentials of caller. 2664 * ct - caller context 2665 * 2666 * RETURN: 0 on success, error code on failure. 2667 * 2668 * Timestamps: 2669 * vp - ctime updated, mtime updated if size changed. 2670 */ 2671 /* ARGSUSED */ 2672 static int 2673 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2674 caller_context_t *ct) 2675 { 2676 znode_t *zp = VTOZ(vp); 2677 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2678 zilog_t *zilog; 2679 dmu_tx_t *tx; 2680 vattr_t oldva; 2681 xvattr_t tmpxvattr; 2682 uint_t mask = vap->va_mask; 2683 uint_t saved_mask = 0; 2684 int trim_mask = 0; 2685 uint64_t new_mode; 2686 uint64_t new_uid, new_gid; 2687 uint64_t xattr_obj; 2688 uint64_t mtime[2], ctime[2]; 2689 znode_t *attrzp; 2690 int need_policy = FALSE; 2691 int err, err2; 2692 zfs_fuid_info_t *fuidp = NULL; 2693 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2694 xoptattr_t *xoap; 2695 zfs_acl_t *aclp; 2696 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2697 boolean_t fuid_dirtied = B_FALSE; 2698 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 2699 int count = 0, xattr_count = 0; 2700 2701 if (mask == 0) 2702 return (0); 2703 2704 if (mask & AT_NOSET) 2705 return (SET_ERROR(EINVAL)); 2706 2707 ZFS_ENTER(zfsvfs); 2708 ZFS_VERIFY_ZP(zp); 2709 2710 zilog = zfsvfs->z_log; 2711 2712 /* 2713 * Make sure that if we have ephemeral uid/gid or xvattr specified 2714 * that file system is at proper version level 2715 */ 2716 2717 if (zfsvfs->z_use_fuids == B_FALSE && 2718 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 2719 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 2720 (mask & AT_XVATTR))) { 2721 ZFS_EXIT(zfsvfs); 2722 return (SET_ERROR(EINVAL)); 2723 } 2724 2725 if (mask & AT_SIZE && vp->v_type == VDIR) { 2726 ZFS_EXIT(zfsvfs); 2727 return (SET_ERROR(EISDIR)); 2728 } 2729 2730 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 2731 ZFS_EXIT(zfsvfs); 2732 return (SET_ERROR(EINVAL)); 2733 } 2734 2735 /* 2736 * If this is an xvattr_t, then get a pointer to the structure of 2737 * optional attributes. If this is NULL, then we have a vattr_t. 2738 */ 2739 xoap = xva_getxoptattr(xvap); 2740 2741 xva_init(&tmpxvattr); 2742 2743 /* 2744 * Immutable files can only alter immutable bit and atime 2745 */ 2746 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2747 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 2748 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2749 ZFS_EXIT(zfsvfs); 2750 return (SET_ERROR(EPERM)); 2751 } 2752 2753 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 2754 ZFS_EXIT(zfsvfs); 2755 return (SET_ERROR(EPERM)); 2756 } 2757 2758 /* 2759 * Verify timestamps doesn't overflow 32 bits. 2760 * ZFS can handle large timestamps, but 32bit syscalls can't 2761 * handle times greater than 2039. This check should be removed 2762 * once large timestamps are fully supported. 2763 */ 2764 if (mask & (AT_ATIME | AT_MTIME)) { 2765 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2766 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2767 ZFS_EXIT(zfsvfs); 2768 return (SET_ERROR(EOVERFLOW)); 2769 } 2770 } 2771 2772 top: 2773 attrzp = NULL; 2774 aclp = NULL; 2775 2776 /* Can this be moved to before the top label? */ 2777 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2778 ZFS_EXIT(zfsvfs); 2779 return (SET_ERROR(EROFS)); 2780 } 2781 2782 /* 2783 * First validate permissions 2784 */ 2785 2786 if (mask & AT_SIZE) { 2787 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2788 if (err) { 2789 ZFS_EXIT(zfsvfs); 2790 return (err); 2791 } 2792 /* 2793 * XXX - Note, we are not providing any open 2794 * mode flags here (like FNDELAY), so we may 2795 * block if there are locks present... this 2796 * should be addressed in openat(). 2797 */ 2798 /* XXX - would it be OK to generate a log record here? */ 2799 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2800 if (err) { 2801 ZFS_EXIT(zfsvfs); 2802 return (err); 2803 } 2804 2805 if (vap->va_size == 0) 2806 vnevent_truncate(ZTOV(zp), ct); 2807 } 2808 2809 if (mask & (AT_ATIME|AT_MTIME) || 2810 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2811 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2812 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2813 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2814 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2815 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2816 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2817 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2818 skipaclchk, cr); 2819 } 2820 2821 if (mask & (AT_UID|AT_GID)) { 2822 int idmask = (mask & (AT_UID|AT_GID)); 2823 int take_owner; 2824 int take_group; 2825 2826 /* 2827 * NOTE: even if a new mode is being set, 2828 * we may clear S_ISUID/S_ISGID bits. 2829 */ 2830 2831 if (!(mask & AT_MODE)) 2832 vap->va_mode = zp->z_mode; 2833 2834 /* 2835 * Take ownership or chgrp to group we are a member of 2836 */ 2837 2838 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2839 take_group = (mask & AT_GID) && 2840 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2841 2842 /* 2843 * If both AT_UID and AT_GID are set then take_owner and 2844 * take_group must both be set in order to allow taking 2845 * ownership. 2846 * 2847 * Otherwise, send the check through secpolicy_vnode_setattr() 2848 * 2849 */ 2850 2851 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2852 ((idmask == AT_UID) && take_owner) || 2853 ((idmask == AT_GID) && take_group)) { 2854 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2855 skipaclchk, cr) == 0) { 2856 /* 2857 * Remove setuid/setgid for non-privileged users 2858 */ 2859 secpolicy_setid_clear(vap, cr); 2860 trim_mask = (mask & (AT_UID|AT_GID)); 2861 } else { 2862 need_policy = TRUE; 2863 } 2864 } else { 2865 need_policy = TRUE; 2866 } 2867 } 2868 2869 mutex_enter(&zp->z_lock); 2870 oldva.va_mode = zp->z_mode; 2871 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2872 if (mask & AT_XVATTR) { 2873 /* 2874 * Update xvattr mask to include only those attributes 2875 * that are actually changing. 2876 * 2877 * the bits will be restored prior to actually setting 2878 * the attributes so the caller thinks they were set. 2879 */ 2880 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2881 if (xoap->xoa_appendonly != 2882 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2883 need_policy = TRUE; 2884 } else { 2885 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2886 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 2887 } 2888 } 2889 2890 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2891 if (xoap->xoa_nounlink != 2892 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2893 need_policy = TRUE; 2894 } else { 2895 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2896 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 2897 } 2898 } 2899 2900 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2901 if (xoap->xoa_immutable != 2902 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2903 need_policy = TRUE; 2904 } else { 2905 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2906 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 2907 } 2908 } 2909 2910 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2911 if (xoap->xoa_nodump != 2912 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2913 need_policy = TRUE; 2914 } else { 2915 XVA_CLR_REQ(xvap, XAT_NODUMP); 2916 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 2917 } 2918 } 2919 2920 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2921 if (xoap->xoa_av_modified != 2922 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2923 need_policy = TRUE; 2924 } else { 2925 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2926 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 2927 } 2928 } 2929 2930 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2931 if ((vp->v_type != VREG && 2932 xoap->xoa_av_quarantined) || 2933 xoap->xoa_av_quarantined != 2934 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2935 need_policy = TRUE; 2936 } else { 2937 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2938 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 2939 } 2940 } 2941 2942 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2943 mutex_exit(&zp->z_lock); 2944 ZFS_EXIT(zfsvfs); 2945 return (SET_ERROR(EPERM)); 2946 } 2947 2948 if (need_policy == FALSE && 2949 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2950 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2951 need_policy = TRUE; 2952 } 2953 } 2954 2955 mutex_exit(&zp->z_lock); 2956 2957 if (mask & AT_MODE) { 2958 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2959 err = secpolicy_setid_setsticky_clear(vp, vap, 2960 &oldva, cr); 2961 if (err) { 2962 ZFS_EXIT(zfsvfs); 2963 return (err); 2964 } 2965 trim_mask |= AT_MODE; 2966 } else { 2967 need_policy = TRUE; 2968 } 2969 } 2970 2971 if (need_policy) { 2972 /* 2973 * If trim_mask is set then take ownership 2974 * has been granted or write_acl is present and user 2975 * has the ability to modify mode. In that case remove 2976 * UID|GID and or MODE from mask so that 2977 * secpolicy_vnode_setattr() doesn't revoke it. 2978 */ 2979 2980 if (trim_mask) { 2981 saved_mask = vap->va_mask; 2982 vap->va_mask &= ~trim_mask; 2983 } 2984 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2985 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2986 if (err) { 2987 ZFS_EXIT(zfsvfs); 2988 return (err); 2989 } 2990 2991 if (trim_mask) 2992 vap->va_mask |= saved_mask; 2993 } 2994 2995 /* 2996 * secpolicy_vnode_setattr, or take ownership may have 2997 * changed va_mask 2998 */ 2999 mask = vap->va_mask; 3000 3001 if ((mask & (AT_UID | AT_GID))) { 3002 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3003 &xattr_obj, sizeof (xattr_obj)); 3004 3005 if (err == 0 && xattr_obj) { 3006 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3007 if (err) 3008 goto out2; 3009 } 3010 if (mask & AT_UID) { 3011 new_uid = zfs_fuid_create(zfsvfs, 3012 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3013 if (new_uid != zp->z_uid && 3014 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3015 if (attrzp) 3016 VN_RELE(ZTOV(attrzp)); 3017 err = SET_ERROR(EDQUOT); 3018 goto out2; 3019 } 3020 } 3021 3022 if (mask & AT_GID) { 3023 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3024 cr, ZFS_GROUP, &fuidp); 3025 if (new_gid != zp->z_gid && 3026 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3027 if (attrzp) 3028 VN_RELE(ZTOV(attrzp)); 3029 err = SET_ERROR(EDQUOT); 3030 goto out2; 3031 } 3032 } 3033 } 3034 tx = dmu_tx_create(zfsvfs->z_os); 3035 3036 if (mask & AT_MODE) { 3037 uint64_t pmode = zp->z_mode; 3038 uint64_t acl_obj; 3039 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3040 3041 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3042 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3043 err = SET_ERROR(EPERM); 3044 goto out; 3045 } 3046 3047 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3048 goto out; 3049 3050 mutex_enter(&zp->z_lock); 3051 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3052 /* 3053 * Are we upgrading ACL from old V0 format 3054 * to V1 format? 3055 */ 3056 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3057 zfs_znode_acl_version(zp) == 3058 ZFS_ACL_VERSION_INITIAL) { 3059 dmu_tx_hold_free(tx, acl_obj, 0, 3060 DMU_OBJECT_END); 3061 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3062 0, aclp->z_acl_bytes); 3063 } else { 3064 dmu_tx_hold_write(tx, acl_obj, 0, 3065 aclp->z_acl_bytes); 3066 } 3067 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3068 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3069 0, aclp->z_acl_bytes); 3070 } 3071 mutex_exit(&zp->z_lock); 3072 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3073 } else { 3074 if ((mask & AT_XVATTR) && 3075 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3076 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3077 else 3078 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3079 } 3080 3081 if (attrzp) { 3082 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3083 } 3084 3085 fuid_dirtied = zfsvfs->z_fuid_dirty; 3086 if (fuid_dirtied) 3087 zfs_fuid_txhold(zfsvfs, tx); 3088 3089 zfs_sa_upgrade_txholds(tx, zp); 3090 3091 err = dmu_tx_assign(tx, TXG_WAIT); 3092 if (err) 3093 goto out; 3094 3095 count = 0; 3096 /* 3097 * Set each attribute requested. 3098 * We group settings according to the locks they need to acquire. 3099 * 3100 * Note: you cannot set ctime directly, although it will be 3101 * updated as a side-effect of calling this function. 3102 */ 3103 3104 3105 if (mask & (AT_UID|AT_GID|AT_MODE)) 3106 mutex_enter(&zp->z_acl_lock); 3107 mutex_enter(&zp->z_lock); 3108 3109 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3110 &zp->z_pflags, sizeof (zp->z_pflags)); 3111 3112 if (attrzp) { 3113 if (mask & (AT_UID|AT_GID|AT_MODE)) 3114 mutex_enter(&attrzp->z_acl_lock); 3115 mutex_enter(&attrzp->z_lock); 3116 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3117 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3118 sizeof (attrzp->z_pflags)); 3119 } 3120 3121 if (mask & (AT_UID|AT_GID)) { 3122 3123 if (mask & AT_UID) { 3124 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3125 &new_uid, sizeof (new_uid)); 3126 zp->z_uid = new_uid; 3127 if (attrzp) { 3128 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3129 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3130 sizeof (new_uid)); 3131 attrzp->z_uid = new_uid; 3132 } 3133 } 3134 3135 if (mask & AT_GID) { 3136 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3137 NULL, &new_gid, sizeof (new_gid)); 3138 zp->z_gid = new_gid; 3139 if (attrzp) { 3140 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3141 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3142 sizeof (new_gid)); 3143 attrzp->z_gid = new_gid; 3144 } 3145 } 3146 if (!(mask & AT_MODE)) { 3147 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3148 NULL, &new_mode, sizeof (new_mode)); 3149 new_mode = zp->z_mode; 3150 } 3151 err = zfs_acl_chown_setattr(zp); 3152 ASSERT(err == 0); 3153 if (attrzp) { 3154 err = zfs_acl_chown_setattr(attrzp); 3155 ASSERT(err == 0); 3156 } 3157 } 3158 3159 if (mask & AT_MODE) { 3160 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3161 &new_mode, sizeof (new_mode)); 3162 zp->z_mode = new_mode; 3163 ASSERT3U((uintptr_t)aclp, !=, NULL); 3164 err = zfs_aclset_common(zp, aclp, cr, tx); 3165 ASSERT0(err); 3166 if (zp->z_acl_cached) 3167 zfs_acl_free(zp->z_acl_cached); 3168 zp->z_acl_cached = aclp; 3169 aclp = NULL; 3170 } 3171 3172 3173 if (mask & AT_ATIME) { 3174 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3175 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3176 &zp->z_atime, sizeof (zp->z_atime)); 3177 } 3178 3179 if (mask & AT_MTIME) { 3180 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3181 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3182 mtime, sizeof (mtime)); 3183 } 3184 3185 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3186 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3187 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3188 NULL, mtime, sizeof (mtime)); 3189 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3190 &ctime, sizeof (ctime)); 3191 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3192 B_TRUE); 3193 } else if (mask != 0) { 3194 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3195 &ctime, sizeof (ctime)); 3196 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3197 B_TRUE); 3198 if (attrzp) { 3199 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3200 SA_ZPL_CTIME(zfsvfs), NULL, 3201 &ctime, sizeof (ctime)); 3202 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3203 mtime, ctime, B_TRUE); 3204 } 3205 } 3206 /* 3207 * Do this after setting timestamps to prevent timestamp 3208 * update from toggling bit 3209 */ 3210 3211 if (xoap && (mask & AT_XVATTR)) { 3212 3213 /* 3214 * restore trimmed off masks 3215 * so that return masks can be set for caller. 3216 */ 3217 3218 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3219 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3220 } 3221 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3222 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3223 } 3224 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3225 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3226 } 3227 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3228 XVA_SET_REQ(xvap, XAT_NODUMP); 3229 } 3230 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3231 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3232 } 3233 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3234 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3235 } 3236 3237 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3238 ASSERT(vp->v_type == VREG); 3239 3240 zfs_xvattr_set(zp, xvap, tx); 3241 } 3242 3243 if (fuid_dirtied) 3244 zfs_fuid_sync(zfsvfs, tx); 3245 3246 if (mask != 0) 3247 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3248 3249 mutex_exit(&zp->z_lock); 3250 if (mask & (AT_UID|AT_GID|AT_MODE)) 3251 mutex_exit(&zp->z_acl_lock); 3252 3253 if (attrzp) { 3254 if (mask & (AT_UID|AT_GID|AT_MODE)) 3255 mutex_exit(&attrzp->z_acl_lock); 3256 mutex_exit(&attrzp->z_lock); 3257 } 3258 out: 3259 if (err == 0 && attrzp) { 3260 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3261 xattr_count, tx); 3262 ASSERT(err2 == 0); 3263 } 3264 3265 if (attrzp) 3266 VN_RELE(ZTOV(attrzp)); 3267 3268 if (aclp) 3269 zfs_acl_free(aclp); 3270 3271 if (fuidp) { 3272 zfs_fuid_info_free(fuidp); 3273 fuidp = NULL; 3274 } 3275 3276 if (err) { 3277 dmu_tx_abort(tx); 3278 if (err == ERESTART) 3279 goto top; 3280 } else { 3281 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3282 rw_enter(&rz_zev_rwlock, RW_READER); 3283 if (rz_zev_callbacks && rz_zev_callbacks->rz_zev_znode_setattr) 3284 rz_zev_callbacks->rz_zev_znode_setattr(zp, tx); 3285 rw_exit(&rz_zev_rwlock); 3286 dmu_tx_commit(tx); 3287 } 3288 3289 out2: 3290 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3291 zil_commit(zilog, 0); 3292 3293 ZFS_EXIT(zfsvfs); 3294 return (err); 3295 } 3296 3297 typedef struct zfs_zlock { 3298 krwlock_t *zl_rwlock; /* lock we acquired */ 3299 znode_t *zl_znode; /* znode we held */ 3300 struct zfs_zlock *zl_next; /* next in list */ 3301 } zfs_zlock_t; 3302 3303 /* 3304 * Drop locks and release vnodes that were held by zfs_rename_lock(). 3305 */ 3306 static void 3307 zfs_rename_unlock(zfs_zlock_t **zlpp) 3308 { 3309 zfs_zlock_t *zl; 3310 3311 while ((zl = *zlpp) != NULL) { 3312 if (zl->zl_znode != NULL) 3313 VN_RELE(ZTOV(zl->zl_znode)); 3314 rw_exit(zl->zl_rwlock); 3315 *zlpp = zl->zl_next; 3316 kmem_free(zl, sizeof (*zl)); 3317 } 3318 } 3319 3320 /* 3321 * Search back through the directory tree, using the ".." entries. 3322 * Lock each directory in the chain to prevent concurrent renames. 3323 * Fail any attempt to move a directory into one of its own descendants. 3324 * XXX - z_parent_lock can overlap with map or grow locks 3325 */ 3326 static int 3327 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3328 { 3329 zfs_zlock_t *zl; 3330 znode_t *zp = tdzp; 3331 uint64_t rootid = zp->z_zfsvfs->z_root; 3332 uint64_t oidp = zp->z_id; 3333 krwlock_t *rwlp = &szp->z_parent_lock; 3334 krw_t rw = RW_WRITER; 3335 3336 /* 3337 * First pass write-locks szp and compares to zp->z_id. 3338 * Later passes read-lock zp and compare to zp->z_parent. 3339 */ 3340 do { 3341 if (!rw_tryenter(rwlp, rw)) { 3342 /* 3343 * Another thread is renaming in this path. 3344 * Note that if we are a WRITER, we don't have any 3345 * parent_locks held yet. 3346 */ 3347 if (rw == RW_READER && zp->z_id > szp->z_id) { 3348 /* 3349 * Drop our locks and restart 3350 */ 3351 zfs_rename_unlock(&zl); 3352 *zlpp = NULL; 3353 zp = tdzp; 3354 oidp = zp->z_id; 3355 rwlp = &szp->z_parent_lock; 3356 rw = RW_WRITER; 3357 continue; 3358 } else { 3359 /* 3360 * Wait for other thread to drop its locks 3361 */ 3362 rw_enter(rwlp, rw); 3363 } 3364 } 3365 3366 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3367 zl->zl_rwlock = rwlp; 3368 zl->zl_znode = NULL; 3369 zl->zl_next = *zlpp; 3370 *zlpp = zl; 3371 3372 if (oidp == szp->z_id) /* We're a descendant of szp */ 3373 return (SET_ERROR(EINVAL)); 3374 3375 if (oidp == rootid) /* We've hit the top */ 3376 return (0); 3377 3378 if (rw == RW_READER) { /* i.e. not the first pass */ 3379 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3380 if (error) 3381 return (error); 3382 zl->zl_znode = zp; 3383 } 3384 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3385 &oidp, sizeof (oidp)); 3386 rwlp = &zp->z_parent_lock; 3387 rw = RW_READER; 3388 3389 } while (zp->z_id != sdzp->z_id); 3390 3391 return (0); 3392 } 3393 3394 /* 3395 * Move an entry from the provided source directory to the target 3396 * directory. Change the entry name as indicated. 3397 * 3398 * IN: sdvp - Source directory containing the "old entry". 3399 * snm - Old entry name. 3400 * tdvp - Target directory to contain the "new entry". 3401 * tnm - New entry name. 3402 * cr - credentials of caller. 3403 * ct - caller context 3404 * flags - case flags 3405 * 3406 * RETURN: 0 on success, error code on failure. 3407 * 3408 * Timestamps: 3409 * sdvp,tdvp - ctime|mtime updated 3410 */ 3411 /*ARGSUSED*/ 3412 static int 3413 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3414 caller_context_t *ct, int flags) 3415 { 3416 znode_t *tdzp, *szp, *tzp; 3417 znode_t *sdzp = VTOZ(sdvp); 3418 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3419 zilog_t *zilog; 3420 vnode_t *realvp; 3421 zfs_dirlock_t *sdl, *tdl; 3422 dmu_tx_t *tx; 3423 zfs_zlock_t *zl; 3424 int cmp, serr, terr; 3425 int error = 0; 3426 int zflg = 0; 3427 boolean_t waited = B_FALSE; 3428 3429 ZFS_ENTER(zfsvfs); 3430 ZFS_VERIFY_ZP(sdzp); 3431 zilog = zfsvfs->z_log; 3432 3433 /* 3434 * Make sure we have the real vp for the target directory. 3435 */ 3436 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3437 tdvp = realvp; 3438 3439 tdzp = VTOZ(tdvp); 3440 ZFS_VERIFY_ZP(tdzp); 3441 3442 /* 3443 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3444 * ctldir appear to have the same v_vfsp. 3445 */ 3446 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3447 ZFS_EXIT(zfsvfs); 3448 return (SET_ERROR(EXDEV)); 3449 } 3450 3451 if (zfsvfs->z_utf8 && u8_validate(tnm, 3452 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3453 ZFS_EXIT(zfsvfs); 3454 return (SET_ERROR(EILSEQ)); 3455 } 3456 3457 if (flags & FIGNORECASE) 3458 zflg |= ZCILOOK; 3459 3460 top: 3461 szp = NULL; 3462 tzp = NULL; 3463 zl = NULL; 3464 3465 /* 3466 * This is to prevent the creation of links into attribute space 3467 * by renaming a linked file into/outof an attribute directory. 3468 * See the comment in zfs_link() for why this is considered bad. 3469 */ 3470 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3471 ZFS_EXIT(zfsvfs); 3472 return (SET_ERROR(EINVAL)); 3473 } 3474 3475 /* 3476 * Lock source and target directory entries. To prevent deadlock, 3477 * a lock ordering must be defined. We lock the directory with 3478 * the smallest object id first, or if it's a tie, the one with 3479 * the lexically first name. 3480 */ 3481 if (sdzp->z_id < tdzp->z_id) { 3482 cmp = -1; 3483 } else if (sdzp->z_id > tdzp->z_id) { 3484 cmp = 1; 3485 } else { 3486 /* 3487 * First compare the two name arguments without 3488 * considering any case folding. 3489 */ 3490 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3491 3492 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3493 ASSERT(error == 0 || !zfsvfs->z_utf8); 3494 if (cmp == 0) { 3495 /* 3496 * POSIX: "If the old argument and the new argument 3497 * both refer to links to the same existing file, 3498 * the rename() function shall return successfully 3499 * and perform no other action." 3500 */ 3501 ZFS_EXIT(zfsvfs); 3502 return (0); 3503 } 3504 /* 3505 * If the file system is case-folding, then we may 3506 * have some more checking to do. A case-folding file 3507 * system is either supporting mixed case sensitivity 3508 * access or is completely case-insensitive. Note 3509 * that the file system is always case preserving. 3510 * 3511 * In mixed sensitivity mode case sensitive behavior 3512 * is the default. FIGNORECASE must be used to 3513 * explicitly request case insensitive behavior. 3514 * 3515 * If the source and target names provided differ only 3516 * by case (e.g., a request to rename 'tim' to 'Tim'), 3517 * we will treat this as a special case in the 3518 * case-insensitive mode: as long as the source name 3519 * is an exact match, we will allow this to proceed as 3520 * a name-change request. 3521 */ 3522 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3523 (zfsvfs->z_case == ZFS_CASE_MIXED && 3524 flags & FIGNORECASE)) && 3525 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3526 &error) == 0) { 3527 /* 3528 * case preserving rename request, require exact 3529 * name matches 3530 */ 3531 zflg |= ZCIEXACT; 3532 zflg &= ~ZCILOOK; 3533 } 3534 } 3535 3536 /* 3537 * If the source and destination directories are the same, we should 3538 * grab the z_name_lock of that directory only once. 3539 */ 3540 if (sdzp == tdzp) { 3541 zflg |= ZHAVELOCK; 3542 rw_enter(&sdzp->z_name_lock, RW_READER); 3543 } 3544 3545 if (cmp < 0) { 3546 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3547 ZEXISTS | zflg, NULL, NULL); 3548 terr = zfs_dirent_lock(&tdl, 3549 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3550 } else { 3551 terr = zfs_dirent_lock(&tdl, 3552 tdzp, tnm, &tzp, zflg, NULL, NULL); 3553 serr = zfs_dirent_lock(&sdl, 3554 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3555 NULL, NULL); 3556 } 3557 3558 if (serr) { 3559 /* 3560 * Source entry invalid or not there. 3561 */ 3562 if (!terr) { 3563 zfs_dirent_unlock(tdl); 3564 if (tzp) 3565 VN_RELE(ZTOV(tzp)); 3566 } 3567 3568 if (sdzp == tdzp) 3569 rw_exit(&sdzp->z_name_lock); 3570 3571 if (strcmp(snm, "..") == 0) 3572 serr = SET_ERROR(EINVAL); 3573 ZFS_EXIT(zfsvfs); 3574 return (serr); 3575 } 3576 if (terr) { 3577 zfs_dirent_unlock(sdl); 3578 VN_RELE(ZTOV(szp)); 3579 3580 if (sdzp == tdzp) 3581 rw_exit(&sdzp->z_name_lock); 3582 3583 if (strcmp(tnm, "..") == 0) 3584 terr = SET_ERROR(EINVAL); 3585 ZFS_EXIT(zfsvfs); 3586 return (terr); 3587 } 3588 3589 /* 3590 * Must have write access at the source to remove the old entry 3591 * and write access at the target to create the new entry. 3592 * Note that if target and source are the same, this can be 3593 * done in a single check. 3594 */ 3595 3596 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3597 goto out; 3598 3599 if (ZTOV(szp)->v_type == VDIR) { 3600 /* 3601 * Check to make sure rename is valid. 3602 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3603 */ 3604 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3605 goto out; 3606 } 3607 3608 /* 3609 * Does target exist? 3610 */ 3611 if (tzp) { 3612 /* 3613 * Source and target must be the same type. 3614 */ 3615 if (ZTOV(szp)->v_type == VDIR) { 3616 if (ZTOV(tzp)->v_type != VDIR) { 3617 error = SET_ERROR(ENOTDIR); 3618 goto out; 3619 } 3620 } else { 3621 if (ZTOV(tzp)->v_type == VDIR) { 3622 error = SET_ERROR(EISDIR); 3623 goto out; 3624 } 3625 } 3626 /* 3627 * POSIX dictates that when the source and target 3628 * entries refer to the same file object, rename 3629 * must do nothing and exit without error. 3630 */ 3631 if (szp->z_id == tzp->z_id) { 3632 error = 0; 3633 goto out; 3634 } 3635 } 3636 3637 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 3638 if (tzp) 3639 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3640 3641 /* 3642 * notify the target directory if it is not the same 3643 * as source directory. 3644 */ 3645 if (tdvp != sdvp) { 3646 vnevent_rename_dest_dir(tdvp, ct); 3647 } 3648 3649 tx = dmu_tx_create(zfsvfs->z_os); 3650 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3651 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3652 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3653 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3654 if (sdzp != tdzp) { 3655 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3656 zfs_sa_upgrade_txholds(tx, tdzp); 3657 } 3658 if (tzp) { 3659 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3660 zfs_sa_upgrade_txholds(tx, tzp); 3661 } 3662 3663 zfs_sa_upgrade_txholds(tx, szp); 3664 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3665 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 3666 if (error) { 3667 if (zl != NULL) 3668 zfs_rename_unlock(&zl); 3669 zfs_dirent_unlock(sdl); 3670 zfs_dirent_unlock(tdl); 3671 3672 if (sdzp == tdzp) 3673 rw_exit(&sdzp->z_name_lock); 3674 3675 VN_RELE(ZTOV(szp)); 3676 if (tzp) 3677 VN_RELE(ZTOV(tzp)); 3678 if (error == ERESTART) { 3679 waited = B_TRUE; 3680 dmu_tx_wait(tx); 3681 dmu_tx_abort(tx); 3682 goto top; 3683 } 3684 dmu_tx_abort(tx); 3685 ZFS_EXIT(zfsvfs); 3686 return (error); 3687 } 3688 3689 if (tzp) /* Attempt to remove the existing target */ 3690 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 3691 3692 if (error == 0) { 3693 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3694 if (error == 0) { 3695 szp->z_pflags |= ZFS_AV_MODIFIED; 3696 3697 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3698 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3699 ASSERT0(error); 3700 3701 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3702 if (error == 0) { 3703 zfs_log_rename(zilog, tx, TX_RENAME | 3704 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 3705 sdl->dl_name, tdzp, tdl->dl_name, szp); 3706 3707 /* 3708 * Update path information for the target vnode 3709 */ 3710 vn_renamepath(tdvp, ZTOV(szp), tnm, 3711 strlen(tnm)); 3712 } else { 3713 /* 3714 * At this point, we have successfully created 3715 * the target name, but have failed to remove 3716 * the source name. Since the create was done 3717 * with the ZRENAMING flag, there are 3718 * complications; for one, the link count is 3719 * wrong. The easiest way to deal with this 3720 * is to remove the newly created target, and 3721 * return the original error. This must 3722 * succeed; fortunately, it is very unlikely to 3723 * fail, since we just created it. 3724 */ 3725 VERIFY3U(zfs_link_destroy(tdl, szp, tx, 3726 ZRENAMING, NULL), ==, 0); 3727 } 3728 } 3729 } 3730 3731 dmu_tx_commit(tx); 3732 out: 3733 if (zl != NULL) 3734 zfs_rename_unlock(&zl); 3735 3736 zfs_dirent_unlock(sdl); 3737 zfs_dirent_unlock(tdl); 3738 3739 if (sdzp == tdzp) 3740 rw_exit(&sdzp->z_name_lock); 3741 3742 3743 VN_RELE(ZTOV(szp)); 3744 if (tzp) 3745 VN_RELE(ZTOV(tzp)); 3746 3747 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3748 zil_commit(zilog, 0); 3749 3750 ZFS_EXIT(zfsvfs); 3751 return (error); 3752 } 3753 3754 /* 3755 * Insert the indicated symbolic reference entry into the directory. 3756 * 3757 * IN: dvp - Directory to contain new symbolic link. 3758 * link - Name for new symlink entry. 3759 * vap - Attributes of new entry. 3760 * cr - credentials of caller. 3761 * ct - caller context 3762 * flags - case flags 3763 * 3764 * RETURN: 0 on success, error code on failure. 3765 * 3766 * Timestamps: 3767 * dvp - ctime|mtime updated 3768 */ 3769 /*ARGSUSED*/ 3770 static int 3771 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, 3772 caller_context_t *ct, int flags) 3773 { 3774 znode_t *zp, *dzp = VTOZ(dvp); 3775 zfs_dirlock_t *dl; 3776 dmu_tx_t *tx; 3777 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3778 zilog_t *zilog; 3779 uint64_t len = strlen(link); 3780 int error; 3781 int zflg = ZNEW; 3782 zfs_acl_ids_t acl_ids; 3783 boolean_t fuid_dirtied; 3784 uint64_t txtype = TX_SYMLINK; 3785 boolean_t waited = B_FALSE; 3786 3787 ASSERT(vap->va_type == VLNK); 3788 3789 ZFS_ENTER(zfsvfs); 3790 ZFS_VERIFY_ZP(dzp); 3791 zilog = zfsvfs->z_log; 3792 3793 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3794 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3795 ZFS_EXIT(zfsvfs); 3796 return (SET_ERROR(EILSEQ)); 3797 } 3798 if (flags & FIGNORECASE) 3799 zflg |= ZCILOOK; 3800 3801 if (len > MAXPATHLEN) { 3802 ZFS_EXIT(zfsvfs); 3803 return (SET_ERROR(ENAMETOOLONG)); 3804 } 3805 3806 if ((error = zfs_acl_ids_create(dzp, 0, 3807 vap, cr, NULL, &acl_ids)) != 0) { 3808 ZFS_EXIT(zfsvfs); 3809 return (error); 3810 } 3811 top: 3812 /* 3813 * Attempt to lock directory; fail if entry already exists. 3814 */ 3815 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3816 if (error) { 3817 zfs_acl_ids_free(&acl_ids); 3818 ZFS_EXIT(zfsvfs); 3819 return (error); 3820 } 3821 3822 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 3823 zfs_acl_ids_free(&acl_ids); 3824 zfs_dirent_unlock(dl); 3825 ZFS_EXIT(zfsvfs); 3826 return (error); 3827 } 3828 3829 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 3830 zfs_acl_ids_free(&acl_ids); 3831 zfs_dirent_unlock(dl); 3832 ZFS_EXIT(zfsvfs); 3833 return (SET_ERROR(EDQUOT)); 3834 } 3835 tx = dmu_tx_create(zfsvfs->z_os); 3836 fuid_dirtied = zfsvfs->z_fuid_dirty; 3837 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3838 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3839 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3840 ZFS_SA_BASE_ATTR_SIZE + len); 3841 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3842 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3843 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3844 acl_ids.z_aclp->z_acl_bytes); 3845 } 3846 if (fuid_dirtied) 3847 zfs_fuid_txhold(zfsvfs, tx); 3848 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 3849 if (error) { 3850 zfs_dirent_unlock(dl); 3851 if (error == ERESTART) { 3852 waited = B_TRUE; 3853 dmu_tx_wait(tx); 3854 dmu_tx_abort(tx); 3855 goto top; 3856 } 3857 zfs_acl_ids_free(&acl_ids); 3858 dmu_tx_abort(tx); 3859 ZFS_EXIT(zfsvfs); 3860 return (error); 3861 } 3862 3863 /* 3864 * Create a new object for the symlink. 3865 * for version 4 ZPL datsets the symlink will be an SA attribute 3866 */ 3867 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3868 3869 if (fuid_dirtied) 3870 zfs_fuid_sync(zfsvfs, tx); 3871 3872 mutex_enter(&zp->z_lock); 3873 if (zp->z_is_sa) 3874 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3875 link, len, tx); 3876 else 3877 zfs_sa_symlink(zp, link, len, tx); 3878 mutex_exit(&zp->z_lock); 3879 3880 zp->z_size = len; 3881 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3882 &zp->z_size, sizeof (zp->z_size), tx); 3883 /* 3884 * Insert the new object into the directory. 3885 */ 3886 (void) zfs_link_create(dl, zp, tx, ZNEW); 3887 3888 if (flags & FIGNORECASE) 3889 txtype |= TX_CI; 3890 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3891 3892 zfs_acl_ids_free(&acl_ids); 3893 3894 dmu_tx_commit(tx); 3895 3896 zfs_dirent_unlock(dl); 3897 3898 VN_RELE(ZTOV(zp)); 3899 3900 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3901 zil_commit(zilog, 0); 3902 3903 ZFS_EXIT(zfsvfs); 3904 return (error); 3905 } 3906 3907 /* 3908 * Return, in the buffer contained in the provided uio structure, 3909 * the symbolic path referred to by vp. 3910 * 3911 * IN: vp - vnode of symbolic link. 3912 * uio - structure to contain the link path. 3913 * cr - credentials of caller. 3914 * ct - caller context 3915 * 3916 * OUT: uio - structure containing the link path. 3917 * 3918 * RETURN: 0 on success, error code on failure. 3919 * 3920 * Timestamps: 3921 * vp - atime updated 3922 */ 3923 /* ARGSUSED */ 3924 static int 3925 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 3926 { 3927 znode_t *zp = VTOZ(vp); 3928 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3929 int error; 3930 3931 ZFS_ENTER(zfsvfs); 3932 ZFS_VERIFY_ZP(zp); 3933 3934 mutex_enter(&zp->z_lock); 3935 if (zp->z_is_sa) 3936 error = sa_lookup_uio(zp->z_sa_hdl, 3937 SA_ZPL_SYMLINK(zfsvfs), uio); 3938 else 3939 error = zfs_sa_readlink(zp, uio); 3940 mutex_exit(&zp->z_lock); 3941 3942 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3943 3944 ZFS_EXIT(zfsvfs); 3945 return (error); 3946 } 3947 3948 /* 3949 * Insert a new entry into directory tdvp referencing svp. 3950 * 3951 * IN: tdvp - Directory to contain new entry. 3952 * svp - vnode of new entry. 3953 * name - name of new entry. 3954 * cr - credentials of caller. 3955 * ct - caller context 3956 * 3957 * RETURN: 0 on success, error code on failure. 3958 * 3959 * Timestamps: 3960 * tdvp - ctime|mtime updated 3961 * svp - ctime updated 3962 */ 3963 /* ARGSUSED */ 3964 static int 3965 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 3966 caller_context_t *ct, int flags) 3967 { 3968 znode_t *dzp = VTOZ(tdvp); 3969 znode_t *tzp, *szp; 3970 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 3971 zilog_t *zilog; 3972 zfs_dirlock_t *dl; 3973 dmu_tx_t *tx; 3974 vnode_t *realvp; 3975 int error; 3976 int zf = ZNEW; 3977 uint64_t parent; 3978 uid_t owner; 3979 boolean_t waited = B_FALSE; 3980 3981 ASSERT(tdvp->v_type == VDIR); 3982 3983 ZFS_ENTER(zfsvfs); 3984 ZFS_VERIFY_ZP(dzp); 3985 zilog = zfsvfs->z_log; 3986 3987 if (VOP_REALVP(svp, &realvp, ct) == 0) 3988 svp = realvp; 3989 3990 /* 3991 * POSIX dictates that we return EPERM here. 3992 * Better choices include ENOTSUP or EISDIR. 3993 */ 3994 if (svp->v_type == VDIR) { 3995 ZFS_EXIT(zfsvfs); 3996 return (SET_ERROR(EPERM)); 3997 } 3998 3999 szp = VTOZ(svp); 4000 ZFS_VERIFY_ZP(szp); 4001 4002 /* 4003 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 4004 * ctldir appear to have the same v_vfsp. 4005 */ 4006 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 4007 ZFS_EXIT(zfsvfs); 4008 return (SET_ERROR(EXDEV)); 4009 } 4010 4011 /* Prevent links to .zfs/shares files */ 4012 4013 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4014 &parent, sizeof (uint64_t))) != 0) { 4015 ZFS_EXIT(zfsvfs); 4016 return (error); 4017 } 4018 if (parent == zfsvfs->z_shares_dir) { 4019 ZFS_EXIT(zfsvfs); 4020 return (SET_ERROR(EPERM)); 4021 } 4022 4023 if (zfsvfs->z_utf8 && u8_validate(name, 4024 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4025 ZFS_EXIT(zfsvfs); 4026 return (SET_ERROR(EILSEQ)); 4027 } 4028 if (flags & FIGNORECASE) 4029 zf |= ZCILOOK; 4030 4031 /* 4032 * We do not support links between attributes and non-attributes 4033 * because of the potential security risk of creating links 4034 * into "normal" file space in order to circumvent restrictions 4035 * imposed in attribute space. 4036 */ 4037 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4038 ZFS_EXIT(zfsvfs); 4039 return (SET_ERROR(EINVAL)); 4040 } 4041 4042 4043 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4044 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 4045 ZFS_EXIT(zfsvfs); 4046 return (SET_ERROR(EPERM)); 4047 } 4048 4049 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4050 ZFS_EXIT(zfsvfs); 4051 return (error); 4052 } 4053 4054 top: 4055 /* 4056 * Attempt to lock directory; fail if entry already exists. 4057 */ 4058 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4059 if (error) { 4060 ZFS_EXIT(zfsvfs); 4061 return (error); 4062 } 4063 4064 tx = dmu_tx_create(zfsvfs->z_os); 4065 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4066 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4067 zfs_sa_upgrade_txholds(tx, szp); 4068 zfs_sa_upgrade_txholds(tx, dzp); 4069 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4070 if (error) { 4071 zfs_dirent_unlock(dl); 4072 if (error == ERESTART) { 4073 waited = B_TRUE; 4074 dmu_tx_wait(tx); 4075 dmu_tx_abort(tx); 4076 goto top; 4077 } 4078 dmu_tx_abort(tx); 4079 ZFS_EXIT(zfsvfs); 4080 return (error); 4081 } 4082 4083 error = zfs_link_create(dl, szp, tx, 0); 4084 4085 if (error == 0) { 4086 uint64_t txtype = TX_LINK; 4087 if (flags & FIGNORECASE) 4088 txtype |= TX_CI; 4089 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4090 } 4091 4092 dmu_tx_commit(tx); 4093 4094 zfs_dirent_unlock(dl); 4095 4096 if (error == 0) { 4097 vnevent_link(svp, ct); 4098 } 4099 4100 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4101 zil_commit(zilog, 0); 4102 4103 ZFS_EXIT(zfsvfs); 4104 return (error); 4105 } 4106 4107 /* 4108 * zfs_null_putapage() is used when the file system has been force 4109 * unmounted. It just drops the pages. 4110 */ 4111 /* ARGSUSED */ 4112 static int 4113 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4114 size_t *lenp, int flags, cred_t *cr) 4115 { 4116 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4117 return (0); 4118 } 4119 4120 /* 4121 * Push a page out to disk, klustering if possible. 4122 * 4123 * IN: vp - file to push page to. 4124 * pp - page to push. 4125 * flags - additional flags. 4126 * cr - credentials of caller. 4127 * 4128 * OUT: offp - start of range pushed. 4129 * lenp - len of range pushed. 4130 * 4131 * RETURN: 0 on success, error code on failure. 4132 * 4133 * NOTE: callers must have locked the page to be pushed. On 4134 * exit, the page (and all other pages in the kluster) must be 4135 * unlocked. 4136 */ 4137 /* ARGSUSED */ 4138 static int 4139 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4140 size_t *lenp, int flags, cred_t *cr) 4141 { 4142 znode_t *zp = VTOZ(vp); 4143 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4144 dmu_tx_t *tx; 4145 u_offset_t off, koff; 4146 size_t len, klen; 4147 int err; 4148 4149 off = pp->p_offset; 4150 len = PAGESIZE; 4151 /* 4152 * If our blocksize is bigger than the page size, try to kluster 4153 * multiple pages so that we write a full block (thus avoiding 4154 * a read-modify-write). 4155 */ 4156 if (off < zp->z_size && zp->z_blksz > PAGESIZE) { 4157 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 4158 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 4159 ASSERT(koff <= zp->z_size); 4160 if (koff + klen > zp->z_size) 4161 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); 4162 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 4163 } 4164 ASSERT3U(btop(len), ==, btopr(len)); 4165 4166 /* 4167 * Can't push pages past end-of-file. 4168 */ 4169 if (off >= zp->z_size) { 4170 /* ignore all pages */ 4171 err = 0; 4172 goto out; 4173 } else if (off + len > zp->z_size) { 4174 int npages = btopr(zp->z_size - off); 4175 page_t *trunc; 4176 4177 page_list_break(&pp, &trunc, npages); 4178 /* ignore pages past end of file */ 4179 if (trunc) 4180 pvn_write_done(trunc, flags); 4181 len = zp->z_size - off; 4182 } 4183 4184 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 4185 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 4186 err = SET_ERROR(EDQUOT); 4187 goto out; 4188 } 4189 tx = dmu_tx_create(zfsvfs->z_os); 4190 dmu_tx_hold_write(tx, zp->z_id, off, len); 4191 4192 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4193 zfs_sa_upgrade_txholds(tx, zp); 4194 err = dmu_tx_assign(tx, TXG_WAIT); 4195 if (err != 0) { 4196 dmu_tx_abort(tx); 4197 goto out; 4198 } 4199 4200 if (zp->z_blksz <= PAGESIZE) { 4201 caddr_t va = zfs_map_page(pp, S_READ); 4202 ASSERT3U(len, <=, PAGESIZE); 4203 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 4204 zfs_unmap_page(pp, va); 4205 } else { 4206 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 4207 } 4208 4209 if (err == 0) { 4210 uint64_t mtime[2], ctime[2]; 4211 sa_bulk_attr_t bulk[3]; 4212 int count = 0; 4213 4214 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4215 &mtime, 16); 4216 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4217 &ctime, 16); 4218 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4219 &zp->z_pflags, 8); 4220 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4221 B_TRUE); 4222 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4223 } 4224 dmu_tx_commit(tx); 4225 4226 out: 4227 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 4228 if (offp) 4229 *offp = off; 4230 if (lenp) 4231 *lenp = len; 4232 4233 return (err); 4234 } 4235 4236 /* 4237 * Copy the portion of the file indicated from pages into the file. 4238 * The pages are stored in a page list attached to the files vnode. 4239 * 4240 * IN: vp - vnode of file to push page data to. 4241 * off - position in file to put data. 4242 * len - amount of data to write. 4243 * flags - flags to control the operation. 4244 * cr - credentials of caller. 4245 * ct - caller context. 4246 * 4247 * RETURN: 0 on success, error code on failure. 4248 * 4249 * Timestamps: 4250 * vp - ctime|mtime updated 4251 */ 4252 /*ARGSUSED*/ 4253 static int 4254 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4255 caller_context_t *ct) 4256 { 4257 znode_t *zp = VTOZ(vp); 4258 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4259 page_t *pp; 4260 size_t io_len; 4261 u_offset_t io_off; 4262 uint_t blksz; 4263 rl_t *rl; 4264 int error = 0; 4265 4266 ZFS_ENTER(zfsvfs); 4267 ZFS_VERIFY_ZP(zp); 4268 4269 /* 4270 * There's nothing to do if no data is cached. 4271 */ 4272 if (!vn_has_cached_data(vp)) { 4273 ZFS_EXIT(zfsvfs); 4274 return (0); 4275 } 4276 4277 /* 4278 * Align this request to the file block size in case we kluster. 4279 * XXX - this can result in pretty aggresive locking, which can 4280 * impact simultanious read/write access. One option might be 4281 * to break up long requests (len == 0) into block-by-block 4282 * operations to get narrower locking. 4283 */ 4284 blksz = zp->z_blksz; 4285 if (ISP2(blksz)) 4286 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); 4287 else 4288 io_off = 0; 4289 if (len > 0 && ISP2(blksz)) 4290 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); 4291 else 4292 io_len = 0; 4293 4294 if (io_len == 0) { 4295 /* 4296 * Search the entire vp list for pages >= io_off. 4297 */ 4298 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); 4299 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); 4300 goto out; 4301 } 4302 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); 4303 4304 if (off > zp->z_size) { 4305 /* past end of file */ 4306 zfs_range_unlock(rl); 4307 ZFS_EXIT(zfsvfs); 4308 return (0); 4309 } 4310 4311 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); 4312 4313 for (off = io_off; io_off < off + len; io_off += io_len) { 4314 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 4315 pp = page_lookup(vp, io_off, 4316 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 4317 } else { 4318 pp = page_lookup_nowait(vp, io_off, 4319 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 4320 } 4321 4322 if (pp != NULL && pvn_getdirty(pp, flags)) { 4323 int err; 4324 4325 /* 4326 * Found a dirty page to push 4327 */ 4328 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 4329 if (err) 4330 error = err; 4331 } else { 4332 io_len = PAGESIZE; 4333 } 4334 } 4335 out: 4336 zfs_range_unlock(rl); 4337 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4338 zil_commit(zfsvfs->z_log, zp->z_id); 4339 ZFS_EXIT(zfsvfs); 4340 return (error); 4341 } 4342 4343 /*ARGSUSED*/ 4344 void 4345 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4346 { 4347 znode_t *zp = VTOZ(vp); 4348 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4349 int error; 4350 4351 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4352 if (zp->z_sa_hdl == NULL) { 4353 /* 4354 * The fs has been unmounted, or we did a 4355 * suspend/resume and this file no longer exists. 4356 */ 4357 if (vn_has_cached_data(vp)) { 4358 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 4359 B_INVAL, cr); 4360 } 4361 4362 mutex_enter(&zp->z_lock); 4363 mutex_enter(&vp->v_lock); 4364 ASSERT(vp->v_count == 1); 4365 vp->v_count = 0; 4366 mutex_exit(&vp->v_lock); 4367 mutex_exit(&zp->z_lock); 4368 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4369 zfs_znode_free(zp); 4370 return; 4371 } 4372 4373 /* 4374 * Attempt to push any data in the page cache. If this fails 4375 * we will get kicked out later in zfs_zinactive(). 4376 */ 4377 if (vn_has_cached_data(vp)) { 4378 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 4379 cr); 4380 } 4381 4382 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4383 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4384 4385 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4386 zfs_sa_upgrade_txholds(tx, zp); 4387 error = dmu_tx_assign(tx, TXG_WAIT); 4388 if (error) { 4389 dmu_tx_abort(tx); 4390 } else { 4391 mutex_enter(&zp->z_lock); 4392 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4393 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4394 zp->z_atime_dirty = 0; 4395 mutex_exit(&zp->z_lock); 4396 dmu_tx_commit(tx); 4397 } 4398 } 4399 4400 zfs_zinactive(zp); 4401 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4402 } 4403 4404 /* 4405 * Bounds-check the seek operation. 4406 * 4407 * IN: vp - vnode seeking within 4408 * ooff - old file offset 4409 * noffp - pointer to new file offset 4410 * ct - caller context 4411 * 4412 * RETURN: 0 on success, EINVAL if new offset invalid. 4413 */ 4414 /* ARGSUSED */ 4415 static int 4416 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 4417 caller_context_t *ct) 4418 { 4419 if (vp->v_type == VDIR) 4420 return (0); 4421 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4422 } 4423 4424 /* 4425 * Pre-filter the generic locking function to trap attempts to place 4426 * a mandatory lock on a memory mapped file. 4427 */ 4428 static int 4429 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 4430 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 4431 { 4432 znode_t *zp = VTOZ(vp); 4433 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4434 4435 ZFS_ENTER(zfsvfs); 4436 ZFS_VERIFY_ZP(zp); 4437 4438 /* 4439 * We are following the UFS semantics with respect to mapcnt 4440 * here: If we see that the file is mapped already, then we will 4441 * return an error, but we don't worry about races between this 4442 * function and zfs_map(). 4443 */ 4444 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { 4445 ZFS_EXIT(zfsvfs); 4446 return (SET_ERROR(EAGAIN)); 4447 } 4448 ZFS_EXIT(zfsvfs); 4449 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4450 } 4451 4452 /* 4453 * If we can't find a page in the cache, we will create a new page 4454 * and fill it with file data. For efficiency, we may try to fill 4455 * multiple pages at once (klustering) to fill up the supplied page 4456 * list. Note that the pages to be filled are held with an exclusive 4457 * lock to prevent access by other threads while they are being filled. 4458 */ 4459 static int 4460 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 4461 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 4462 { 4463 znode_t *zp = VTOZ(vp); 4464 page_t *pp, *cur_pp; 4465 objset_t *os = zp->z_zfsvfs->z_os; 4466 u_offset_t io_off, total; 4467 size_t io_len; 4468 int err; 4469 4470 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 4471 /* 4472 * We only have a single page, don't bother klustering 4473 */ 4474 io_off = off; 4475 io_len = PAGESIZE; 4476 pp = page_create_va(vp, io_off, io_len, 4477 PG_EXCL | PG_WAIT, seg, addr); 4478 } else { 4479 /* 4480 * Try to find enough pages to fill the page list 4481 */ 4482 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4483 &io_len, off, plsz, 0); 4484 } 4485 if (pp == NULL) { 4486 /* 4487 * The page already exists, nothing to do here. 4488 */ 4489 *pl = NULL; 4490 return (0); 4491 } 4492 4493 /* 4494 * Fill the pages in the kluster. 4495 */ 4496 cur_pp = pp; 4497 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4498 caddr_t va; 4499 4500 ASSERT3U(io_off, ==, cur_pp->p_offset); 4501 va = zfs_map_page(cur_pp, S_WRITE); 4502 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4503 DMU_READ_PREFETCH); 4504 zfs_unmap_page(cur_pp, va); 4505 if (err) { 4506 /* On error, toss the entire kluster */ 4507 pvn_read_done(pp, B_ERROR); 4508 /* convert checksum errors into IO errors */ 4509 if (err == ECKSUM) 4510 err = SET_ERROR(EIO); 4511 return (err); 4512 } 4513 cur_pp = cur_pp->p_next; 4514 } 4515 4516 /* 4517 * Fill in the page list array from the kluster starting 4518 * from the desired offset `off'. 4519 * NOTE: the page list will always be null terminated. 4520 */ 4521 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4522 ASSERT(pl == NULL || (*pl)->p_offset == off); 4523 4524 return (0); 4525 } 4526 4527 /* 4528 * Return pointers to the pages for the file region [off, off + len] 4529 * in the pl array. If plsz is greater than len, this function may 4530 * also return page pointers from after the specified region 4531 * (i.e. the region [off, off + plsz]). These additional pages are 4532 * only returned if they are already in the cache, or were created as 4533 * part of a klustered read. 4534 * 4535 * IN: vp - vnode of file to get data from. 4536 * off - position in file to get data from. 4537 * len - amount of data to retrieve. 4538 * plsz - length of provided page list. 4539 * seg - segment to obtain pages for. 4540 * addr - virtual address of fault. 4541 * rw - mode of created pages. 4542 * cr - credentials of caller. 4543 * ct - caller context. 4544 * 4545 * OUT: protp - protection mode of created pages. 4546 * pl - list of pages created. 4547 * 4548 * RETURN: 0 on success, error code on failure. 4549 * 4550 * Timestamps: 4551 * vp - atime updated 4552 */ 4553 /* ARGSUSED */ 4554 static int 4555 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4556 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4557 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4558 { 4559 znode_t *zp = VTOZ(vp); 4560 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4561 page_t **pl0 = pl; 4562 int err = 0; 4563 4564 /* we do our own caching, faultahead is unnecessary */ 4565 if (pl == NULL) 4566 return (0); 4567 else if (len > plsz) 4568 len = plsz; 4569 else 4570 len = P2ROUNDUP(len, PAGESIZE); 4571 ASSERT(plsz >= len); 4572 4573 ZFS_ENTER(zfsvfs); 4574 ZFS_VERIFY_ZP(zp); 4575 4576 if (protp) 4577 *protp = PROT_ALL; 4578 4579 /* 4580 * Loop through the requested range [off, off + len) looking 4581 * for pages. If we don't find a page, we will need to create 4582 * a new page and fill it with data from the file. 4583 */ 4584 while (len > 0) { 4585 if (*pl = page_lookup(vp, off, SE_SHARED)) 4586 *(pl+1) = NULL; 4587 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 4588 goto out; 4589 while (*pl) { 4590 ASSERT3U((*pl)->p_offset, ==, off); 4591 off += PAGESIZE; 4592 addr += PAGESIZE; 4593 if (len > 0) { 4594 ASSERT3U(len, >=, PAGESIZE); 4595 len -= PAGESIZE; 4596 } 4597 ASSERT3U(plsz, >=, PAGESIZE); 4598 plsz -= PAGESIZE; 4599 pl++; 4600 } 4601 } 4602 4603 /* 4604 * Fill out the page array with any pages already in the cache. 4605 */ 4606 while (plsz > 0 && 4607 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 4608 off += PAGESIZE; 4609 plsz -= PAGESIZE; 4610 } 4611 out: 4612 if (err) { 4613 /* 4614 * Release any pages we have previously locked. 4615 */ 4616 while (pl > pl0) 4617 page_unlock(*--pl); 4618 } else { 4619 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4620 } 4621 4622 *pl = NULL; 4623 4624 ZFS_EXIT(zfsvfs); 4625 return (err); 4626 } 4627 4628 /* 4629 * Request a memory map for a section of a file. This code interacts 4630 * with common code and the VM system as follows: 4631 * 4632 * - common code calls mmap(), which ends up in smmap_common() 4633 * - this calls VOP_MAP(), which takes you into (say) zfs 4634 * - zfs_map() calls as_map(), passing segvn_create() as the callback 4635 * - segvn_create() creates the new segment and calls VOP_ADDMAP() 4636 * - zfs_addmap() updates z_mapcnt 4637 */ 4638 /*ARGSUSED*/ 4639 static int 4640 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 4641 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4642 caller_context_t *ct) 4643 { 4644 znode_t *zp = VTOZ(vp); 4645 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4646 segvn_crargs_t vn_a; 4647 int error; 4648 4649 ZFS_ENTER(zfsvfs); 4650 ZFS_VERIFY_ZP(zp); 4651 4652 if ((prot & PROT_WRITE) && (zp->z_pflags & 4653 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4654 ZFS_EXIT(zfsvfs); 4655 return (SET_ERROR(EPERM)); 4656 } 4657 4658 if ((prot & (PROT_READ | PROT_EXEC)) && 4659 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4660 ZFS_EXIT(zfsvfs); 4661 return (SET_ERROR(EACCES)); 4662 } 4663 4664 if (vp->v_flag & VNOMAP) { 4665 ZFS_EXIT(zfsvfs); 4666 return (SET_ERROR(ENOSYS)); 4667 } 4668 4669 if (off < 0 || len > MAXOFFSET_T - off) { 4670 ZFS_EXIT(zfsvfs); 4671 return (SET_ERROR(ENXIO)); 4672 } 4673 4674 if (vp->v_type != VREG) { 4675 ZFS_EXIT(zfsvfs); 4676 return (SET_ERROR(ENODEV)); 4677 } 4678 4679 /* 4680 * If file is locked, disallow mapping. 4681 */ 4682 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { 4683 ZFS_EXIT(zfsvfs); 4684 return (SET_ERROR(EAGAIN)); 4685 } 4686 4687 as_rangelock(as); 4688 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 4689 if (error != 0) { 4690 as_rangeunlock(as); 4691 ZFS_EXIT(zfsvfs); 4692 return (error); 4693 } 4694 4695 vn_a.vp = vp; 4696 vn_a.offset = (u_offset_t)off; 4697 vn_a.type = flags & MAP_TYPE; 4698 vn_a.prot = prot; 4699 vn_a.maxprot = maxprot; 4700 vn_a.cred = cr; 4701 vn_a.amp = NULL; 4702 vn_a.flags = flags & ~MAP_TYPE; 4703 vn_a.szc = 0; 4704 vn_a.lgrp_mem_policy_flags = 0; 4705 4706 error = as_map(as, *addrp, len, segvn_create, &vn_a); 4707 4708 as_rangeunlock(as); 4709 ZFS_EXIT(zfsvfs); 4710 return (error); 4711 } 4712 4713 /* ARGSUSED */ 4714 static int 4715 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4716 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 4717 caller_context_t *ct) 4718 { 4719 uint64_t pages = btopr(len); 4720 4721 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 4722 return (0); 4723 } 4724 4725 /* 4726 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 4727 * more accurate mtime for the associated file. Since we don't have a way of 4728 * detecting when the data was actually modified, we have to resort to 4729 * heuristics. If an explicit msync() is done, then we mark the mtime when the 4730 * last page is pushed. The problem occurs when the msync() call is omitted, 4731 * which by far the most common case: 4732 * 4733 * open() 4734 * mmap() 4735 * <modify memory> 4736 * munmap() 4737 * close() 4738 * <time lapse> 4739 * putpage() via fsflush 4740 * 4741 * If we wait until fsflush to come along, we can have a modification time that 4742 * is some arbitrary point in the future. In order to prevent this in the 4743 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 4744 * torn down. 4745 */ 4746 /* ARGSUSED */ 4747 static int 4748 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 4749 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 4750 caller_context_t *ct) 4751 { 4752 uint64_t pages = btopr(len); 4753 4754 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 4755 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 4756 4757 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 4758 vn_has_cached_data(vp)) 4759 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 4760 4761 return (0); 4762 } 4763 4764 /* 4765 * Free or allocate space in a file. Currently, this function only 4766 * supports the `F_FREESP' command. However, this command is somewhat 4767 * misnamed, as its functionality includes the ability to allocate as 4768 * well as free space. 4769 * 4770 * IN: vp - vnode of file to free data in. 4771 * cmd - action to take (only F_FREESP supported). 4772 * bfp - section of file to free/alloc. 4773 * flag - current file open mode flags. 4774 * offset - current file offset. 4775 * cr - credentials of caller [UNUSED]. 4776 * ct - caller context. 4777 * 4778 * RETURN: 0 on success, error code on failure. 4779 * 4780 * Timestamps: 4781 * vp - ctime|mtime updated 4782 */ 4783 /* ARGSUSED */ 4784 static int 4785 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 4786 offset_t offset, cred_t *cr, caller_context_t *ct) 4787 { 4788 znode_t *zp = VTOZ(vp); 4789 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4790 uint64_t off, len; 4791 int error; 4792 4793 ZFS_ENTER(zfsvfs); 4794 ZFS_VERIFY_ZP(zp); 4795 4796 if (cmd != F_FREESP) { 4797 ZFS_EXIT(zfsvfs); 4798 return (SET_ERROR(EINVAL)); 4799 } 4800 4801 if (error = convoff(vp, bfp, 0, offset)) { 4802 ZFS_EXIT(zfsvfs); 4803 return (error); 4804 } 4805 4806 if (bfp->l_len < 0) { 4807 ZFS_EXIT(zfsvfs); 4808 return (SET_ERROR(EINVAL)); 4809 } 4810 4811 off = bfp->l_start; 4812 len = bfp->l_len; /* 0 means from off to end of file */ 4813 4814 error = zfs_freesp(zp, off, len, flag, TRUE); 4815 4816 if (error == 0 && off == 0 && len == 0) 4817 vnevent_truncate(ZTOV(zp), ct); 4818 4819 ZFS_EXIT(zfsvfs); 4820 return (error); 4821 } 4822 4823 /*ARGSUSED*/ 4824 static int 4825 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4826 { 4827 znode_t *zp = VTOZ(vp); 4828 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4829 uint32_t gen; 4830 uint64_t gen64; 4831 uint64_t object = zp->z_id; 4832 zfid_short_t *zfid; 4833 int size, i, error; 4834 4835 ZFS_ENTER(zfsvfs); 4836 ZFS_VERIFY_ZP(zp); 4837 4838 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4839 &gen64, sizeof (uint64_t))) != 0) { 4840 ZFS_EXIT(zfsvfs); 4841 return (error); 4842 } 4843 4844 gen = (uint32_t)gen64; 4845 4846 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 4847 if (fidp->fid_len < size) { 4848 fidp->fid_len = size; 4849 ZFS_EXIT(zfsvfs); 4850 return (SET_ERROR(ENOSPC)); 4851 } 4852 4853 zfid = (zfid_short_t *)fidp; 4854 4855 zfid->zf_len = size; 4856 4857 for (i = 0; i < sizeof (zfid->zf_object); i++) 4858 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4859 4860 /* Must have a non-zero generation number to distinguish from .zfs */ 4861 if (gen == 0) 4862 gen = 1; 4863 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4864 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4865 4866 if (size == LONG_FID_LEN) { 4867 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4868 zfid_long_t *zlfid; 4869 4870 zlfid = (zfid_long_t *)fidp; 4871 4872 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4873 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4874 4875 /* XXX - this should be the generation number for the objset */ 4876 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4877 zlfid->zf_setgen[i] = 0; 4878 } 4879 4880 ZFS_EXIT(zfsvfs); 4881 return (0); 4882 } 4883 4884 static int 4885 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4886 caller_context_t *ct) 4887 { 4888 znode_t *zp, *xzp; 4889 zfsvfs_t *zfsvfs; 4890 zfs_dirlock_t *dl; 4891 int error; 4892 4893 switch (cmd) { 4894 case _PC_LINK_MAX: 4895 *valp = ULONG_MAX; 4896 return (0); 4897 4898 case _PC_FILESIZEBITS: 4899 *valp = 64; 4900 return (0); 4901 4902 case _PC_XATTR_EXISTS: 4903 zp = VTOZ(vp); 4904 zfsvfs = zp->z_zfsvfs; 4905 ZFS_ENTER(zfsvfs); 4906 ZFS_VERIFY_ZP(zp); 4907 *valp = 0; 4908 error = zfs_dirent_lock(&dl, zp, "", &xzp, 4909 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 4910 if (error == 0) { 4911 zfs_dirent_unlock(dl); 4912 if (!zfs_dirempty(xzp)) 4913 *valp = 1; 4914 VN_RELE(ZTOV(xzp)); 4915 } else if (error == ENOENT) { 4916 /* 4917 * If there aren't extended attributes, it's the 4918 * same as having zero of them. 4919 */ 4920 error = 0; 4921 } 4922 ZFS_EXIT(zfsvfs); 4923 return (error); 4924 4925 case _PC_SATTR_ENABLED: 4926 case _PC_SATTR_EXISTS: 4927 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 4928 (vp->v_type == VREG || vp->v_type == VDIR); 4929 return (0); 4930 4931 case _PC_ACCESS_FILTERING: 4932 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 4933 vp->v_type == VDIR; 4934 return (0); 4935 4936 case _PC_ACL_ENABLED: 4937 *valp = _ACL_ACE_ENABLED; 4938 return (0); 4939 4940 case _PC_MIN_HOLE_SIZE: 4941 *valp = (ulong_t)SPA_MINBLOCKSIZE; 4942 return (0); 4943 4944 case _PC_TIMESTAMP_RESOLUTION: 4945 /* nanosecond timestamp resolution */ 4946 *valp = 1L; 4947 return (0); 4948 4949 default: 4950 return (fs_pathconf(vp, cmd, valp, cr, ct)); 4951 } 4952 } 4953 4954 /*ARGSUSED*/ 4955 static int 4956 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4957 caller_context_t *ct) 4958 { 4959 znode_t *zp = VTOZ(vp); 4960 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4961 int error; 4962 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4963 4964 ZFS_ENTER(zfsvfs); 4965 ZFS_VERIFY_ZP(zp); 4966 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 4967 ZFS_EXIT(zfsvfs); 4968 4969 return (error); 4970 } 4971 4972 /*ARGSUSED*/ 4973 static int 4974 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 4975 caller_context_t *ct) 4976 { 4977 znode_t *zp = VTOZ(vp); 4978 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4979 int error; 4980 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 4981 zilog_t *zilog = zfsvfs->z_log; 4982 4983 ZFS_ENTER(zfsvfs); 4984 ZFS_VERIFY_ZP(zp); 4985 4986 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 4987 4988 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4989 zil_commit(zilog, 0); 4990 4991 ZFS_EXIT(zfsvfs); 4992 return (error); 4993 } 4994 4995 /* 4996 * The smallest read we may consider to loan out an arcbuf. 4997 * This must be a power of 2. 4998 */ 4999 int zcr_blksz_min = (1 << 10); /* 1K */ 5000 /* 5001 * If set to less than the file block size, allow loaning out of an 5002 * arcbuf for a partial block read. This must be a power of 2. 5003 */ 5004 int zcr_blksz_max = (1 << 17); /* 128K */ 5005 5006 /*ARGSUSED*/ 5007 static int 5008 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, 5009 caller_context_t *ct) 5010 { 5011 znode_t *zp = VTOZ(vp); 5012 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5013 int max_blksz = zfsvfs->z_max_blksz; 5014 uio_t *uio = &xuio->xu_uio; 5015 ssize_t size = uio->uio_resid; 5016 offset_t offset = uio->uio_loffset; 5017 int blksz; 5018 int fullblk, i; 5019 arc_buf_t *abuf; 5020 ssize_t maxsize; 5021 int preamble, postamble; 5022 5023 if (xuio->xu_type != UIOTYPE_ZEROCOPY) 5024 return (SET_ERROR(EINVAL)); 5025 5026 ZFS_ENTER(zfsvfs); 5027 ZFS_VERIFY_ZP(zp); 5028 switch (ioflag) { 5029 case UIO_WRITE: 5030 /* 5031 * Loan out an arc_buf for write if write size is bigger than 5032 * max_blksz, and the file's block size is also max_blksz. 5033 */ 5034 blksz = max_blksz; 5035 if (size < blksz || zp->z_blksz != blksz) { 5036 ZFS_EXIT(zfsvfs); 5037 return (SET_ERROR(EINVAL)); 5038 } 5039 /* 5040 * Caller requests buffers for write before knowing where the 5041 * write offset might be (e.g. NFS TCP write). 5042 */ 5043 if (offset == -1) { 5044 preamble = 0; 5045 } else { 5046 preamble = P2PHASE(offset, blksz); 5047 if (preamble) { 5048 preamble = blksz - preamble; 5049 size -= preamble; 5050 } 5051 } 5052 5053 postamble = P2PHASE(size, blksz); 5054 size -= postamble; 5055 5056 fullblk = size / blksz; 5057 (void) dmu_xuio_init(xuio, 5058 (preamble != 0) + fullblk + (postamble != 0)); 5059 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5060 int, postamble, int, 5061 (preamble != 0) + fullblk + (postamble != 0)); 5062 5063 /* 5064 * Have to fix iov base/len for partial buffers. They 5065 * currently represent full arc_buf's. 5066 */ 5067 if (preamble) { 5068 /* data begins in the middle of the arc_buf */ 5069 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5070 blksz); 5071 ASSERT(abuf); 5072 (void) dmu_xuio_add(xuio, abuf, 5073 blksz - preamble, preamble); 5074 } 5075 5076 for (i = 0; i < fullblk; i++) { 5077 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5078 blksz); 5079 ASSERT(abuf); 5080 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5081 } 5082 5083 if (postamble) { 5084 /* data ends in the middle of the arc_buf */ 5085 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5086 blksz); 5087 ASSERT(abuf); 5088 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5089 } 5090 break; 5091 case UIO_READ: 5092 /* 5093 * Loan out an arc_buf for read if the read size is larger than 5094 * the current file block size. Block alignment is not 5095 * considered. Partial arc_buf will be loaned out for read. 5096 */ 5097 blksz = zp->z_blksz; 5098 if (blksz < zcr_blksz_min) 5099 blksz = zcr_blksz_min; 5100 if (blksz > zcr_blksz_max) 5101 blksz = zcr_blksz_max; 5102 /* avoid potential complexity of dealing with it */ 5103 if (blksz > max_blksz) { 5104 ZFS_EXIT(zfsvfs); 5105 return (SET_ERROR(EINVAL)); 5106 } 5107 5108 maxsize = zp->z_size - uio->uio_loffset; 5109 if (size > maxsize) 5110 size = maxsize; 5111 5112 if (size < blksz || vn_has_cached_data(vp)) { 5113 ZFS_EXIT(zfsvfs); 5114 return (SET_ERROR(EINVAL)); 5115 } 5116 break; 5117 default: 5118 ZFS_EXIT(zfsvfs); 5119 return (SET_ERROR(EINVAL)); 5120 } 5121 5122 uio->uio_extflg = UIO_XUIO; 5123 XUIO_XUZC_RW(xuio) = ioflag; 5124 ZFS_EXIT(zfsvfs); 5125 return (0); 5126 } 5127 5128 /*ARGSUSED*/ 5129 static int 5130 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5131 { 5132 int i; 5133 arc_buf_t *abuf; 5134 int ioflag = XUIO_XUZC_RW(xuio); 5135 5136 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5137 5138 i = dmu_xuio_cnt(xuio); 5139 while (i-- > 0) { 5140 abuf = dmu_xuio_arcbuf(xuio, i); 5141 /* 5142 * if abuf == NULL, it must be a write buffer 5143 * that has been returned in zfs_write(). 5144 */ 5145 if (abuf) 5146 dmu_return_arcbuf(abuf); 5147 ASSERT(abuf || ioflag == UIO_WRITE); 5148 } 5149 5150 dmu_xuio_fini(xuio); 5151 return (0); 5152 } 5153 5154 /* 5155 * Predeclare these here so that the compiler assumes that 5156 * this is an "old style" function declaration that does 5157 * not include arguments => we won't get type mismatch errors 5158 * in the initializations that follow. 5159 */ 5160 static int zfs_inval(); 5161 static int zfs_isdir(); 5162 5163 static int 5164 zfs_inval() 5165 { 5166 return (SET_ERROR(EINVAL)); 5167 } 5168 5169 static int 5170 zfs_isdir() 5171 { 5172 return (SET_ERROR(EISDIR)); 5173 } 5174 /* 5175 * Directory vnode operations template 5176 */ 5177 vnodeops_t *zfs_dvnodeops; 5178 const fs_operation_def_t zfs_dvnodeops_template[] = { 5179 VOPNAME_OPEN, { .vop_open = zfs_open }, 5180 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5181 VOPNAME_READ, { .error = zfs_isdir }, 5182 VOPNAME_WRITE, { .error = zfs_isdir }, 5183 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5184 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5185 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5186 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5187 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5188 VOPNAME_CREATE, { .vop_create = zfs_create }, 5189 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5190 VOPNAME_LINK, { .vop_link = zfs_link }, 5191 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5192 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 5193 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5194 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5195 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 5196 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5197 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5198 VOPNAME_FID, { .vop_fid = zfs_fid }, 5199 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5200 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5201 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5202 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5203 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5204 NULL, NULL 5205 }; 5206 5207 /* 5208 * Regular file vnode operations template 5209 */ 5210 vnodeops_t *zfs_fvnodeops; 5211 const fs_operation_def_t zfs_fvnodeops_template[] = { 5212 VOPNAME_OPEN, { .vop_open = zfs_open }, 5213 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5214 VOPNAME_READ, { .vop_read = zfs_read }, 5215 VOPNAME_WRITE, { .vop_write = zfs_write }, 5216 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5217 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5218 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5219 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5220 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5221 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5222 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5223 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5224 VOPNAME_FID, { .vop_fid = zfs_fid }, 5225 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5226 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 5227 VOPNAME_SPACE, { .vop_space = zfs_space }, 5228 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 5229 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 5230 VOPNAME_MAP, { .vop_map = zfs_map }, 5231 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 5232 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 5233 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5234 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5235 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5236 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5237 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, 5238 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, 5239 NULL, NULL 5240 }; 5241 5242 /* 5243 * Symbolic link vnode operations template 5244 */ 5245 vnodeops_t *zfs_symvnodeops; 5246 const fs_operation_def_t zfs_symvnodeops_template[] = { 5247 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5248 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5249 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5250 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5251 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 5252 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5253 VOPNAME_FID, { .vop_fid = zfs_fid }, 5254 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5255 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5256 NULL, NULL 5257 }; 5258 5259 /* 5260 * special share hidden files vnode operations template 5261 */ 5262 vnodeops_t *zfs_sharevnodeops; 5263 const fs_operation_def_t zfs_sharevnodeops_template[] = { 5264 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5265 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5266 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5267 VOPNAME_FID, { .vop_fid = zfs_fid }, 5268 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5269 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5270 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5271 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5272 NULL, NULL 5273 }; 5274 5275 /* 5276 * Extended attribute directory vnode operations template 5277 * 5278 * This template is identical to the directory vnodes 5279 * operation template except for restricted operations: 5280 * VOP_MKDIR() 5281 * VOP_SYMLINK() 5282 * 5283 * Note that there are other restrictions embedded in: 5284 * zfs_create() - restrict type to VREG 5285 * zfs_link() - no links into/out of attribute space 5286 * zfs_rename() - no moves into/out of attribute space 5287 */ 5288 vnodeops_t *zfs_xdvnodeops; 5289 const fs_operation_def_t zfs_xdvnodeops_template[] = { 5290 VOPNAME_OPEN, { .vop_open = zfs_open }, 5291 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5292 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5293 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5294 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5295 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5296 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5297 VOPNAME_CREATE, { .vop_create = zfs_create }, 5298 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5299 VOPNAME_LINK, { .vop_link = zfs_link }, 5300 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5301 VOPNAME_MKDIR, { .error = zfs_inval }, 5302 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5303 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5304 VOPNAME_SYMLINK, { .error = zfs_inval }, 5305 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5306 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5307 VOPNAME_FID, { .vop_fid = zfs_fid }, 5308 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5309 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5310 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5311 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5312 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5313 NULL, NULL 5314 }; 5315 5316 /* 5317 * Error vnode operations template 5318 */ 5319 vnodeops_t *zfs_evnodeops; 5320 const fs_operation_def_t zfs_evnodeops_template[] = { 5321 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5322 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5323 NULL, NULL 5324 }; 5325