/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <sys/vm.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/kpm.h>
#include <vm/seg_kpm.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/cred.h>
#include <sys/attr.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT().  This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4)	If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *	dmu_tx_assign().  This is critical because we don't want to block
 *	while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing
 *	to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has been replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
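	 * For example, with a 10-byte file stored in a single 128K block,
	 * SEEK_HOLE from offset 0 reports the hole at offset 10 (the
	 * logical EOF), not at the end of the 128K block.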
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking success
		 * here is necessary to avoid bfu errors.
		 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
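 *	 Pages found resident in the page cache are copied with
 *	 uiomove(); any gaps fall back to dmu_read_uio_dbuf() against
 *	 the DMU copy.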
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
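			 * (dmu_request_arcbuf() loans buffers in whole
			 * block-sized units.)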
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
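		 * The holds below cover the znode's SA attributes and the
		 * range being written; zfs_sa_upgrade_txholds() adds what
		 * is needed should the znode require an upgrade to the SA
		 * attribute layout.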
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof.  Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
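		 * A synchronous VN_RELE() here could drop the last hold
		 * and trigger zfs_zinactive(), which may need a new tx;
		 * see Big Rule (2) at the top of this file.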
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated, no one
		 * can change the data.  We need to re-check the blocksize
		 * after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
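 * Callers that can return device nodes (e.g. zfs_lookup() and
 * zfs_create()) pass their result through this so that subsequent
 * operations on the node are handled by specfs rather than ZFS.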
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp - returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/* fast path */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
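		 * As with looking up ".", we simply take a new hold on
		 * the directory vnode rather than creating anything.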
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
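		 * zfs_freesp() creates and assigns its own transaction,
		 * which is why no locks may be held across the call.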
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 *	IN:	dvp	- vnode of directory to remove entry from.
 *		name	- name of entry to remove.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of
	 * space, unless object removal will be delayed indefinitely
	 * (due to active holds on the vnode due to the file being open).
	 */
	if (may_delete_now)
		dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (xzp)
			VN_RELE(ZTOV(xzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
		    acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(xzp->z_links, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &xzp->z_links, sizeof (xzp->z_links), tx);
			ASSERT3U(error, ==, 0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		mutex_enter(&vp->v_lock);
		vp->v_count--;
		ASSERT0(vp->v_count);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now)
		VN_RELE(vp);
	if (xzp)
		VN_RELE(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 *	IN:	dvp	- vnode of directory to add subdir to.
 *		dirname	- name of new directory.
 *		vap	- attributes of new directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created directory.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
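	 * The holds below cover the parent's ZAP (to add the new name),
	 * a new ZAP object for the directory itself, and enough SA space
	 * for the new znode's attributes and ACL.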
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 *	IN:	dvp	- vnode of directory to remove from.
 *		name	- name of directory to be removed.
 *		cwd	- vnode of current working directory.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	if (vp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap are always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
2173 */ 2174 if (uio->uio_iov->iov_len <= 0) { 2175 ZFS_EXIT(zfsvfs); 2176 return (SET_ERROR(EINVAL)); 2177 } 2178 2179 /* 2180 * Quit if directory has been removed (posix) 2181 */ 2182 if ((*eofp = zp->z_unlinked) != 0) { 2183 ZFS_EXIT(zfsvfs); 2184 return (0); 2185 } 2186 2187 error = 0; 2188 os = zfsvfs->z_os; 2189 offset = uio->uio_loffset; 2190 prefetch = zp->z_zn_prefetch; 2191 2192 /* 2193 * Initialize the iterator cursor. 2194 */ 2195 if (offset <= 3) { 2196 /* 2197 * Start iteration from the beginning of the directory. 2198 */ 2199 zap_cursor_init(&zc, os, zp->z_id); 2200 } else { 2201 /* 2202 * The offset is a serialized cursor. 2203 */ 2204 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2205 } 2206 2207 /* 2208 * Get space to change directory entries into fs independent format. 2209 */ 2210 iovp = uio->uio_iov; 2211 bytes_wanted = iovp->iov_len; 2212 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2213 bufsize = bytes_wanted; 2214 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2215 odp = (struct dirent64 *)outbuf; 2216 } else { 2217 bufsize = bytes_wanted; 2218 outbuf = NULL; 2219 odp = (struct dirent64 *)iovp->iov_base; 2220 } 2221 eodp = (struct edirent *)odp; 2222 2223 /* 2224 * If this VFS supports the system attribute view interface; and 2225 * we're looking at an extended attribute directory; and we care 2226 * about normalization conflicts on this vfs; then we must check 2227 * for normalization conflicts with the sysattr name space. 2228 */ 2229 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2230 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2231 (flags & V_RDDIR_ENTFLAGS); 2232 2233 /* 2234 * Transform to file-system independent format 2235 */ 2236 outcount = 0; 2237 while (outcount < bytes_wanted) { 2238 ino64_t objnum; 2239 ushort_t reclen; 2240 off64_t *next = NULL; 2241 2242 /* 2243 * Special case `.', `..', and `.zfs'. 2244 */ 2245 if (offset == 0) { 2246 (void) strcpy(zap.za_name, "."); 2247 zap.za_normalization_conflict = 0; 2248 objnum = zp->z_id; 2249 } else if (offset == 1) { 2250 (void) strcpy(zap.za_name, ".."); 2251 zap.za_normalization_conflict = 0; 2252 objnum = parent; 2253 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2254 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2255 zap.za_normalization_conflict = 0; 2256 objnum = ZFSCTL_INO_ROOT; 2257 } else { 2258 /* 2259 * Grab next entry. 
2260 */ 2261 if (error = zap_cursor_retrieve(&zc, &zap)) { 2262 if ((*eofp = (error == ENOENT)) != 0) 2263 break; 2264 else 2265 goto update; 2266 } 2267 2268 if (zap.za_integer_length != 8 || 2269 zap.za_num_integers != 1) { 2270 cmn_err(CE_WARN, "zap_readdir: bad directory " 2271 "entry, obj = %lld, offset = %lld\n", 2272 (u_longlong_t)zp->z_id, 2273 (u_longlong_t)offset); 2274 error = SET_ERROR(ENXIO); 2275 goto update; 2276 } 2277 2278 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2279 /* 2280 * MacOS X can extract the object type here such as: 2281 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2282 */ 2283 2284 if (check_sysattrs && !zap.za_normalization_conflict) { 2285 zap.za_normalization_conflict = 2286 xattr_sysattr_casechk(zap.za_name); 2287 } 2288 } 2289 2290 if (flags & V_RDDIR_ACCFILTER) { 2291 /* 2292 * If we have no access at all, don't include 2293 * this entry in the returned information 2294 */ 2295 znode_t *ezp; 2296 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2297 goto skip_entry; 2298 if (!zfs_has_access(ezp, cr)) { 2299 VN_RELE(ZTOV(ezp)); 2300 goto skip_entry; 2301 } 2302 VN_RELE(ZTOV(ezp)); 2303 } 2304 2305 if (flags & V_RDDIR_ENTFLAGS) 2306 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2307 else 2308 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2309 2310 /* 2311 * Will this entry fit in the buffer? 2312 */ 2313 if (outcount + reclen > bufsize) { 2314 /* 2315 * Did we manage to fit anything in the buffer? 2316 */ 2317 if (!outcount) { 2318 error = SET_ERROR(EINVAL); 2319 goto update; 2320 } 2321 break; 2322 } 2323 if (flags & V_RDDIR_ENTFLAGS) { 2324 /* 2325 * Add extended flag entry: 2326 */ 2327 eodp->ed_ino = objnum; 2328 eodp->ed_reclen = reclen; 2329 /* NOTE: ed_off is the offset for the *next* entry */ 2330 next = &(eodp->ed_off); 2331 eodp->ed_eflags = zap.za_normalization_conflict ? 2332 ED_CASE_CONFLICT : 0; 2333 (void) strncpy(eodp->ed_name, zap.za_name, 2334 EDIRENT_NAMELEN(reclen)); 2335 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2336 } else { 2337 /* 2338 * Add normal entry: 2339 */ 2340 odp->d_ino = objnum; 2341 odp->d_reclen = reclen; 2342 /* NOTE: d_off is the offset for the *next* entry */ 2343 next = &(odp->d_off); 2344 (void) strncpy(odp->d_name, zap.za_name, 2345 DIRENT64_NAMELEN(reclen)); 2346 odp = (dirent64_t *)((intptr_t)odp + reclen); 2347 } 2348 outcount += reclen; 2349 2350 ASSERT(outcount <= bufsize); 2351 2352 /* Prefetch znode */ 2353 if (prefetch) 2354 dmu_prefetch(os, objnum, 0, 0); 2355 2356 skip_entry: 2357 /* 2358 * Move to the next entry, fill in the previous offset. 2359 */ 2360 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2361 zap_cursor_advance(&zc); 2362 offset = zap_cursor_serialize(&zc); 2363 } else { 2364 offset += 1; 2365 } 2366 if (next) 2367 *next = offset; 2368 } 2369 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2370 2371 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2372 iovp->iov_base += outcount; 2373 iovp->iov_len -= outcount; 2374 uio->uio_resid -= outcount; 2375 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2376 /* 2377 * Reset the pointer. 
2378 */ 2379 offset = uio->uio_loffset; 2380 } 2381 2382 update: 2383 zap_cursor_fini(&zc); 2384 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2385 kmem_free(outbuf, bufsize); 2386 2387 if (error == ENOENT) 2388 error = 0; 2389 2390 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2391 2392 uio->uio_loffset = offset; 2393 ZFS_EXIT(zfsvfs); 2394 return (error); 2395 } 2396 2397 ulong_t zfs_fsync_sync_cnt = 4; 2398 2399 static int 2400 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2401 { 2402 znode_t *zp = VTOZ(vp); 2403 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2404 2405 /* 2406 * Regardless of whether this is required for standards conformance, 2407 * this is the logical behavior when fsync() is called on a file with 2408 * dirty pages. We use B_ASYNC since the ZIL transactions are already 2409 * going to be pushed out as part of the zil_commit(). 2410 */ 2411 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 2412 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 2413 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct); 2414 2415 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2416 2417 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2418 ZFS_ENTER(zfsvfs); 2419 ZFS_VERIFY_ZP(zp); 2420 zil_commit(zfsvfs->z_log, zp->z_id); 2421 ZFS_EXIT(zfsvfs); 2422 } 2423 return (0); 2424 } 2425 2426 2427 /* 2428 * Get the requested file attributes and place them in the provided 2429 * vattr structure. 2430 * 2431 * IN: vp - vnode of file. 2432 * vap - va_mask identifies requested attributes. 2433 * If AT_XVATTR set, then optional attrs are requested 2434 * flags - ATTR_NOACLCHECK (CIFS server context) 2435 * cr - credentials of caller. 2436 * ct - caller context 2437 * 2438 * OUT: vap - attribute values. 2439 * 2440 * RETURN: 0 (always succeeds). 2441 */ 2442 /* ARGSUSED */ 2443 static int 2444 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2445 caller_context_t *ct) 2446 { 2447 znode_t *zp = VTOZ(vp); 2448 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2449 int error = 0; 2450 uint64_t links; 2451 uint64_t mtime[2], ctime[2]; 2452 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2453 xoptattr_t *xoap = NULL; 2454 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2455 sa_bulk_attr_t bulk[2]; 2456 int count = 0; 2457 2458 ZFS_ENTER(zfsvfs); 2459 ZFS_VERIFY_ZP(zp); 2460 2461 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2462 2463 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2464 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2465 2466 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2467 ZFS_EXIT(zfsvfs); 2468 return (error); 2469 } 2470 2471 /* 2472 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2473 * Also, if we are the owner don't bother, since owner should 2474 * always be allowed to read basic attributes of file. 2475 */ 2476 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2477 (vap->va_uid != crgetuid(cr))) { 2478 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2479 skipaclchk, cr)) { 2480 ZFS_EXIT(zfsvfs); 2481 return (error); 2482 } 2483 } 2484 2485 /* 2486 * Return all attributes. It's cheaper to provide the answer 2487 * than to determine whether we were asked the question. 
2488 */ 2489 2490 mutex_enter(&zp->z_lock); 2491 vap->va_type = vp->v_type; 2492 vap->va_mode = zp->z_mode & MODEMASK; 2493 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2494 vap->va_nodeid = zp->z_id; 2495 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2496 links = zp->z_links + 1; 2497 else 2498 links = zp->z_links; 2499 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 2500 vap->va_size = zp->z_size; 2501 vap->va_rdev = vp->v_rdev; 2502 vap->va_seq = zp->z_seq; 2503 2504 /* 2505 * Add in any requested optional attributes and the create time. 2506 * Also set the corresponding bits in the returned attribute bitmap. 2507 */ 2508 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2509 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2510 xoap->xoa_archive = 2511 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2512 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2513 } 2514 2515 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2516 xoap->xoa_readonly = 2517 ((zp->z_pflags & ZFS_READONLY) != 0); 2518 XVA_SET_RTN(xvap, XAT_READONLY); 2519 } 2520 2521 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2522 xoap->xoa_system = 2523 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2524 XVA_SET_RTN(xvap, XAT_SYSTEM); 2525 } 2526 2527 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2528 xoap->xoa_hidden = 2529 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2530 XVA_SET_RTN(xvap, XAT_HIDDEN); 2531 } 2532 2533 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2534 xoap->xoa_nounlink = 2535 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2536 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2537 } 2538 2539 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2540 xoap->xoa_immutable = 2541 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2542 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2543 } 2544 2545 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2546 xoap->xoa_appendonly = 2547 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2548 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2549 } 2550 2551 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2552 xoap->xoa_nodump = 2553 ((zp->z_pflags & ZFS_NODUMP) != 0); 2554 XVA_SET_RTN(xvap, XAT_NODUMP); 2555 } 2556 2557 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2558 xoap->xoa_opaque = 2559 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2560 XVA_SET_RTN(xvap, XAT_OPAQUE); 2561 } 2562 2563 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2564 xoap->xoa_av_quarantined = 2565 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2566 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2567 } 2568 2569 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2570 xoap->xoa_av_modified = 2571 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 2572 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 2573 } 2574 2575 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 2576 vp->v_type == VREG) { 2577 zfs_sa_get_scanstamp(zp, xvap); 2578 } 2579 2580 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 2581 uint64_t times[2]; 2582 2583 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 2584 times, sizeof (times)); 2585 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 2586 XVA_SET_RTN(xvap, XAT_CREATETIME); 2587 } 2588 2589 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2590 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 2591 XVA_SET_RTN(xvap, XAT_REPARSE); 2592 } 2593 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 2594 xoap->xoa_generation = zp->z_gen; 2595 XVA_SET_RTN(xvap, XAT_GEN); 2596 } 2597 2598 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 2599 xoap->xoa_offline = 2600 ((zp->z_pflags & ZFS_OFFLINE) != 0); 2601 XVA_SET_RTN(xvap, XAT_OFFLINE); 2602 } 2603 2604 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 2605 xoap->xoa_sparse = 2606 ((zp->z_pflags & ZFS_SPARSE) != 0); 2607 XVA_SET_RTN(xvap, XAT_SPARSE); 2608 } 2609 } 2610 2611 
ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2612 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2613 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2614 
2615 	mutex_exit(&zp->z_lock);
2616 
2617 	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2618 
2619 	if (zp->z_blksz == 0) {
2620 		/*
2621 		 * Block size hasn't been set; suggest maximal I/O transfers.
2622 		 */
2623 		vap->va_blksize = zfsvfs->z_max_blksz;
2624 	}
2625 
2626 	ZFS_EXIT(zfsvfs);
2627 	return (0);
2628 }
2629 
2630 /*
2631  * Set the file attributes to the values contained in the
2632  * vattr structure.
2633  *
2634  * IN:	vp	- vnode of file to be modified.
2635  *	vap	- new attribute values.
2636  *		  If AT_XVATTR set, then optional attrs are being set
2637  *	flags	- ATTR_UTIME set if non-default time values provided.
2638  *		- ATTR_NOACLCHECK (CIFS context only).
2639  *	cr	- credentials of caller.
2640  *	ct	- caller context
2641  *
2642  * RETURN:	0 on success, error code on failure.
2643  *
2644  * Timestamps:
2645  *	vp - ctime updated, mtime updated if size changed.
2646  */
2647 /* ARGSUSED */
2648 static int
2649 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2650     caller_context_t *ct)
2651 {
2652 	znode_t		*zp = VTOZ(vp);
2653 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2654 	zilog_t		*zilog;
2655 	dmu_tx_t	*tx;
2656 	vattr_t		oldva;
2657 	xvattr_t	tmpxvattr;
2658 	uint_t		mask = vap->va_mask;
2659 	uint_t		saved_mask = 0;
2660 	int		trim_mask = 0;
2661 	uint64_t	new_mode;
2662 	uint64_t	new_uid, new_gid;
2663 	uint64_t	xattr_obj;
2664 	uint64_t	mtime[2], ctime[2];
2665 	znode_t		*attrzp;
2666 	int		need_policy = FALSE;
2667 	int		err, err2;
2668 	zfs_fuid_info_t *fuidp = NULL;
2669 	xvattr_t	*xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2670 	xoptattr_t	*xoap;
2671 	zfs_acl_t	*aclp;
2672 	boolean_t	skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2673 	boolean_t	fuid_dirtied = B_FALSE;
2674 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2675 	int		count = 0, xattr_count = 0;
2676 
2677 	if (mask == 0)
2678 		return (0);
2679 
2680 	if (mask & AT_NOSET)
2681 		return (SET_ERROR(EINVAL));
2682 
2683 	ZFS_ENTER(zfsvfs);
2684 	ZFS_VERIFY_ZP(zp);
2685 
2686 	zilog = zfsvfs->z_log;
2687 
2688 	/*
2689 	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2690 	 * the file system is at the proper version level.
2691 	 */
2692 
2693 	if (zfsvfs->z_use_fuids == B_FALSE &&
2694 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2695 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2696 	    (mask & AT_XVATTR))) {
2697 		ZFS_EXIT(zfsvfs);
2698 		return (SET_ERROR(EINVAL));
2699 	}
2700 
2701 	if (mask & AT_SIZE && vp->v_type == VDIR) {
2702 		ZFS_EXIT(zfsvfs);
2703 		return (SET_ERROR(EISDIR));
2704 	}
2705 
2706 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2707 		ZFS_EXIT(zfsvfs);
2708 		return (SET_ERROR(EINVAL));
2709 	}
2710 
2711 	/*
2712 	 * If this is an xvattr_t, then get a pointer to the structure of
2713 	 * optional attributes.  If this is NULL, then we have a vattr_t.
2714 	 */
2715 	xoap = xva_getxoptattr(xvap);
2716 
2717 	xva_init(&tmpxvattr);
2718 
2719 	/*
2720 	 * Immutable files can only have their immutable bit and atime altered.
2721 	 */
2722 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2723 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2724 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2725 		ZFS_EXIT(zfsvfs);
2726 		return (SET_ERROR(EPERM));
2727 	}
2728 
2729 	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2730 		ZFS_EXIT(zfsvfs);
2731 		return (SET_ERROR(EPERM));
2732 	}
2733 
2734 	/*
2735 	 * Verify that the timestamps don't overflow 32 bits.
2736 * ZFS can handle large timestamps, but 32bit syscalls can't 2737 * handle times greater than 2039. This check should be removed 2738 * once large timestamps are fully supported. 2739 */ 2740 if (mask & (AT_ATIME | AT_MTIME)) { 2741 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2742 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2743 ZFS_EXIT(zfsvfs); 2744 return (SET_ERROR(EOVERFLOW)); 2745 } 2746 } 2747 2748 top: 2749 attrzp = NULL; 2750 aclp = NULL; 2751 2752 /* Can this be moved to before the top label? */ 2753 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 2754 ZFS_EXIT(zfsvfs); 2755 return (SET_ERROR(EROFS)); 2756 } 2757 2758 /* 2759 * First validate permissions 2760 */ 2761 2762 if (mask & AT_SIZE) { 2763 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); 2764 if (err) { 2765 ZFS_EXIT(zfsvfs); 2766 return (err); 2767 } 2768 /* 2769 * XXX - Note, we are not providing any open 2770 * mode flags here (like FNDELAY), so we may 2771 * block if there are locks present... this 2772 * should be addressed in openat(). 2773 */ 2774 /* XXX - would it be OK to generate a log record here? */ 2775 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2776 if (err) { 2777 ZFS_EXIT(zfsvfs); 2778 return (err); 2779 } 2780 2781 if (vap->va_size == 0) 2782 vnevent_truncate(ZTOV(zp), ct); 2783 } 2784 2785 if (mask & (AT_ATIME|AT_MTIME) || 2786 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2787 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2788 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2789 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2790 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2791 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2792 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2793 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2794 skipaclchk, cr); 2795 } 2796 2797 if (mask & (AT_UID|AT_GID)) { 2798 int idmask = (mask & (AT_UID|AT_GID)); 2799 int take_owner; 2800 int take_group; 2801 2802 /* 2803 * NOTE: even if a new mode is being set, 2804 * we may clear S_ISUID/S_ISGID bits. 2805 */ 2806 2807 if (!(mask & AT_MODE)) 2808 vap->va_mode = zp->z_mode; 2809 2810 /* 2811 * Take ownership or chgrp to group we are a member of 2812 */ 2813 2814 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 2815 take_group = (mask & AT_GID) && 2816 zfs_groupmember(zfsvfs, vap->va_gid, cr); 2817 2818 /* 2819 * If both AT_UID and AT_GID are set then take_owner and 2820 * take_group must both be set in order to allow taking 2821 * ownership. 2822 * 2823 * Otherwise, send the check through secpolicy_vnode_setattr() 2824 * 2825 */ 2826 2827 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 2828 ((idmask == AT_UID) && take_owner) || 2829 ((idmask == AT_GID) && take_group)) { 2830 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2831 skipaclchk, cr) == 0) { 2832 /* 2833 * Remove setuid/setgid for non-privileged users 2834 */ 2835 secpolicy_setid_clear(vap, cr); 2836 trim_mask = (mask & (AT_UID|AT_GID)); 2837 } else { 2838 need_policy = TRUE; 2839 } 2840 } else { 2841 need_policy = TRUE; 2842 } 2843 } 2844 2845 mutex_enter(&zp->z_lock); 2846 oldva.va_mode = zp->z_mode; 2847 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2848 if (mask & AT_XVATTR) { 2849 /* 2850 * Update xvattr mask to include only those attributes 2851 * that are actually changing. 2852 * 2853 * the bits will be restored prior to actually setting 2854 * the attributes so the caller thinks they were set. 
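		 *
		 * Each optional attribute handled below follows the same
		 * shape, roughly:
		 *
		 *	if (XVA_ISSET_REQ(xvap, XAT_FOO)) {
		 *		if (requested != current)
		 *			need_policy = TRUE;	// real change
		 *		else {
		 *			XVA_CLR_REQ(xvap, XAT_FOO);
		 *			XVA_SET_REQ(&tmpxvattr, XAT_FOO);
		 *		}
		 *	}
		 *
		 * where XAT_FOO stands in for the real XAT_* bits below.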
2855 */ 2856 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2857 if (xoap->xoa_appendonly != 2858 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2859 need_policy = TRUE; 2860 } else { 2861 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2862 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 2863 } 2864 } 2865 2866 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2867 if (xoap->xoa_nounlink != 2868 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2869 need_policy = TRUE; 2870 } else { 2871 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2872 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 2873 } 2874 } 2875 2876 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2877 if (xoap->xoa_immutable != 2878 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2879 need_policy = TRUE; 2880 } else { 2881 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2882 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 2883 } 2884 } 2885 2886 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2887 if (xoap->xoa_nodump != 2888 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2889 need_policy = TRUE; 2890 } else { 2891 XVA_CLR_REQ(xvap, XAT_NODUMP); 2892 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 2893 } 2894 } 2895 2896 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2897 if (xoap->xoa_av_modified != 2898 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2899 need_policy = TRUE; 2900 } else { 2901 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2902 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 2903 } 2904 } 2905 2906 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2907 if ((vp->v_type != VREG && 2908 xoap->xoa_av_quarantined) || 2909 xoap->xoa_av_quarantined != 2910 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2911 need_policy = TRUE; 2912 } else { 2913 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2914 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 2915 } 2916 } 2917 2918 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2919 mutex_exit(&zp->z_lock); 2920 ZFS_EXIT(zfsvfs); 2921 return (SET_ERROR(EPERM)); 2922 } 2923 2924 if (need_policy == FALSE && 2925 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2926 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2927 need_policy = TRUE; 2928 } 2929 } 2930 2931 mutex_exit(&zp->z_lock); 2932 2933 if (mask & AT_MODE) { 2934 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 2935 err = secpolicy_setid_setsticky_clear(vp, vap, 2936 &oldva, cr); 2937 if (err) { 2938 ZFS_EXIT(zfsvfs); 2939 return (err); 2940 } 2941 trim_mask |= AT_MODE; 2942 } else { 2943 need_policy = TRUE; 2944 } 2945 } 2946 2947 if (need_policy) { 2948 /* 2949 * If trim_mask is set then take ownership 2950 * has been granted or write_acl is present and user 2951 * has the ability to modify mode. In that case remove 2952 * UID|GID and or MODE from mask so that 2953 * secpolicy_vnode_setattr() doesn't revoke it. 
2954 */ 2955 2956 if (trim_mask) { 2957 saved_mask = vap->va_mask; 2958 vap->va_mask &= ~trim_mask; 2959 } 2960 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 2961 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 2962 if (err) { 2963 ZFS_EXIT(zfsvfs); 2964 return (err); 2965 } 2966 2967 if (trim_mask) 2968 vap->va_mask |= saved_mask; 2969 } 2970 2971 /* 2972 * secpolicy_vnode_setattr, or take ownership may have 2973 * changed va_mask 2974 */ 2975 mask = vap->va_mask; 2976 2977 if ((mask & (AT_UID | AT_GID))) { 2978 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2979 &xattr_obj, sizeof (xattr_obj)); 2980 2981 if (err == 0 && xattr_obj) { 2982 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 2983 if (err) 2984 goto out2; 2985 } 2986 if (mask & AT_UID) { 2987 new_uid = zfs_fuid_create(zfsvfs, 2988 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2989 if (new_uid != zp->z_uid && 2990 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 2991 if (attrzp) 2992 VN_RELE(ZTOV(attrzp)); 2993 err = SET_ERROR(EDQUOT); 2994 goto out2; 2995 } 2996 } 2997 2998 if (mask & AT_GID) { 2999 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3000 cr, ZFS_GROUP, &fuidp); 3001 if (new_gid != zp->z_gid && 3002 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3003 if (attrzp) 3004 VN_RELE(ZTOV(attrzp)); 3005 err = SET_ERROR(EDQUOT); 3006 goto out2; 3007 } 3008 } 3009 } 3010 tx = dmu_tx_create(zfsvfs->z_os); 3011 3012 if (mask & AT_MODE) { 3013 uint64_t pmode = zp->z_mode; 3014 uint64_t acl_obj; 3015 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3016 3017 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3018 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3019 err = SET_ERROR(EPERM); 3020 goto out; 3021 } 3022 3023 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3024 goto out; 3025 3026 mutex_enter(&zp->z_lock); 3027 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3028 /* 3029 * Are we upgrading ACL from old V0 format 3030 * to V1 format? 3031 */ 3032 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3033 zfs_znode_acl_version(zp) == 3034 ZFS_ACL_VERSION_INITIAL) { 3035 dmu_tx_hold_free(tx, acl_obj, 0, 3036 DMU_OBJECT_END); 3037 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3038 0, aclp->z_acl_bytes); 3039 } else { 3040 dmu_tx_hold_write(tx, acl_obj, 0, 3041 aclp->z_acl_bytes); 3042 } 3043 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3044 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3045 0, aclp->z_acl_bytes); 3046 } 3047 mutex_exit(&zp->z_lock); 3048 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3049 } else { 3050 if ((mask & AT_XVATTR) && 3051 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3052 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3053 else 3054 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3055 } 3056 3057 if (attrzp) { 3058 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3059 } 3060 3061 fuid_dirtied = zfsvfs->z_fuid_dirty; 3062 if (fuid_dirtied) 3063 zfs_fuid_txhold(zfsvfs, tx); 3064 3065 zfs_sa_upgrade_txholds(tx, zp); 3066 3067 err = dmu_tx_assign(tx, TXG_WAIT); 3068 if (err) 3069 goto out; 3070 3071 count = 0; 3072 /* 3073 * Set each attribute requested. 3074 * We group settings according to the locks they need to acquire. 3075 * 3076 * Note: you cannot set ctime directly, although it will be 3077 * updated as a side-effect of calling this function. 
3078 */ 3079 3080 3081 if (mask & (AT_UID|AT_GID|AT_MODE)) 3082 mutex_enter(&zp->z_acl_lock); 3083 mutex_enter(&zp->z_lock); 3084 3085 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3086 &zp->z_pflags, sizeof (zp->z_pflags)); 3087 3088 if (attrzp) { 3089 if (mask & (AT_UID|AT_GID|AT_MODE)) 3090 mutex_enter(&attrzp->z_acl_lock); 3091 mutex_enter(&attrzp->z_lock); 3092 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3093 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3094 sizeof (attrzp->z_pflags)); 3095 } 3096 3097 if (mask & (AT_UID|AT_GID)) { 3098 3099 if (mask & AT_UID) { 3100 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3101 &new_uid, sizeof (new_uid)); 3102 zp->z_uid = new_uid; 3103 if (attrzp) { 3104 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3105 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3106 sizeof (new_uid)); 3107 attrzp->z_uid = new_uid; 3108 } 3109 } 3110 3111 if (mask & AT_GID) { 3112 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3113 NULL, &new_gid, sizeof (new_gid)); 3114 zp->z_gid = new_gid; 3115 if (attrzp) { 3116 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3117 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3118 sizeof (new_gid)); 3119 attrzp->z_gid = new_gid; 3120 } 3121 } 3122 if (!(mask & AT_MODE)) { 3123 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3124 NULL, &new_mode, sizeof (new_mode)); 3125 new_mode = zp->z_mode; 3126 } 3127 err = zfs_acl_chown_setattr(zp); 3128 ASSERT(err == 0); 3129 if (attrzp) { 3130 err = zfs_acl_chown_setattr(attrzp); 3131 ASSERT(err == 0); 3132 } 3133 } 3134 3135 if (mask & AT_MODE) { 3136 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3137 &new_mode, sizeof (new_mode)); 3138 zp->z_mode = new_mode; 3139 ASSERT3U((uintptr_t)aclp, !=, NULL); 3140 err = zfs_aclset_common(zp, aclp, cr, tx); 3141 ASSERT0(err); 3142 if (zp->z_acl_cached) 3143 zfs_acl_free(zp->z_acl_cached); 3144 zp->z_acl_cached = aclp; 3145 aclp = NULL; 3146 } 3147 3148 3149 if (mask & AT_ATIME) { 3150 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3151 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3152 &zp->z_atime, sizeof (zp->z_atime)); 3153 } 3154 3155 if (mask & AT_MTIME) { 3156 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3157 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3158 mtime, sizeof (mtime)); 3159 } 3160 3161 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3162 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3163 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3164 NULL, mtime, sizeof (mtime)); 3165 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3166 &ctime, sizeof (ctime)); 3167 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3168 B_TRUE); 3169 } else if (mask != 0) { 3170 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3171 &ctime, sizeof (ctime)); 3172 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3173 B_TRUE); 3174 if (attrzp) { 3175 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3176 SA_ZPL_CTIME(zfsvfs), NULL, 3177 &ctime, sizeof (ctime)); 3178 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3179 mtime, ctime, B_TRUE); 3180 } 3181 } 3182 /* 3183 * Do this after setting timestamps to prevent timestamp 3184 * update from toggling bit 3185 */ 3186 3187 if (xoap && (mask & AT_XVATTR)) { 3188 3189 /* 3190 * restore trimmed off masks 3191 * so that return masks can be set for caller. 
3192 */ 3193 3194 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3195 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3196 } 3197 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3198 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3199 } 3200 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3201 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3202 } 3203 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3204 XVA_SET_REQ(xvap, XAT_NODUMP); 3205 } 3206 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3207 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3208 } 3209 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3210 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3211 } 3212 3213 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3214 ASSERT(vp->v_type == VREG); 3215 3216 zfs_xvattr_set(zp, xvap, tx); 3217 } 3218 3219 if (fuid_dirtied) 3220 zfs_fuid_sync(zfsvfs, tx); 3221 3222 if (mask != 0) 3223 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3224 3225 mutex_exit(&zp->z_lock); 3226 if (mask & (AT_UID|AT_GID|AT_MODE)) 3227 mutex_exit(&zp->z_acl_lock); 3228 3229 if (attrzp) { 3230 if (mask & (AT_UID|AT_GID|AT_MODE)) 3231 mutex_exit(&attrzp->z_acl_lock); 3232 mutex_exit(&attrzp->z_lock); 3233 } 3234 out: 3235 if (err == 0 && attrzp) { 3236 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3237 xattr_count, tx); 3238 ASSERT(err2 == 0); 3239 } 3240 3241 if (attrzp) 3242 VN_RELE(ZTOV(attrzp)); 3243 3244 if (aclp) 3245 zfs_acl_free(aclp); 3246 3247 if (fuidp) { 3248 zfs_fuid_info_free(fuidp); 3249 fuidp = NULL; 3250 } 3251 3252 if (err) { 3253 dmu_tx_abort(tx); 3254 if (err == ERESTART) 3255 goto top; 3256 } else { 3257 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3258 dmu_tx_commit(tx); 3259 } 3260 3261 out2: 3262 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3263 zil_commit(zilog, 0); 3264 3265 ZFS_EXIT(zfsvfs); 3266 return (err); 3267 } 3268 3269 typedef struct zfs_zlock { 3270 krwlock_t *zl_rwlock; /* lock we acquired */ 3271 znode_t *zl_znode; /* znode we held */ 3272 struct zfs_zlock *zl_next; /* next in list */ 3273 } zfs_zlock_t; 3274 3275 /* 3276 * Drop locks and release vnodes that were held by zfs_rename_lock(). 3277 */ 3278 static void 3279 zfs_rename_unlock(zfs_zlock_t **zlpp) 3280 { 3281 zfs_zlock_t *zl; 3282 3283 while ((zl = *zlpp) != NULL) { 3284 if (zl->zl_znode != NULL) 3285 VN_RELE(ZTOV(zl->zl_znode)); 3286 rw_exit(zl->zl_rwlock); 3287 *zlpp = zl->zl_next; 3288 kmem_free(zl, sizeof (*zl)); 3289 } 3290 } 3291 3292 /* 3293 * Search back through the directory tree, using the ".." entries. 3294 * Lock each directory in the chain to prevent concurrent renames. 3295 * Fail any attempt to move a directory into one of its own descendants. 3296 * XXX - z_parent_lock can overlap with map or grow locks 3297 */ 3298 static int 3299 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3300 { 3301 zfs_zlock_t *zl; 3302 znode_t *zp = tdzp; 3303 uint64_t rootid = zp->z_zfsvfs->z_root; 3304 uint64_t oidp = zp->z_id; 3305 krwlock_t *rwlp = &szp->z_parent_lock; 3306 krw_t rw = RW_WRITER; 3307 3308 /* 3309 * First pass write-locks szp and compares to zp->z_id. 3310 * Later passes read-lock zp and compare to zp->z_parent. 3311 */ 3312 do { 3313 if (!rw_tryenter(rwlp, rw)) { 3314 /* 3315 * Another thread is renaming in this path. 3316 * Note that if we are a WRITER, we don't have any 3317 * parent_locks held yet. 
3318 */ 3319 if (rw == RW_READER && zp->z_id > szp->z_id) { 3320 /* 3321 * Drop our locks and restart 3322 */ 3323 zfs_rename_unlock(&zl); 3324 *zlpp = NULL; 3325 zp = tdzp; 3326 oidp = zp->z_id; 3327 rwlp = &szp->z_parent_lock; 3328 rw = RW_WRITER; 3329 continue; 3330 } else { 3331 /* 3332 * Wait for other thread to drop its locks 3333 */ 3334 rw_enter(rwlp, rw); 3335 } 3336 } 3337 3338 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3339 zl->zl_rwlock = rwlp; 3340 zl->zl_znode = NULL; 3341 zl->zl_next = *zlpp; 3342 *zlpp = zl; 3343 3344 if (oidp == szp->z_id) /* We're a descendant of szp */ 3345 return (SET_ERROR(EINVAL)); 3346 3347 if (oidp == rootid) /* We've hit the top */ 3348 return (0); 3349 3350 if (rw == RW_READER) { /* i.e. not the first pass */ 3351 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3352 if (error) 3353 return (error); 3354 zl->zl_znode = zp; 3355 } 3356 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3357 &oidp, sizeof (oidp)); 3358 rwlp = &zp->z_parent_lock; 3359 rw = RW_READER; 3360 3361 } while (zp->z_id != sdzp->z_id); 3362 3363 return (0); 3364 } 3365 3366 /* 3367 * Move an entry from the provided source directory to the target 3368 * directory. Change the entry name as indicated. 3369 * 3370 * IN: sdvp - Source directory containing the "old entry". 3371 * snm - Old entry name. 3372 * tdvp - Target directory to contain the "new entry". 3373 * tnm - New entry name. 3374 * cr - credentials of caller. 3375 * ct - caller context 3376 * flags - case flags 3377 * 3378 * RETURN: 0 on success, error code on failure. 3379 * 3380 * Timestamps: 3381 * sdvp,tdvp - ctime|mtime updated 3382 */ 3383 /*ARGSUSED*/ 3384 static int 3385 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3386 caller_context_t *ct, int flags) 3387 { 3388 znode_t *tdzp, *szp, *tzp; 3389 znode_t *sdzp = VTOZ(sdvp); 3390 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 3391 zilog_t *zilog; 3392 vnode_t *realvp; 3393 zfs_dirlock_t *sdl, *tdl; 3394 dmu_tx_t *tx; 3395 zfs_zlock_t *zl; 3396 int cmp, serr, terr; 3397 int error = 0; 3398 int zflg = 0; 3399 boolean_t waited = B_FALSE; 3400 3401 ZFS_ENTER(zfsvfs); 3402 ZFS_VERIFY_ZP(sdzp); 3403 zilog = zfsvfs->z_log; 3404 3405 /* 3406 * Make sure we have the real vp for the target directory. 3407 */ 3408 if (VOP_REALVP(tdvp, &realvp, ct) == 0) 3409 tdvp = realvp; 3410 3411 tdzp = VTOZ(tdvp); 3412 ZFS_VERIFY_ZP(tdzp); 3413 3414 /* 3415 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3416 * ctldir appear to have the same v_vfsp. 3417 */ 3418 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3419 ZFS_EXIT(zfsvfs); 3420 return (SET_ERROR(EXDEV)); 3421 } 3422 3423 if (zfsvfs->z_utf8 && u8_validate(tnm, 3424 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3425 ZFS_EXIT(zfsvfs); 3426 return (SET_ERROR(EILSEQ)); 3427 } 3428 3429 if (flags & FIGNORECASE) 3430 zflg |= ZCILOOK; 3431 3432 top: 3433 szp = NULL; 3434 tzp = NULL; 3435 zl = NULL; 3436 3437 /* 3438 * This is to prevent the creation of links into attribute space 3439 * by renaming a linked file into/outof an attribute directory. 3440 * See the comment in zfs_link() for why this is considered bad. 3441 */ 3442 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3443 ZFS_EXIT(zfsvfs); 3444 return (SET_ERROR(EINVAL)); 3445 } 3446 3447 /* 3448 * Lock source and target directory entries. To prevent deadlock, 3449 * a lock ordering must be defined. 
We lock the directory with 3450 * the smallest object id first, or if it's a tie, the one with 3451 * the lexically first name. 3452 */ 3453 if (sdzp->z_id < tdzp->z_id) { 3454 cmp = -1; 3455 } else if (sdzp->z_id > tdzp->z_id) { 3456 cmp = 1; 3457 } else { 3458 /* 3459 * First compare the two name arguments without 3460 * considering any case folding. 3461 */ 3462 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3463 3464 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3465 ASSERT(error == 0 || !zfsvfs->z_utf8); 3466 if (cmp == 0) { 3467 /* 3468 * POSIX: "If the old argument and the new argument 3469 * both refer to links to the same existing file, 3470 * the rename() function shall return successfully 3471 * and perform no other action." 3472 */ 3473 ZFS_EXIT(zfsvfs); 3474 return (0); 3475 } 3476 /* 3477 * If the file system is case-folding, then we may 3478 * have some more checking to do. A case-folding file 3479 * system is either supporting mixed case sensitivity 3480 * access or is completely case-insensitive. Note 3481 * that the file system is always case preserving. 3482 * 3483 * In mixed sensitivity mode case sensitive behavior 3484 * is the default. FIGNORECASE must be used to 3485 * explicitly request case insensitive behavior. 3486 * 3487 * If the source and target names provided differ only 3488 * by case (e.g., a request to rename 'tim' to 'Tim'), 3489 * we will treat this as a special case in the 3490 * case-insensitive mode: as long as the source name 3491 * is an exact match, we will allow this to proceed as 3492 * a name-change request. 3493 */ 3494 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3495 (zfsvfs->z_case == ZFS_CASE_MIXED && 3496 flags & FIGNORECASE)) && 3497 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3498 &error) == 0) { 3499 /* 3500 * case preserving rename request, require exact 3501 * name matches 3502 */ 3503 zflg |= ZCIEXACT; 3504 zflg &= ~ZCILOOK; 3505 } 3506 } 3507 3508 /* 3509 * If the source and destination directories are the same, we should 3510 * grab the z_name_lock of that directory only once. 3511 */ 3512 if (sdzp == tdzp) { 3513 zflg |= ZHAVELOCK; 3514 rw_enter(&sdzp->z_name_lock, RW_READER); 3515 } 3516 3517 if (cmp < 0) { 3518 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3519 ZEXISTS | zflg, NULL, NULL); 3520 terr = zfs_dirent_lock(&tdl, 3521 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3522 } else { 3523 terr = zfs_dirent_lock(&tdl, 3524 tdzp, tnm, &tzp, zflg, NULL, NULL); 3525 serr = zfs_dirent_lock(&sdl, 3526 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3527 NULL, NULL); 3528 } 3529 3530 if (serr) { 3531 /* 3532 * Source entry invalid or not there. 3533 */ 3534 if (!terr) { 3535 zfs_dirent_unlock(tdl); 3536 if (tzp) 3537 VN_RELE(ZTOV(tzp)); 3538 } 3539 3540 if (sdzp == tdzp) 3541 rw_exit(&sdzp->z_name_lock); 3542 3543 if (strcmp(snm, "..") == 0) 3544 serr = SET_ERROR(EINVAL); 3545 ZFS_EXIT(zfsvfs); 3546 return (serr); 3547 } 3548 if (terr) { 3549 zfs_dirent_unlock(sdl); 3550 VN_RELE(ZTOV(szp)); 3551 3552 if (sdzp == tdzp) 3553 rw_exit(&sdzp->z_name_lock); 3554 3555 if (strcmp(tnm, "..") == 0) 3556 terr = SET_ERROR(EINVAL); 3557 ZFS_EXIT(zfsvfs); 3558 return (terr); 3559 } 3560 3561 /* 3562 * Must have write access at the source to remove the old entry 3563 * and write access at the target to create the new entry. 3564 * Note that if target and source are the same, this can be 3565 * done in a single check. 
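	 *
	 * Roughly, zfs_zaccess_rename() has to establish (the details
	 * live in zfs_acl.c):
	 *
	 *	may delete snm from sdzp	// write access at source
	 *	may add    tnm to   tdzp	// write access at target
	 *	may delete tnm from tdzp	// only if the target exists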
3566 */ 3567 3568 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 3569 goto out; 3570 3571 if (ZTOV(szp)->v_type == VDIR) { 3572 /* 3573 * Check to make sure rename is valid. 3574 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 3575 */ 3576 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 3577 goto out; 3578 } 3579 3580 /* 3581 * Does target exist? 3582 */ 3583 if (tzp) { 3584 /* 3585 * Source and target must be the same type. 3586 */ 3587 if (ZTOV(szp)->v_type == VDIR) { 3588 if (ZTOV(tzp)->v_type != VDIR) { 3589 error = SET_ERROR(ENOTDIR); 3590 goto out; 3591 } 3592 } else { 3593 if (ZTOV(tzp)->v_type == VDIR) { 3594 error = SET_ERROR(EISDIR); 3595 goto out; 3596 } 3597 } 3598 /* 3599 * POSIX dictates that when the source and target 3600 * entries refer to the same file object, rename 3601 * must do nothing and exit without error. 3602 */ 3603 if (szp->z_id == tzp->z_id) { 3604 error = 0; 3605 goto out; 3606 } 3607 } 3608 3609 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 3610 if (tzp) 3611 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 3612 3613 /* 3614 * notify the target directory if it is not the same 3615 * as source directory. 3616 */ 3617 if (tdvp != sdvp) { 3618 vnevent_rename_dest_dir(tdvp, ct); 3619 } 3620 3621 tx = dmu_tx_create(zfsvfs->z_os); 3622 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3623 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3624 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 3625 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3626 if (sdzp != tdzp) { 3627 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3628 zfs_sa_upgrade_txholds(tx, tdzp); 3629 } 3630 if (tzp) { 3631 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3632 zfs_sa_upgrade_txholds(tx, tzp); 3633 } 3634 3635 zfs_sa_upgrade_txholds(tx, szp); 3636 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3637 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 3638 if (error) { 3639 if (zl != NULL) 3640 zfs_rename_unlock(&zl); 3641 zfs_dirent_unlock(sdl); 3642 zfs_dirent_unlock(tdl); 3643 3644 if (sdzp == tdzp) 3645 rw_exit(&sdzp->z_name_lock); 3646 3647 VN_RELE(ZTOV(szp)); 3648 if (tzp) 3649 VN_RELE(ZTOV(tzp)); 3650 if (error == ERESTART) { 3651 waited = B_TRUE; 3652 dmu_tx_wait(tx); 3653 dmu_tx_abort(tx); 3654 goto top; 3655 } 3656 dmu_tx_abort(tx); 3657 ZFS_EXIT(zfsvfs); 3658 return (error); 3659 } 3660 3661 if (tzp) /* Attempt to remove the existing target */ 3662 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 3663 3664 if (error == 0) { 3665 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3666 if (error == 0) { 3667 szp->z_pflags |= ZFS_AV_MODIFIED; 3668 3669 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3670 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3671 ASSERT0(error); 3672 3673 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3674 if (error == 0) { 3675 zfs_log_rename(zilog, tx, TX_RENAME | 3676 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 3677 sdl->dl_name, tdzp, tdl->dl_name, szp); 3678 3679 /* 3680 * Update path information for the target vnode 3681 */ 3682 vn_renamepath(tdvp, ZTOV(szp), tnm, 3683 strlen(tnm)); 3684 } else { 3685 /* 3686 * At this point, we have successfully created 3687 * the target name, but have failed to remove 3688 * the source name. Since the create was done 3689 * with the ZRENAMING flag, there are 3690 * complications; for one, the link count is 3691 * wrong. The easiest way to deal with this 3692 * is to remove the newly created target, and 3693 * return the original error. 
This must
				 * succeed; fortunately, it is very unlikely to
3695 				 * fail, since we just created it.
3696 				 */
3697 				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3698 				    ZRENAMING, NULL), ==, 0);
3699 			}
3700 		}
3701 	}
3702 
3703 	dmu_tx_commit(tx);
3704 out:
3705 	if (zl != NULL)
3706 		zfs_rename_unlock(&zl);
3707 
3708 	zfs_dirent_unlock(sdl);
3709 	zfs_dirent_unlock(tdl);
3710 
3711 	if (sdzp == tdzp)
3712 		rw_exit(&sdzp->z_name_lock);
3713 
3714 
3715 	VN_RELE(ZTOV(szp));
3716 	if (tzp)
3717 		VN_RELE(ZTOV(tzp));
3718 
3719 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3720 		zil_commit(zilog, 0);
3721 
3722 	ZFS_EXIT(zfsvfs);
3723 	return (error);
3724 }
3725 
3726 /*
3727  * Insert the indicated symbolic reference entry into the directory.
3728  *
3729  * IN:	dvp	- Directory to contain new symbolic link.
3730  *	name	- Name for new symlink entry.
     *	link	- Path that the new symlink is to reference.
3731  *	vap	- Attributes of new entry.
3732  *	cr	- credentials of caller.
3733  *	ct	- caller context
3734  *	flags	- case flags
3735  *
3736  * RETURN:	0 on success, error code on failure.
3737  *
3738  * Timestamps:
3739  *	dvp - ctime|mtime updated
3740  */
3741 /*ARGSUSED*/
3742 static int
3743 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3744     caller_context_t *ct, int flags)
3745 {
3746 	znode_t		*zp, *dzp = VTOZ(dvp);
3747 	zfs_dirlock_t	*dl;
3748 	dmu_tx_t	*tx;
3749 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3750 	zilog_t		*zilog;
3751 	uint64_t	len = strlen(link);
3752 	int		error;
3753 	int		zflg = ZNEW;
3754 	zfs_acl_ids_t	acl_ids;
3755 	boolean_t	fuid_dirtied;
3756 	uint64_t	txtype = TX_SYMLINK;
3757 	boolean_t	waited = B_FALSE;
3758 
3759 	ASSERT(vap->va_type == VLNK);
3760 
3761 	ZFS_ENTER(zfsvfs);
3762 	ZFS_VERIFY_ZP(dzp);
3763 	zilog = zfsvfs->z_log;
3764 
3765 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3766 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3767 		ZFS_EXIT(zfsvfs);
3768 		return (SET_ERROR(EILSEQ));
3769 	}
3770 	if (flags & FIGNORECASE)
3771 		zflg |= ZCILOOK;
3772 
3773 	if (len > MAXPATHLEN) {
3774 		ZFS_EXIT(zfsvfs);
3775 		return (SET_ERROR(ENAMETOOLONG));
3776 	}
3777 
3778 	if ((error = zfs_acl_ids_create(dzp, 0,
3779 	    vap, cr, NULL, &acl_ids)) != 0) {
3780 		ZFS_EXIT(zfsvfs);
3781 		return (error);
3782 	}
3783 top:
3784 	/*
3785 	 * Attempt to lock directory; fail if entry already exists.
3786 	 */
3787 	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3788 	if (error) {
3789 		zfs_acl_ids_free(&acl_ids);
3790 		ZFS_EXIT(zfsvfs);
3791 		return (error);
3792 	}
3793 
3794 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3795 		zfs_acl_ids_free(&acl_ids);
3796 		zfs_dirent_unlock(dl);
3797 		ZFS_EXIT(zfsvfs);
3798 		return (error);
3799 	}
3800 
3801 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3802 		zfs_acl_ids_free(&acl_ids);
3803 		zfs_dirent_unlock(dl);
3804 		ZFS_EXIT(zfsvfs);
3805 		return (SET_ERROR(EDQUOT));
3806 	}
3807 	tx = dmu_tx_create(zfsvfs->z_os);
3808 	fuid_dirtied = zfsvfs->z_fuid_dirty;
3809 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3810 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3811 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3812 	    ZFS_SA_BASE_ATTR_SIZE + len);
3813 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3814 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3815 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3816 		    acl_ids.z_aclp->z_acl_bytes);
3817 	}
3818 	if (fuid_dirtied)
3819 		zfs_fuid_txhold(zfsvfs, tx);
3820 	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3821 	if (error) {
3822 		zfs_dirent_unlock(dl);
3823 		if (error == ERESTART) {
3824 			waited = B_TRUE;
3825 			dmu_tx_wait(tx);
3826 			dmu_tx_abort(tx);
3827 			goto top;
3828 		}
3829 		zfs_acl_ids_free(&acl_ids);
3830 		dmu_tx_abort(tx);
3831 		ZFS_EXIT(zfsvfs);
3832 		return (error);
3833 	}
3834 
3835 	/*
3836 	 * Create a new object for the symlink.
3837 	 * For version 4 ZPL datasets the symlink will be an SA attribute.
3838 	 */
3839 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3840 
3841 	if (fuid_dirtied)
3842 		zfs_fuid_sync(zfsvfs, tx);
3843 
3844 	mutex_enter(&zp->z_lock);
3845 	if (zp->z_is_sa)
3846 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3847 		    link, len, tx);
3848 	else
3849 		zfs_sa_symlink(zp, link, len, tx);
3850 	mutex_exit(&zp->z_lock);
3851 
3852 	zp->z_size = len;
3853 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3854 	    &zp->z_size, sizeof (zp->z_size), tx);
3855 	/*
3856 	 * Insert the new object into the directory.
3857 	 */
3858 	(void) zfs_link_create(dl, zp, tx, ZNEW);
3859 
3860 	if (flags & FIGNORECASE)
3861 		txtype |= TX_CI;
3862 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3863 
3864 	zfs_acl_ids_free(&acl_ids);
3865 
3866 	dmu_tx_commit(tx);
3867 
3868 	zfs_dirent_unlock(dl);
3869 
3870 	VN_RELE(ZTOV(zp));
3871 
3872 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3873 		zil_commit(zilog, 0);
3874 
3875 	ZFS_EXIT(zfsvfs);
3876 	return (error);
3877 }
3878 
3879 /*
3880  * Return, in the buffer contained in the provided uio structure,
3881  * the symbolic path referred to by vp.
3882  *
3883  * IN:	vp	- vnode of symbolic link.
3884  *	uio	- structure to contain the link path.
3885  *	cr	- credentials of caller.
3886  *	ct	- caller context
3887  *
3888  * OUT:	uio	- structure containing the link path.
3889  *
3890  * RETURN:	0 on success, error code on failure.
3891  *
3892  * Timestamps:
3893  *	vp - atime updated
3894  */
3895 /* ARGSUSED */
3896 static int
3897 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3898 {
3899 	znode_t		*zp = VTOZ(vp);
3900 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3901 	int		error;
3902 
3903 	ZFS_ENTER(zfsvfs);
3904 	ZFS_VERIFY_ZP(zp);
3905 
3906 	mutex_enter(&zp->z_lock);
3907 	if (zp->z_is_sa)
3908 		error = sa_lookup_uio(zp->z_sa_hdl,
3909 		    SA_ZPL_SYMLINK(zfsvfs), uio);
3910 	else
3911 		error = zfs_sa_readlink(zp, uio);
3912 	mutex_exit(&zp->z_lock);
3913 
3914 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3915 
3916 	ZFS_EXIT(zfsvfs);
3917 	return (error);
3918 }
3919 
3920 /*
3921  * Insert a new entry into directory tdvp referencing svp.
3922  *
3923  * IN:	tdvp	- Directory to contain new entry.
3924  *	svp	- vnode of new entry.
3925  *	name	- name of new entry.
3926  *	cr	- credentials of caller.
3927  *	ct	- caller context
3928  *
3929  * RETURN:	0 on success, error code on failure.
3930  *
3931  * Timestamps:
3932  *	tdvp - ctime|mtime updated
3933  *	 svp - ctime updated
3934  */
3935 /* ARGSUSED */
3936 static int
3937 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3938     caller_context_t *ct, int flags)
3939 {
3940 	znode_t		*dzp = VTOZ(tdvp);
3941 	znode_t		*tzp, *szp;
3942 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3943 	zilog_t		*zilog;
3944 	zfs_dirlock_t	*dl;
3945 	dmu_tx_t	*tx;
3946 	vnode_t		*realvp;
3947 	int		error;
3948 	int		zf = ZNEW;
3949 	uint64_t	parent;
3950 	uid_t		owner;
3951 	boolean_t	waited = B_FALSE;
3952 
3953 	ASSERT(tdvp->v_type == VDIR);
3954 
3955 	ZFS_ENTER(zfsvfs);
3956 	ZFS_VERIFY_ZP(dzp);
3957 	zilog = zfsvfs->z_log;
3958 
3959 	if (VOP_REALVP(svp, &realvp, ct) == 0)
3960 		svp = realvp;
3961 
3962 	/*
3963 	 * POSIX dictates that we return EPERM here.
3964 * Better choices include ENOTSUP or EISDIR. 3965 */ 3966 if (svp->v_type == VDIR) { 3967 ZFS_EXIT(zfsvfs); 3968 return (SET_ERROR(EPERM)); 3969 } 3970 3971 szp = VTOZ(svp); 3972 ZFS_VERIFY_ZP(szp); 3973 3974 /* 3975 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3976 * ctldir appear to have the same v_vfsp. 3977 */ 3978 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 3979 ZFS_EXIT(zfsvfs); 3980 return (SET_ERROR(EXDEV)); 3981 } 3982 3983 /* Prevent links to .zfs/shares files */ 3984 3985 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3986 &parent, sizeof (uint64_t))) != 0) { 3987 ZFS_EXIT(zfsvfs); 3988 return (error); 3989 } 3990 if (parent == zfsvfs->z_shares_dir) { 3991 ZFS_EXIT(zfsvfs); 3992 return (SET_ERROR(EPERM)); 3993 } 3994 3995 if (zfsvfs->z_utf8 && u8_validate(name, 3996 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3997 ZFS_EXIT(zfsvfs); 3998 return (SET_ERROR(EILSEQ)); 3999 } 4000 if (flags & FIGNORECASE) 4001 zf |= ZCILOOK; 4002 4003 /* 4004 * We do not support links between attributes and non-attributes 4005 * because of the potential security risk of creating links 4006 * into "normal" file space in order to circumvent restrictions 4007 * imposed in attribute space. 4008 */ 4009 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4010 ZFS_EXIT(zfsvfs); 4011 return (SET_ERROR(EINVAL)); 4012 } 4013 4014 4015 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4016 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 4017 ZFS_EXIT(zfsvfs); 4018 return (SET_ERROR(EPERM)); 4019 } 4020 4021 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4022 ZFS_EXIT(zfsvfs); 4023 return (error); 4024 } 4025 4026 top: 4027 /* 4028 * Attempt to lock directory; fail if entry already exists. 4029 */ 4030 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4031 if (error) { 4032 ZFS_EXIT(zfsvfs); 4033 return (error); 4034 } 4035 4036 tx = dmu_tx_create(zfsvfs->z_os); 4037 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4038 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4039 zfs_sa_upgrade_txholds(tx, szp); 4040 zfs_sa_upgrade_txholds(tx, dzp); 4041 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4042 if (error) { 4043 zfs_dirent_unlock(dl); 4044 if (error == ERESTART) { 4045 waited = B_TRUE; 4046 dmu_tx_wait(tx); 4047 dmu_tx_abort(tx); 4048 goto top; 4049 } 4050 dmu_tx_abort(tx); 4051 ZFS_EXIT(zfsvfs); 4052 return (error); 4053 } 4054 4055 error = zfs_link_create(dl, szp, tx, 0); 4056 4057 if (error == 0) { 4058 uint64_t txtype = TX_LINK; 4059 if (flags & FIGNORECASE) 4060 txtype |= TX_CI; 4061 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4062 } 4063 4064 dmu_tx_commit(tx); 4065 4066 zfs_dirent_unlock(dl); 4067 4068 if (error == 0) { 4069 vnevent_link(svp, ct); 4070 } 4071 4072 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4073 zil_commit(zilog, 0); 4074 4075 ZFS_EXIT(zfsvfs); 4076 return (error); 4077 } 4078 4079 /* 4080 * zfs_null_putapage() is used when the file system has been force 4081 * unmounted. It just drops the pages. 4082 */ 4083 /* ARGSUSED */ 4084 static int 4085 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4086 size_t *lenp, int flags, cred_t *cr) 4087 { 4088 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4089 return (0); 4090 } 4091 4092 /* 4093 * Push a page out to disk, klustering if possible. 4094 * 4095 * IN: vp - file to push page to. 4096 * pp - page to push. 4097 * flags - additional flags. 4098 * cr - credentials of caller. 
4099  *
4100  * OUT:	offp	- start of range pushed.
4101  *	lenp	- len of range pushed.
4102  *
4103  * RETURN:	0 on success, error code on failure.
4104  *
4105  * NOTE: callers must have locked the page to be pushed.  On
4106  * exit, the page (and all other pages in the kluster) must be
4107  * unlocked.
4108  */
4109 /* ARGSUSED */
4110 static int
4111 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4112     size_t *lenp, int flags, cred_t *cr)
4113 {
4114 	znode_t		*zp = VTOZ(vp);
4115 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4116 	dmu_tx_t	*tx;
4117 	u_offset_t	off, koff;
4118 	size_t		len, klen;
4119 	int		err;
4120 
4121 	off = pp->p_offset;
4122 	len = PAGESIZE;
4123 	/*
4124 	 * If our blocksize is bigger than the page size, try to kluster
4125 	 * multiple pages so that we write a full block (thus avoiding
4126 	 * a read-modify-write).
4127 	 */
4128 	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4129 		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4130 		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4131 		ASSERT(koff <= zp->z_size);
4132 		if (koff + klen > zp->z_size)
4133 			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4134 		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4135 	}
4136 	ASSERT3U(btop(len), ==, btopr(len));
4137 
4138 	/*
4139 	 * Can't push pages past end-of-file.
4140 	 */
4141 	if (off >= zp->z_size) {
4142 		/* ignore all pages */
4143 		err = 0;
4144 		goto out;
4145 	} else if (off + len > zp->z_size) {
4146 		int npages = btopr(zp->z_size - off);
4147 		page_t *trunc;
4148 
4149 		page_list_break(&pp, &trunc, npages);
4150 		/* ignore pages past end of file */
4151 		if (trunc)
4152 			pvn_write_done(trunc, flags);
4153 		len = zp->z_size - off;
4154 	}
4155 
4156 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4157 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4158 		err = SET_ERROR(EDQUOT);
4159 		goto out;
4160 	}
4161 	tx = dmu_tx_create(zfsvfs->z_os);
4162 	dmu_tx_hold_write(tx, zp->z_id, off, len);
4163 
4164 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4165 	zfs_sa_upgrade_txholds(tx, zp);
4166 	err = dmu_tx_assign(tx, TXG_WAIT);
4167 	if (err != 0) {
4168 		dmu_tx_abort(tx);
4169 		goto out;
4170 	}
4171 
4172 	if (zp->z_blksz <= PAGESIZE) {
4173 		caddr_t va = zfs_map_page(pp, S_READ);
4174 		ASSERT3U(len, <=, PAGESIZE);
4175 		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4176 		zfs_unmap_page(pp, va);
4177 	} else {
4178 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4179 	}
4180 
4181 	if (err == 0) {
4182 		uint64_t mtime[2], ctime[2];
4183 		sa_bulk_attr_t bulk[3];
4184 		int count = 0;
4185 
4186 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4187 		    &mtime, 16);
4188 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4189 		    &ctime, 16);
4190 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4191 		    &zp->z_pflags, 8);
4192 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4193 		    B_TRUE);
4194 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4195 	}
4196 	dmu_tx_commit(tx);
4197 
4198 out:
4199 	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4200 	if (offp)
4201 		*offp = off;
4202 	if (lenp)
4203 		*lenp = len;
4204 
4205 	return (err);
4206 }
4207 
4208 /*
4209  * Copy the portion of the file indicated from pages into the file.
4210  * The pages are stored in a page list attached to the file's vnode.
4211  *
4212  * IN:	vp	- vnode of file to push page data to.
4213  *	off	- position in file to put data.
4214  *	len	- amount of data to write.
4215  *	flags	- flags to control the operation.
4216  *	cr	- credentials of caller.
4217  *	ct	- caller context.
4218  *
4219  * RETURN:	0 on success, error code on failure.
4220  *
4221  * Timestamps:
4222  *	vp - ctime|mtime updated
4223  */
4224 /*ARGSUSED*/
4225 static int
4226 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4227     caller_context_t *ct)
4228 {
4229 	znode_t		*zp = VTOZ(vp);
4230 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4231 	page_t		*pp;
4232 	size_t		io_len;
4233 	u_offset_t	io_off;
4234 	uint_t		blksz;
4235 	rl_t		*rl;
4236 	int		error = 0;
4237 
4238 	ZFS_ENTER(zfsvfs);
4239 	ZFS_VERIFY_ZP(zp);
4240 
4241 	/*
4242 	 * There's nothing to do if no data is cached.
4243 	 */
4244 	if (!vn_has_cached_data(vp)) {
4245 		ZFS_EXIT(zfsvfs);
4246 		return (0);
4247 	}
4248 
4249 	/*
4250 	 * Align this request to the file block size in case we kluster.
4251 	 * XXX - this can result in pretty aggressive locking, which can
4252 	 * impact simultaneous read/write access.  One option might be
4253 	 * to break up long requests (len == 0) into block-by-block
4254 	 * operations to get narrower locking.
4255 	 */
4256 	blksz = zp->z_blksz;
4257 	if (ISP2(blksz))
4258 		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4259 	else
4260 		io_off = 0;
4261 	if (len > 0 && ISP2(blksz))
4262 		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4263 	else
4264 		io_len = 0;
4265 
4266 	if (io_len == 0) {
4267 		/*
4268 		 * Search the entire vp list for pages >= io_off.
4269 		 */
4270 		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4271 		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4272 		goto out;
4273 	}
4274 	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4275 
4276 	if (off > zp->z_size) {
4277 		/* past end of file */
4278 		zfs_range_unlock(rl);
4279 		ZFS_EXIT(zfsvfs);
4280 		return (0);
4281 	}
4282 
4283 	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4284 
4285 	for (off = io_off; io_off < off + len; io_off += io_len) {
4286 		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4287 			pp = page_lookup(vp, io_off,
4288 			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4289 		} else {
4290 			pp = page_lookup_nowait(vp, io_off,
4291 			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4292 		}
4293 
4294 		if (pp != NULL && pvn_getdirty(pp, flags)) {
4295 			int err;
4296 
4297 			/*
4298 			 * Found a dirty page to push
4299 			 */
4300 			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4301 			if (err)
4302 				error = err;
4303 		} else {
4304 			io_len = PAGESIZE;
4305 		}
4306 	}
4307 out:
4308 	zfs_range_unlock(rl);
4309 	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4310 		zil_commit(zfsvfs->z_log, zp->z_id);
4311 	ZFS_EXIT(zfsvfs);
4312 	return (error);
4313 }
4314 
4315 /*ARGSUSED*/
4316 void
4317 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4318 {
4319 	znode_t	*zp = VTOZ(vp);
4320 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4321 	int error;
4322 
4323 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4324 	if (zp->z_sa_hdl == NULL) {
4325 		/*
4326 		 * The fs has been unmounted, or we did a
4327 		 * suspend/resume and this file no longer exists.
4328 		 */
4329 		if (vn_has_cached_data(vp)) {
4330 			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4331 			    B_INVAL, cr);
4332 		}
4333 
4334 		mutex_enter(&zp->z_lock);
4335 		mutex_enter(&vp->v_lock);
4336 		ASSERT(vp->v_count == 1);
4337 		vp->v_count = 0;
4338 		mutex_exit(&vp->v_lock);
4339 		mutex_exit(&zp->z_lock);
4340 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4341 		zfs_znode_free(zp);
4342 		return;
4343 	}
4344 
4345 	/*
4346 	 * Attempt to push any data in the page cache.  If this fails
4347 	 * we will get kicked out later in zfs_zinactive().

/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		vp->v_count = 0;
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.  If this fails
	 * we will get kicked out later in zfs_zinactive().
	 */
	if (vn_has_cached_data(vp)) {
		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
		    cr);
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/*
 * Bounds-check the seek operation.
 *
 * IN:	vp	- vnode seeking within
 *	ooff	- old file offset
 *	noffp	- pointer to new file offset
 *	ct	- caller context
 *
 * RETURN:	0 on success, EINVAL if new offset invalid.
 */
/* ARGSUSED */
static int
zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	if (vp->v_type == VDIR)
		return (0);
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

/*
 * Pre-filter the generic locking function to trap attempts to place
 * a mandatory lock on a memory mapped file.
 */
static int
zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * We are following the UFS semantics with respect to mapcnt
	 * here: If we see that the file is mapped already, then we will
	 * return an error, but we don't worry about races between this
	 * function and zfs_map().
	 */
	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}
	ZFS_EXIT(zfsvfs);
	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}
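
/*
 * For illustration: MANDMODE() follows the SVR4 convention that
 * mandatory locking is enabled when the set-group-ID bit is set while
 * group execute permission is clear, as arranged from userland with
 * (sketch, Solaris-derived systems):
 *
 *	chmod 2644 file		or	chmod +l file
 *
 * so the EAGAIN above fires only for files explicitly marked this way.
 */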

/*
 * If we can't find a page in the cache, we will create a new page
 * and fill it with file data.  For efficiency, we may try to fill
 * multiple pages at once (klustering) to fill up the supplied page
 * list.  Note that the pages to be filled are held with an exclusive
 * lock to prevent access by other threads while they are being filled.
 */
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
{
	znode_t *zp = VTOZ(vp);
	page_t *pp, *cur_pp;
	objset_t *os = zp->z_zfsvfs->z_os;
	u_offset_t io_off, total;
	size_t io_len;
	int err;

	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
		/*
		 * We only have a single page, don't bother klustering
		 */
		io_off = off;
		io_len = PAGESIZE;
		pp = page_create_va(vp, io_off, io_len,
		    PG_EXCL | PG_WAIT, seg, addr);
	} else {
		/*
		 * Try to find enough pages to fill the page list
		 */
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, plsz, 0);
	}
	if (pp == NULL) {
		/*
		 * The page already exists, nothing to do here.
		 */
		*pl = NULL;
		return (0);
	}

	/*
	 * Fill the pages in the kluster.
	 */
	cur_pp = pp;
	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
		caddr_t va;

		ASSERT3U(io_off, ==, cur_pp->p_offset);
		va = zfs_map_page(cur_pp, S_WRITE);
		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
		    DMU_READ_PREFETCH);
		zfs_unmap_page(cur_pp, va);
		if (err) {
			/* On error, toss the entire kluster */
			pvn_read_done(pp, B_ERROR);
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}
		cur_pp = cur_pp->p_next;
	}

	/*
	 * Fill in the page list array from the kluster starting
	 * from the desired offset `off'.
	 * NOTE: the page list will always be null terminated.
	 */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	ASSERT(pl == NULL || (*pl)->p_offset == off);

	return (0);
}
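
/*
 * The per-page fill above follows the usual map/copy/unmap bracket; a
 * minimal sketch of the pattern (assuming segkpm-style temporary
 * kernel mappings behind zfs_map_page(), as on illumos):
 *
 *	caddr_t va = zfs_map_page(pp, S_WRITE);
 *	err = dmu_read(os, object, off, PAGESIZE, va, DMU_READ_PREFETCH);
 *	zfs_unmap_page(pp, va);
 *
 * The mapping is held only for the duration of the copy.
 */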

/*
 * Return pointers to the pages for the file region [off, off + len]
 * in the pl array.  If plsz is greater than len, this function may
 * also return page pointers from after the specified region
 * (i.e. the region [off, off + plsz]).  These additional pages are
 * only returned if they are already in the cache, or were created as
 * part of a klustered read.
 *
 * IN:	vp	- vnode of file to get data from.
 *	off	- position in file to get data from.
 *	len	- amount of data to retrieve.
 *	plsz	- length of provided page list.
 *	seg	- segment to obtain pages for.
 *	addr	- virtual address of fault.
 *	rw	- mode of created pages.
 *	cr	- credentials of caller.
 *	ct	- caller context.
 *
 * OUT:	protp	- protection mode of created pages.
 *	pl	- list of pages created.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	page_t **pl0 = pl;
	int err = 0;

	/* we do our own caching, faultahead is unnecessary */
	if (pl == NULL)
		return (0);
	else if (len > plsz)
		len = plsz;
	else
		len = P2ROUNDUP(len, PAGESIZE);
	ASSERT(plsz >= len);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (protp)
		*protp = PROT_ALL;

	/*
	 * Loop through the requested range [off, off + len) looking
	 * for pages.  If we don't find a page, we will need to create
	 * a new page and fill it with data from the file.
	 */
	while (len > 0) {
		if (*pl = page_lookup(vp, off, SE_SHARED))
			*(pl+1) = NULL;
		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
			goto out;
		while (*pl) {
			ASSERT3U((*pl)->p_offset, ==, off);
			off += PAGESIZE;
			addr += PAGESIZE;
			if (len > 0) {
				ASSERT3U(len, >=, PAGESIZE);
				len -= PAGESIZE;
			}
			ASSERT3U(plsz, >=, PAGESIZE);
			plsz -= PAGESIZE;
			pl++;
		}
	}

	/*
	 * Fill out the page array with any pages already in the cache.
	 */
	while (plsz > 0 &&
	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
		off += PAGESIZE;
		plsz -= PAGESIZE;
	}
out:
	if (err) {
		/*
		 * Release any pages we have previously locked.
		 */
		while (pl > pl0)
			page_unlock(*--pl);
	} else {
		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	}

	*pl = NULL;

	ZFS_EXIT(zfsvfs);
	return (err);
}
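
/*
 * Hypothetical caller sketch (names and sizes illustrative only): the
 * null-terminated list contract above means a consumer can walk pl[]
 * and drop the shared page locks when it is done, e.g.:
 *
 *	page_t *pl[4 + 1];	(room for plsz / PAGESIZE pages + NULL)
 *	page_t **ppp;
 *
 *	if (VOP_GETPAGE(vp, off, PAGESIZE, &prot, pl, 4 * PAGESIZE,
 *	    seg, addr, S_READ, cr, NULL) == 0) {
 *		for (ppp = pl; *ppp != NULL; ppp++)
 *			page_unlock(*ppp);
 *	}
 */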

/*
 * Request a memory map for a section of a file.  This code interacts
 * with common code and the VM system as follows:
 *
 * - common code calls mmap(), which ends up in smmap_common()
 * - this calls VOP_MAP(), which takes you into (say) zfs
 * - zfs_map() calls as_map(), passing segvn_create() as the callback
 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
 * - zfs_addmap() updates z_mapcnt
 */
/*ARGSUSED*/
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	segvn_crargs_t	vn_a;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((prot & PROT_WRITE) && (zp->z_pflags &
	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if ((prot & (PROT_READ | PROT_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	if (vp->v_flag & VNOMAP) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSYS));
	}

	if (off < 0 || len > MAXOFFSET_T - off) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENXIO));
	}

	if (vp->v_type != VREG) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENODEV));
	}

	/*
	 * If file is locked, disallow mapping.
	 */
	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/* ARGSUSED */
static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
	return (0);
}

/*
 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
 * more accurate mtime for the associated file.  Since we don't have a way of
 * detecting when the data was actually modified, we have to resort to
 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
 * last page is pushed.  The problem occurs when the msync() call is omitted,
 * which is by far the most common case:
 *
 *	open()
 *	mmap()
 *	<modify memory>
 *	munmap()
 *	close()
 *	<time lapse>
 *	putpage() via fsflush
 *
 * If we wait until fsflush to come along, we can have a modification time that
 * is some arbitrary point in the future.  In order to prevent this in the
 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
 * torn down.
 */
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);

	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
	    vn_has_cached_data(vp))
		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);

	return (0);
}

/*
 * Free or allocate space in a file.  Currently, this function only
 * supports the `F_FREESP' command.  However, this command is somewhat
 * misnamed, as its functionality includes the ability to allocate as
 * well as free space.
 *
 * IN:	vp	- vnode of file to free data in.
 *	cmd	- action to take (only F_FREESP supported).
 *	bfp	- section of file to free/alloc.
 *	flag	- current file open mode flags.
 *	offset	- current file offset.
 *	cr	- credentials of caller [UNUSED].
 *	ct	- caller context.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/* ARGSUSED */
static int
zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
    offset_t offset, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t off, len;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (cmd != F_FREESP) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * In the case that vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
	 * our callers might not be able to properly detect that we are
	 * read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	if (error = convoff(vp, bfp, 0, offset)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (bfp->l_len < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	off = bfp->l_start;
	len = bfp->l_len; /* 0 means from off to end of file */

	error = zfs_freesp(zp, off, len, flag, TRUE);

	if (error == 0 && off == 0 && len == 0)
		vnevent_truncate(ZTOV(zp), ct);

	ZFS_EXIT(zfsvfs);
	return (error);
}
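
/*
 * Userland sketch (illustrative): this entry point is typically reached
 * through fcntl(2) with F_FREESP; a zero l_len requests "from l_start
 * to end of file", i.e. a truncation:
 *
 *	struct flock fl;
 *
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;
 *	if (fcntl(fd, F_FREESP, &fl) != 0)
 *		perror("fcntl(F_FREESP)");
 */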

/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
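
/*
 * For illustration: the loops above pack each field least-significant
 * byte first, so a consumer could reassemble the object number with
 * the inverse loop (sketch):
 *
 *	uint64_t object = 0;
 *	for (i = sizeof (zfid->zf_object) - 1; i >= 0; i--)
 *		object = (object << 8) | zfid->zf_object[i];
 */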

static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	zfs_dirlock_t	*dl;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = ULONG_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (ulong_t)SPA_MINBLOCKSIZE;
		return (0);

	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);

	default:
		return (fs_pathconf(vp, cmd, valp, cr, ct));
	}
}

/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}

/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t	*zilog = zfsvfs->z_log;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * The smallest read we may consider to loan out an arcbuf.
 * This must be a power of 2.
 */
int zcr_blksz_min = (1 << 10);	/* 1K */
/*
 * If set to less than the file block size, allow loaning out of an
 * arcbuf for a partial block read.  This must be a power of 2.
 */
int zcr_blksz_max = (1 << 17);	/* 128K */
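
/*
 * Both of the above are plain global tunables rather than dataset
 * properties; as a sketch (assuming the illumos zfs kernel module
 * namespace), they could be set at boot time via /etc/system:
 *
 *	set zfs:zcr_blksz_max = 0x20000
 */

/*
 * Worked example for the write-side carving in zfs_reqzcbuf() below
 * (illustrative numbers only): with blksz = 128K, offset = 192K and
 * size = 400K,
 *
 *	preamble  = 128K - P2PHASE(192K, 128K)	= 64K
 *	postamble = P2PHASE(400K - 64K, 128K)	= 80K
 *	fullblk   = (400K - 64K - 80K) / 128K	= 2
 *
 * so dmu_xuio_init() is asked for 1 + 2 + 1 = 4 arc_bufs: a partial
 * head, two full blocks, and a partial tail.
 */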

/*ARGSUSED*/
static int
zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int max_blksz = zfsvfs->z_max_blksz;
	uio_t *uio = &xuio->xu_uio;
	ssize_t size = uio->uio_resid;
	offset_t offset = uio->uio_loffset;
	int blksz;
	int fullblk, i;
	arc_buf_t *abuf;
	ssize_t maxsize;
	int preamble, postamble;

	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	switch (ioflag) {
	case UIO_WRITE:
		/*
		 * Loan out an arc_buf for write if write size is bigger than
		 * max_blksz, and the file's block size is also max_blksz.
		 */
		blksz = max_blksz;
		if (size < blksz || zp->z_blksz != blksz) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
		/*
		 * Caller requests buffers for write before knowing where the
		 * write offset might be (e.g. NFS TCP write).
		 */
		if (offset == -1) {
			preamble = 0;
		} else {
			preamble = P2PHASE(offset, blksz);
			if (preamble) {
				preamble = blksz - preamble;
				size -= preamble;
			}
		}

		postamble = P2PHASE(size, blksz);
		size -= postamble;

		fullblk = size / blksz;
		(void) dmu_xuio_init(xuio,
		    (preamble != 0) + fullblk + (postamble != 0));
		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
		    int, postamble, int,
		    (preamble != 0) + fullblk + (postamble != 0));

		/*
		 * Have to fix iov base/len for partial buffers.  They
		 * currently represent full arc_buf's.
		 */
		if (preamble) {
			/* data begins in the middle of the arc_buf */
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf,
			    blksz - preamble, preamble);
		}

		for (i = 0; i < fullblk; i++) {
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
		}

		if (postamble) {
			/* data ends in the middle of the arc_buf */
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
		}
		break;
	case UIO_READ:
		/*
		 * Loan out an arc_buf for read if the read size is larger than
		 * the current file block size.  Block alignment is not
		 * considered.  Partial arc_buf will be loaned out for read.
		 */
		blksz = zp->z_blksz;
		if (blksz < zcr_blksz_min)
			blksz = zcr_blksz_min;
		if (blksz > zcr_blksz_max)
			blksz = zcr_blksz_max;
		/* avoid potential complexity of dealing with it */
		if (blksz > max_blksz) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		maxsize = zp->z_size - uio->uio_loffset;
		if (size > maxsize)
			size = maxsize;

		if (size < blksz || vn_has_cached_data(vp)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
		break;
	default:
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	uio->uio_extflg = UIO_XUIO;
	XUIO_XUZC_RW(xuio) = ioflag;
	ZFS_EXIT(zfsvfs);
	return (0);
}

/*ARGSUSED*/
static int
zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
{
	int i;
	arc_buf_t *abuf;
	int ioflag = XUIO_XUZC_RW(xuio);

	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);

	i = dmu_xuio_cnt(xuio);
	while (i-- > 0) {
		abuf = dmu_xuio_arcbuf(xuio, i);
		/*
		 * if abuf == NULL, it must be a write buffer
		 * that has been returned in zfs_write().
		 */
		if (abuf)
			dmu_return_arcbuf(abuf);
		ASSERT(abuf || ioflag == UIO_WRITE);
	}

	dmu_xuio_fini(xuio);
	return (0);
}

/*
 * Predeclare these here so that the compiler assumes that
 * this is an "old style" function declaration that does
 * not include arguments => we won't get type mismatch errors
 * in the initializations that follow.
 */
static int zfs_inval();
static int zfs_isdir();

static int
zfs_inval()
{
	return (SET_ERROR(EINVAL));
}

static int
zfs_isdir()
{
	return (SET_ERROR(EISDIR));
}
/*
 * Directory vnode operations template
 */
vnodeops_t *zfs_dvnodeops;
const fs_operation_def_t zfs_dvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .error = zfs_isdir },
	VOPNAME_WRITE,		{ .error = zfs_isdir },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
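
/*
 * Sketch of how a template like the one above becomes a live
 * vnodeops_t (on illumos this happens in zfs_create_op_tables() in
 * zfs_znode.c; shown here for illustration only):
 *
 *	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
 *	    &zfs_dvnodeops);
 */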

/*
 * Regular file vnode operations template
 */
vnodeops_t *zfs_fvnodeops;
const fs_operation_def_t zfs_fvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .vop_read = zfs_read },
	VOPNAME_WRITE,		{ .vop_write = zfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = zfs_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
	VOPNAME_MAP,		{ .vop_map = zfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
	NULL,			NULL
};

/*
 * Symbolic link vnode operations template
 */
vnodeops_t *zfs_symvnodeops;
const fs_operation_def_t zfs_symvnodeops_template[] = {
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Special share hidden files vnode operations template
 */
vnodeops_t *zfs_sharevnodeops;
const fs_operation_def_t zfs_sharevnodeops_template[] = {
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Extended attribute directory vnode operations template
 *
 * This template is identical to the directory vnode operations
 * template except for the restricted operations:
 *	VOP_MKDIR()
 *	VOP_SYMLINK()
 *
 * Note that there are other restrictions embedded in:
 *	zfs_create() - restrict type to VREG
 *	zfs_link()   - no links into/out of attribute space
 *	zfs_rename() - no moves into/out of attribute space
 */
vnodeops_t *zfs_xdvnodeops;
const fs_operation_def_t zfs_xdvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .error = zfs_inval },
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .error = zfs_inval },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * Error vnode operations template
 */
vnodeops_t *zfs_evnodeops;
const fs_operation_def_t zfs_evnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	NULL,			NULL
};