/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/atomic.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1) A check must be made in each zfs thread for a mounted file system.
 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
 *
 *  (2) VN_RELE() should always be the last thing except for zil_commit()
 *	and ZFS_EXIT().  This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *
 *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
 *	This is critical because we don't want to block while holding locks.
 *	Note, in particular, that if a lock is sometimes acquired before
 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
 *	use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call txg_wait_open(), and try again.
 *
 *  (5) If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *
 *  (6) At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7) After dropping all locks, invoke zil_commit(zilog, seq, ioflag)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
 *	if (error) {
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 *			txg_wait_open(dmu_objset_pool(os), 0);
 *			goto top;
 *		}
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		seq = zfs_log_*(...);	// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, seq, ioflag);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr)
{
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	return (0);
}
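/*
 * Illustration only (not part of the original source): a userland sketch
 * of how the hole/data ioctls handled below might be driven.  The fd and
 * the error handling are assumptions made for the example.
 *
 *	offset_t off = 0;			// in/out parameter
 *	if (ioctl(fd, _FIO_SEEK_HOLE, &off) == 0)
 *		// off now holds the start of the next hole (possibly the
 *		// virtual hole at EOF); ENXIO means off was at or past EOF.
 *		(void) printf("hole at %lld\n", (longlong_t)off);
 */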
/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA).  "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_phys->zp_size;
	if (noff >= file_sz) {
		return (ENXIO);
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* end of file? */
	if ((error == ESRCH) || (noff > file_sz)) {
		/*
		 * Handle the virtual hole at the end of file.
		 */
		if (hole) {
			*off = file_sz;
			return (0);
		}
		return (ENXIO);
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
	int *rvalp)
{
	offset_t off;
	int error;
	zfsvfs_t *zfsvfs;

	switch (com) {
	case _FIOFFS:
		return (zfs_sync(vp->v_vfsp, 0, cred));

	/*
	 * The following two ioctls are used by bfu.  Faking them out
	 * is necessary to avoid bfu errors.
	 */
	case _FIOGDIO:
	case _FIOSDIO:
		return (0);

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (EFAULT);

		zfsvfs = VTOZ(vp)->z_zfsvfs;
		ZFS_ENTER(zfsvfs);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (EFAULT);
		return (0);
	}
	return (ENOTTY);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		/*
		 * We don't want a new page to "appear" in the middle of
		 * the file update (because it may not get the write
		 * update data), so we grab a lock to block
		 * zfs_getpage().
		 */
		rw_enter(&zp->z_map_lock, RW_WRITER);
		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			rw_exit(&zp->z_map_lock);
			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
			error = uiomove(va+off, bytes, UIO_WRITE, uio);
			if (error == 0) {
				dmu_write(zfsvfs->z_os, zp->z_id,
				    woff, bytes, va+off, tx);
			}
			ppmapout(va);
			page_unlock(pp);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    woff, bytes, uio, tx);
			rw_exit(&zp->z_map_lock);
		}
		len -= bytes;
		woff += bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}
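/*
 * A worked example of the chunking arithmetic used by mappedwrite() and
 * mappedread() (a sketch, assuming PAGESIZE == 4096): for
 * uio_loffset == 6000 and nbytes == 6000, off = 6000 & PAGEOFFSET = 1904
 * and the loop starts at start = 6000 & PAGEMASK = 4096.  The first
 * uiomove covers MIN(4096 - 1904, 6000) = 2192 bytes; the second covers
 * MIN(4096, 3808) = 3808 bytes.  Every chunk after the first begins on
 * a page boundary, since off is reset to 0 at the bottom of the loop.
 */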
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio)
{
	int64_t	start, off, bytes;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;

		bytes = MIN(PAGESIZE - off, len);
		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			ppmapout(va);
			page_unlock(pp);
		} else {
			/* XXX use dmu_read here? */
			error = uiomove(addr, bytes, UIO_READ, uio);
		}
		len -= bytes;
		addr += bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
455 */ 456 delta = uio->uio_loffset - dbpp[0]->db_offset; 457 458 for (i = 0; i < numbufs; i++) { 459 if (n < 0) 460 break; 461 dbp = dbpp[i]; 462 size = dbp->db_size - delta; 463 /* 464 * XXX -- this is correct, but may be suboptimal. 465 * If the pages are all clean, we don't need to 466 * go through mappedread(). Maybe the VMODSORT 467 * stuff can help us here. 468 */ 469 if (vn_has_cached_data(vp)) { 470 error = mappedread(vp, (caddr_t)dbp->db_data + 471 delta, (n < size ? n : size), uio); 472 } else { 473 error = uiomove((caddr_t)dbp->db_data + delta, 474 (n < size ? n : size), UIO_READ, uio); 475 } 476 if (error) { 477 dmu_buf_rele_array(dbpp, numbufs, FTAG); 478 goto out; 479 } 480 n -= dbp->db_size; 481 if (delta) { 482 n += delta; 483 delta = 0; 484 } 485 } 486 dmu_buf_rele_array(dbpp, numbufs, FTAG); 487 } 488 out: 489 zfs_range_unlock(zp, rl); 490 491 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 492 ZFS_EXIT(zfsvfs); 493 return (error); 494 } 495 496 /* 497 * Fault in the pages of the first n bytes specified by the uio structure. 498 * 1 byte in each page is touched and the uio struct is unmodified. 499 * Any error will exit this routine as this is only a best 500 * attempt to get the pages resident. This is a copy of ufs_trans_touch(). 501 */ 502 static void 503 zfs_prefault_write(ssize_t n, struct uio *uio) 504 { 505 struct iovec *iov; 506 ulong_t cnt, incr; 507 caddr_t p; 508 uint8_t tmp; 509 510 iov = uio->uio_iov; 511 512 while (n) { 513 cnt = MIN(iov->iov_len, n); 514 if (cnt == 0) { 515 /* empty iov entry */ 516 iov++; 517 continue; 518 } 519 n -= cnt; 520 /* 521 * touch each page in this segment. 522 */ 523 p = iov->iov_base; 524 while (cnt) { 525 switch (uio->uio_segflg) { 526 case UIO_USERSPACE: 527 case UIO_USERISPACE: 528 if (fuword8(p, &tmp)) 529 return; 530 break; 531 case UIO_SYSSPACE: 532 if (kcopy(p, &tmp, 1)) 533 return; 534 break; 535 } 536 incr = MIN(cnt, PAGESIZE); 537 p += incr; 538 cnt -= incr; 539 } 540 /* 541 * touch the last byte in case it straddles a page. 542 */ 543 p--; 544 switch (uio->uio_segflg) { 545 case UIO_USERSPACE: 546 case UIO_USERISPACE: 547 if (fuword8(p, &tmp)) 548 return; 549 break; 550 case UIO_SYSSPACE: 551 if (kcopy(p, &tmp, 1)) 552 return; 553 break; 554 } 555 iov++; 556 } 557 } 558 559 /* 560 * Write the bytes to a file. 561 * 562 * IN: vp - vnode of file to be written to. 563 * uio - structure supplying write location, range info, 564 * and data buffer. 565 * ioflag - FAPPEND flag set if in append mode. 566 * cr - credentials of caller. 567 * 568 * OUT: uio - updated offset and range. 569 * 570 * RETURN: 0 if success 571 * error code if failure 572 * 573 * Timestamps: 574 * vp - ctime|mtime updated if byte count > 0 575 */ 576 /* ARGSUSED */ 577 static int 578 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 579 { 580 znode_t *zp = VTOZ(vp); 581 rlim64_t limit = uio->uio_llimit; 582 ssize_t start_resid = uio->uio_resid; 583 ssize_t tx_bytes; 584 uint64_t end_size; 585 dmu_tx_t *tx; 586 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 587 zilog_t *zilog = zfsvfs->z_log; 588 uint64_t seq = 0; 589 offset_t woff; 590 ssize_t n, nbytes; 591 rl_t *rl; 592 int max_blksz = zfsvfs->z_max_blksz; 593 int error; 594 595 /* 596 * Fasttrack empty write 597 */ 598 n = start_resid; 599 if (n == 0) 600 return (0); 601 602 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 603 limit = MAXOFFSET_T; 604 605 ZFS_ENTER(zfsvfs); 606 607 /* 608 * Pre-fault the initial pages to ensure slow (eg NFS) pages 609 * don't hold up txg. 
610 */ 611 zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio); 612 613 /* 614 * If in append mode, set the io offset pointer to eof. 615 */ 616 if (ioflag & FAPPEND) { 617 /* 618 * Range lock for a file append: 619 * The value for the start of range will be determined by 620 * zfs_range_lock() (to guarantee append semantics). 621 * If this write will cause the block size to increase, 622 * zfs_range_lock() will lock the entire file, so we must 623 * later reduce the range after we grow the block size. 624 */ 625 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 626 if (rl->r_len == UINT64_MAX) { 627 /* overlocked, zp_size can't change */ 628 woff = uio->uio_loffset = zp->z_phys->zp_size; 629 } else { 630 woff = uio->uio_loffset = rl->r_off; 631 } 632 } else { 633 woff = uio->uio_loffset; 634 /* 635 * Validate file offset 636 */ 637 if (woff < 0) { 638 ZFS_EXIT(zfsvfs); 639 return (EINVAL); 640 } 641 642 /* 643 * If we need to grow the block size then zfs_range_lock() 644 * will lock a wider range than we request here. 645 * Later after growing the block size we reduce the range. 646 */ 647 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 648 } 649 650 if (woff >= limit) { 651 error = EFBIG; 652 goto no_tx_done; 653 } 654 655 if ((woff + n) > limit || woff > (limit - n)) 656 n = limit - woff; 657 658 /* 659 * Check for mandatory locks 660 */ 661 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && 662 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) 663 goto no_tx_done; 664 end_size = MAX(zp->z_phys->zp_size, woff + n); 665 top: 666 tx = dmu_tx_create(zfsvfs->z_os); 667 dmu_tx_hold_bonus(tx, zp->z_id); 668 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 669 error = dmu_tx_assign(tx, zfsvfs->z_assign); 670 if (error) { 671 dmu_tx_abort(tx); 672 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 673 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 674 goto top; 675 } 676 goto no_tx_done; 677 } 678 679 /* 680 * If zfs_range_lock() over-locked we grow the blocksize 681 * and then reduce the lock range. 682 */ 683 if (rl->r_len == UINT64_MAX) { 684 uint64_t new_blksz; 685 686 if (zp->z_blksz > max_blksz) { 687 ASSERT(!ISP2(zp->z_blksz)); 688 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); 689 } else { 690 new_blksz = MIN(end_size, max_blksz); 691 } 692 zfs_grow_blocksize(zp, new_blksz, tx); 693 zfs_range_reduce(zp, rl, woff, n); 694 } 695 696 /* 697 * The file data does not fit in the znode "cache", so we 698 * will be writing to the file block data buffers. 699 * Each buffer will be written in a separate transaction; 700 * this keeps the intent log records small and allows us 701 * to do more fine-grained space accounting. 702 */ 703 while (n > 0) { 704 /* 705 * XXX - should we really limit each write to z_max_blksz? 706 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 707 */ 708 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 709 rw_enter(&zp->z_map_lock, RW_READER); 710 711 tx_bytes = uio->uio_resid; 712 if (vn_has_cached_data(vp)) { 713 rw_exit(&zp->z_map_lock); 714 error = mappedwrite(vp, woff, nbytes, uio, tx); 715 } else { 716 error = dmu_write_uio(zfsvfs->z_os, zp->z_id, 717 woff, nbytes, uio, tx); 718 rw_exit(&zp->z_map_lock); 719 } 720 tx_bytes -= uio->uio_resid; 721 722 if (error) { 723 /* XXX - do we need to "clean up" the dmu buffer? */ 724 break; 725 } 726 727 ASSERT(tx_bytes == nbytes); 728 729 /* 730 * Clear Set-UID/Set-GID bits on successful write if not 731 * privileged and at least one of the excute bits is set. 
732 * 733 * It would be nice to to this after all writes have 734 * been done, but that would still expose the ISUID/ISGID 735 * to another app after the partial write is committed. 736 */ 737 738 mutex_enter(&zp->z_acl_lock); 739 if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | 740 (S_IXUSR >> 6))) != 0 && 741 (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && 742 secpolicy_vnode_setid_retain(cr, 743 (zp->z_phys->zp_mode & S_ISUID) != 0 && 744 zp->z_phys->zp_uid == 0) != 0) { 745 zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); 746 } 747 mutex_exit(&zp->z_acl_lock); 748 749 n -= nbytes; 750 if (n <= 0) 751 break; 752 753 /* 754 * We have more work ahead of us, so wrap up this transaction 755 * and start another. Exact same logic as tx_done below. 756 */ 757 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { 758 dmu_buf_will_dirty(zp->z_dbuf, tx); 759 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 760 uio->uio_loffset); 761 } 762 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 763 seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, 764 ioflag, uio); 765 dmu_tx_commit(tx); 766 767 /* Pre-fault the next set of pages */ 768 zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio); 769 770 /* 771 * Start another transaction. 772 */ 773 woff = uio->uio_loffset; 774 tx = dmu_tx_create(zfsvfs->z_os); 775 dmu_tx_hold_bonus(tx, zp->z_id); 776 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 777 error = dmu_tx_assign(tx, zfsvfs->z_assign); 778 if (error) { 779 dmu_tx_abort(tx); 780 if (error == ERESTART && 781 zfsvfs->z_assign == TXG_NOWAIT) { 782 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 783 goto top; 784 } 785 goto no_tx_done; 786 } 787 } 788 789 tx_done: 790 791 if (tx_bytes != 0) { 792 /* 793 * Update the file size if it has changed; account 794 * for possible concurrent updates. 795 */ 796 while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) { 797 dmu_buf_will_dirty(zp->z_dbuf, tx); 798 (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, 799 uio->uio_loffset); 800 } 801 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 802 seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, 803 ioflag, uio); 804 } 805 dmu_tx_commit(tx); 806 807 808 no_tx_done: 809 810 zfs_range_unlock(zp, rl); 811 812 /* 813 * If we're in replay mode, or we made no progress, return error. 814 * Otherwise, it's at least a partial write, so it's successful. 815 */ 816 if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { 817 ZFS_EXIT(zfsvfs); 818 return (error); 819 } 820 821 zil_commit(zilog, seq, ioflag & (FSYNC | FDSYNC)); 822 823 ZFS_EXIT(zfsvfs); 824 return (0); 825 } 826 827 /* 828 * Get data to generate a TX_WRITE intent log record. 829 */ 830 int 831 zfs_get_data(void *arg, lr_write_t *lr, char *buf) 832 { 833 zfsvfs_t *zfsvfs = arg; 834 objset_t *os = zfsvfs->z_os; 835 znode_t *zp; 836 uint64_t off = lr->lr_offset; 837 rl_t *rl; 838 int dlen = lr->lr_length; /* length of user data */ 839 int error = 0; 840 841 ASSERT(dlen != 0); 842 843 /* 844 * Nothing to do if the file has been removed 845 */ 846 if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) 847 return (ENOENT); 848 if (zp->z_reap) { 849 VN_RELE(ZTOV(zp)); 850 return (ENOENT); 851 } 852 853 /* 854 * Write records come in two flavors: immediate and indirect. 855 * For small writes it's cheaper to store the data with the 856 * log record (immediate); for large writes it's cheaper to 857 * sync the data and get a pointer to it (indirect) so that 858 * we don't have to write the data twice. 
859 */ 860 if (buf != NULL) { /* immediate write */ 861 dmu_buf_t *db; 862 863 rl = zfs_range_lock(zp, off, dlen, RL_READER); 864 /* test for truncation needs to be done while range locked */ 865 if (off >= zp->z_phys->zp_size) { 866 error = ENOENT; 867 goto out; 868 } 869 VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, off, FTAG, &db)); 870 bcopy((char *)db->db_data + off - db->db_offset, buf, dlen); 871 dmu_buf_rele(db, FTAG); 872 } else { /* indirect write */ 873 uint64_t boff; /* block starting offset */ 874 875 /* 876 * Have to lock the whole block to ensure when it's 877 * written out and it's checksum is being calculated 878 * that no one can change the data. We need to re-check 879 * blocksize after we get the lock in case it's changed! 880 */ 881 for (;;) { 882 boff = off & ~(zp->z_blksz - 1); 883 dlen = zp->z_blksz; 884 rl = zfs_range_lock(zp, boff, dlen, RL_READER); 885 if (zp->z_blksz == dlen) 886 break; 887 zfs_range_unlock(zp, rl); 888 } 889 /* test for truncation needs to be done while range locked */ 890 if (off >= zp->z_phys->zp_size) { 891 error = ENOENT; 892 goto out; 893 } 894 txg_suspend(dmu_objset_pool(os)); 895 error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff, 896 &lr->lr_blkptr, lr->lr_common.lrc_txg); 897 txg_resume(dmu_objset_pool(os)); 898 } 899 out: 900 zfs_range_unlock(zp, rl); 901 VN_RELE(ZTOV(zp)); 902 return (error); 903 } 904 905 /*ARGSUSED*/ 906 static int 907 zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr) 908 { 909 znode_t *zp = VTOZ(vp); 910 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 911 int error; 912 913 ZFS_ENTER(zfsvfs); 914 error = zfs_zaccess_rwx(zp, mode, cr); 915 ZFS_EXIT(zfsvfs); 916 return (error); 917 } 918 919 /* 920 * Lookup an entry in a directory, or an extended attribute directory. 921 * If it exists, return a held vnode reference for it. 922 * 923 * IN: dvp - vnode of directory to search. 924 * nm - name of entry to lookup. 925 * pnp - full pathname to lookup [UNUSED]. 926 * flags - LOOKUP_XATTR set if looking for an attribute. 927 * rdir - root directory vnode [UNUSED]. 928 * cr - credentials of caller. 929 * 930 * OUT: vpp - vnode of located entry, NULL if not found. 931 * 932 * RETURN: 0 if success 933 * error code if failure 934 * 935 * Timestamps: 936 * NA 937 */ 938 /* ARGSUSED */ 939 static int 940 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 941 int flags, vnode_t *rdir, cred_t *cr) 942 { 943 944 znode_t *zdp = VTOZ(dvp); 945 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 946 int error; 947 948 ZFS_ENTER(zfsvfs); 949 950 *vpp = NULL; 951 952 if (flags & LOOKUP_XATTR) { 953 /* 954 * We don't allow recursive attributes.. 955 * Maybe someday we will. 956 */ 957 if (zdp->z_phys->zp_flags & ZFS_XATTR) { 958 ZFS_EXIT(zfsvfs); 959 return (EINVAL); 960 } 961 962 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr)) { 963 ZFS_EXIT(zfsvfs); 964 return (error); 965 } 966 967 /* 968 * Do we have permission to get into attribute directory? 969 */ 970 971 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) { 972 VN_RELE(*vpp); 973 } 974 975 ZFS_EXIT(zfsvfs); 976 return (error); 977 } 978 979 if (dvp->v_type != VDIR) { 980 ZFS_EXIT(zfsvfs); 981 return (ENOTDIR); 982 } 983 984 /* 985 * Check accessibility of directory. 
986 */ 987 988 if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) { 989 ZFS_EXIT(zfsvfs); 990 return (error); 991 } 992 993 if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) { 994 995 /* 996 * Convert device special files 997 */ 998 if (IS_DEVVP(*vpp)) { 999 vnode_t *svp; 1000 1001 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1002 VN_RELE(*vpp); 1003 if (svp == NULL) 1004 error = ENOSYS; 1005 else 1006 *vpp = svp; 1007 } 1008 } 1009 1010 ZFS_EXIT(zfsvfs); 1011 return (error); 1012 } 1013 1014 /* 1015 * Attempt to create a new entry in a directory. If the entry 1016 * already exists, truncate the file if permissible, else return 1017 * an error. Return the vp of the created or trunc'd file. 1018 * 1019 * IN: dvp - vnode of directory to put new file entry in. 1020 * name - name of new file entry. 1021 * vap - attributes of new file. 1022 * excl - flag indicating exclusive or non-exclusive mode. 1023 * mode - mode to open file with. 1024 * cr - credentials of caller. 1025 * flag - large file flag [UNUSED]. 1026 * 1027 * OUT: vpp - vnode of created or trunc'd entry. 1028 * 1029 * RETURN: 0 if success 1030 * error code if failure 1031 * 1032 * Timestamps: 1033 * dvp - ctime|mtime updated if new entry created 1034 * vp - ctime|mtime always, atime if new 1035 */ 1036 /* ARGSUSED */ 1037 static int 1038 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, 1039 int mode, vnode_t **vpp, cred_t *cr, int flag) 1040 { 1041 znode_t *zp, *dzp = VTOZ(dvp); 1042 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1043 zilog_t *zilog = zfsvfs->z_log; 1044 uint64_t seq = 0; 1045 objset_t *os = zfsvfs->z_os; 1046 zfs_dirlock_t *dl; 1047 dmu_tx_t *tx; 1048 int error; 1049 uint64_t zoid; 1050 1051 ZFS_ENTER(zfsvfs); 1052 1053 top: 1054 *vpp = NULL; 1055 1056 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr)) 1057 vap->va_mode &= ~VSVTX; 1058 1059 if (*name == '\0') { 1060 /* 1061 * Null component name refers to the directory itself. 1062 */ 1063 VN_HOLD(dvp); 1064 zp = dzp; 1065 dl = NULL; 1066 error = 0; 1067 } else { 1068 /* possible VN_HOLD(zp) */ 1069 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) { 1070 if (strcmp(name, "..") == 0) 1071 error = EISDIR; 1072 ZFS_EXIT(zfsvfs); 1073 return (error); 1074 } 1075 } 1076 1077 zoid = zp ? zp->z_id : -1ULL; 1078 1079 if (zp == NULL) { 1080 /* 1081 * Create a new file object and update the directory 1082 * to reference it. 1083 */ 1084 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 1085 goto out; 1086 } 1087 1088 /* 1089 * We only support the creation of regular files in 1090 * extended attribute directories. 
1091 */ 1092 if ((dzp->z_phys->zp_flags & ZFS_XATTR) && 1093 (vap->va_type != VREG)) { 1094 error = EINVAL; 1095 goto out; 1096 } 1097 1098 tx = dmu_tx_create(os); 1099 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1100 dmu_tx_hold_bonus(tx, dzp->z_id); 1101 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1102 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 1103 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1104 0, SPA_MAXBLOCKSIZE); 1105 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1106 if (error) { 1107 dmu_tx_abort(tx); 1108 zfs_dirent_unlock(dl); 1109 if (error == ERESTART && 1110 zfsvfs->z_assign == TXG_NOWAIT) { 1111 txg_wait_open(dmu_objset_pool(os), 0); 1112 goto top; 1113 } 1114 ZFS_EXIT(zfsvfs); 1115 return (error); 1116 } 1117 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 1118 ASSERT(zp->z_id == zoid); 1119 (void) zfs_link_create(dl, zp, tx, ZNEW); 1120 seq = zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name); 1121 dmu_tx_commit(tx); 1122 } else { 1123 /* 1124 * A directory entry already exists for this name. 1125 */ 1126 /* 1127 * Can't truncate an existing file if in exclusive mode. 1128 */ 1129 if (excl == EXCL) { 1130 error = EEXIST; 1131 goto out; 1132 } 1133 /* 1134 * Can't open a directory for writing. 1135 */ 1136 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1137 error = EISDIR; 1138 goto out; 1139 } 1140 /* 1141 * Verify requested access to file. 1142 */ 1143 if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) { 1144 goto out; 1145 } 1146 1147 mutex_enter(&dzp->z_lock); 1148 dzp->z_seq++; 1149 mutex_exit(&dzp->z_lock); 1150 1151 /* 1152 * Truncate regular files if requested. 1153 */ 1154 if ((ZTOV(zp)->v_type == VREG) && 1155 (zp->z_phys->zp_size != 0) && 1156 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1157 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1158 if (error == ERESTART && 1159 zfsvfs->z_assign == TXG_NOWAIT) { 1160 zfs_dirent_unlock(dl); 1161 txg_wait_open(dmu_objset_pool(os), 0); 1162 goto top; 1163 } 1164 } 1165 } 1166 out: 1167 1168 if (dl) 1169 zfs_dirent_unlock(dl); 1170 1171 if (error) { 1172 if (zp) 1173 VN_RELE(ZTOV(zp)); 1174 } else { 1175 *vpp = ZTOV(zp); 1176 /* 1177 * If vnode is for a device return a specfs vnode instead. 1178 */ 1179 if (IS_DEVVP(*vpp)) { 1180 struct vnode *svp; 1181 1182 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1183 VN_RELE(*vpp); 1184 if (svp == NULL) { 1185 error = ENOSYS; 1186 } 1187 *vpp = svp; 1188 } 1189 } 1190 1191 zil_commit(zilog, seq, 0); 1192 1193 ZFS_EXIT(zfsvfs); 1194 return (error); 1195 } 1196 1197 /* 1198 * Remove an entry from a directory. 1199 * 1200 * IN: dvp - vnode of directory to remove entry from. 1201 * name - name of entry to remove. 1202 * cr - credentials of caller. 1203 * 1204 * RETURN: 0 if success 1205 * error code if failure 1206 * 1207 * Timestamps: 1208 * dvp - ctime|mtime 1209 * vp - ctime (if nlink > 0) 1210 */ 1211 static int 1212 zfs_remove(vnode_t *dvp, char *name, cred_t *cr) 1213 { 1214 znode_t *zp, *dzp = VTOZ(dvp); 1215 znode_t *xzp = NULL; 1216 vnode_t *vp; 1217 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1218 zilog_t *zilog = zfsvfs->z_log; 1219 uint64_t seq = 0; 1220 uint64_t acl_obj, xattr_obj; 1221 zfs_dirlock_t *dl; 1222 dmu_tx_t *tx; 1223 int may_delete_now, delete_now = FALSE; 1224 int reaped; 1225 int error; 1226 1227 ZFS_ENTER(zfsvfs); 1228 1229 top: 1230 /* 1231 * Attempt to lock directory; fail if entry doesn't exist. 
1232 */ 1233 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { 1234 ZFS_EXIT(zfsvfs); 1235 return (error); 1236 } 1237 1238 vp = ZTOV(zp); 1239 1240 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1241 goto out; 1242 } 1243 1244 /* 1245 * Need to use rmdir for removing directories. 1246 */ 1247 if (vp->v_type == VDIR) { 1248 error = EPERM; 1249 goto out; 1250 } 1251 1252 vnevent_remove(vp); 1253 1254 dnlc_remove(dvp, name); 1255 1256 mutex_enter(&vp->v_lock); 1257 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); 1258 mutex_exit(&vp->v_lock); 1259 1260 /* 1261 * We may delete the znode now, or we may put it on the delete queue; 1262 * it depends on whether we're the last link, and on whether there are 1263 * other holds on the vnode. So we dmu_tx_hold() the right things to 1264 * allow for either case. 1265 */ 1266 tx = dmu_tx_create(zfsvfs->z_os); 1267 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1268 dmu_tx_hold_bonus(tx, zp->z_id); 1269 if (may_delete_now) 1270 dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); 1271 1272 /* are there any extended attributes? */ 1273 if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { 1274 /* 1275 * XXX - There is a possibility that the delete 1276 * of the parent file could succeed, but then we get 1277 * an ENOSPC when we try to delete the xattrs... 1278 * so we would need to re-try the deletes periodically 1279 */ 1280 /* XXX - do we need this if we are deleting? */ 1281 dmu_tx_hold_bonus(tx, xattr_obj); 1282 } 1283 1284 /* are there any additional acls */ 1285 if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && 1286 may_delete_now) 1287 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1288 1289 /* charge as an update -- would be nice not to charge at all */ 1290 dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); 1291 1292 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1293 if (error) { 1294 dmu_tx_abort(tx); 1295 zfs_dirent_unlock(dl); 1296 VN_RELE(vp); 1297 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1298 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 1299 goto top; 1300 } 1301 ZFS_EXIT(zfsvfs); 1302 return (error); 1303 } 1304 1305 /* 1306 * Remove the directory entry. 
1307 */ 1308 error = zfs_link_destroy(dl, zp, tx, 0, &reaped); 1309 1310 if (error) { 1311 dmu_tx_commit(tx); 1312 goto out; 1313 } 1314 1315 if (reaped) { 1316 mutex_enter(&vp->v_lock); 1317 delete_now = may_delete_now && 1318 vp->v_count == 1 && !vn_has_cached_data(vp) && 1319 zp->z_phys->zp_xattr == xattr_obj && 1320 zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; 1321 mutex_exit(&vp->v_lock); 1322 } 1323 1324 if (delete_now) { 1325 if (zp->z_phys->zp_xattr) { 1326 error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); 1327 ASSERT3U(error, ==, 0); 1328 ASSERT3U(xzp->z_phys->zp_links, ==, 2); 1329 dmu_buf_will_dirty(xzp->z_dbuf, tx); 1330 mutex_enter(&xzp->z_lock); 1331 xzp->z_reap = 1; 1332 xzp->z_phys->zp_links = 0; 1333 mutex_exit(&xzp->z_lock); 1334 zfs_dq_add(xzp, tx); 1335 zp->z_phys->zp_xattr = 0; /* probably unnecessary */ 1336 } 1337 mutex_enter(&zp->z_lock); 1338 mutex_enter(&vp->v_lock); 1339 vp->v_count--; 1340 ASSERT3U(vp->v_count, ==, 0); 1341 mutex_exit(&vp->v_lock); 1342 zp->z_active = 0; 1343 mutex_exit(&zp->z_lock); 1344 zfs_znode_delete(zp, tx); 1345 VFS_RELE(zfsvfs->z_vfs); 1346 } else if (reaped) { 1347 zfs_dq_add(zp, tx); 1348 } 1349 1350 seq = zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name); 1351 1352 dmu_tx_commit(tx); 1353 out: 1354 zfs_dirent_unlock(dl); 1355 1356 if (!delete_now) { 1357 VN_RELE(vp); 1358 } else if (xzp) { 1359 /* this rele delayed to prevent nesting transactions */ 1360 VN_RELE(ZTOV(xzp)); 1361 } 1362 1363 zil_commit(zilog, seq, 0); 1364 1365 ZFS_EXIT(zfsvfs); 1366 return (error); 1367 } 1368 1369 /* 1370 * Create a new directory and insert it into dvp using the name 1371 * provided. Return a pointer to the inserted directory. 1372 * 1373 * IN: dvp - vnode of directory to add subdir to. 1374 * dirname - name of new directory. 1375 * vap - attributes of new directory. 1376 * cr - credentials of caller. 1377 * 1378 * OUT: vpp - vnode of created directory. 1379 * 1380 * RETURN: 0 if success 1381 * error code if failure 1382 * 1383 * Timestamps: 1384 * dvp - ctime|mtime updated 1385 * vp - ctime|mtime|atime updated 1386 */ 1387 static int 1388 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) 1389 { 1390 znode_t *zp, *dzp = VTOZ(dvp); 1391 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1392 zilog_t *zilog = zfsvfs->z_log; 1393 uint64_t seq = 0; 1394 zfs_dirlock_t *dl; 1395 uint64_t zoid = 0; 1396 dmu_tx_t *tx; 1397 int error; 1398 1399 ASSERT(vap->va_type == VDIR); 1400 1401 ZFS_ENTER(zfsvfs); 1402 1403 if (dzp->z_phys->zp_flags & ZFS_XATTR) { 1404 ZFS_EXIT(zfsvfs); 1405 return (EINVAL); 1406 } 1407 top: 1408 *vpp = NULL; 1409 1410 /* 1411 * First make sure the new directory doesn't exist. 1412 */ 1413 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) { 1414 ZFS_EXIT(zfsvfs); 1415 return (error); 1416 } 1417 1418 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) { 1419 zfs_dirent_unlock(dl); 1420 ZFS_EXIT(zfsvfs); 1421 return (error); 1422 } 1423 1424 /* 1425 * Add a new entry to the directory. 
1426 */ 1427 tx = dmu_tx_create(zfsvfs->z_os); 1428 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1429 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1430 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 1431 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1432 0, SPA_MAXBLOCKSIZE); 1433 error = dmu_tx_assign(tx, zfsvfs->z_assign); 1434 if (error) { 1435 dmu_tx_abort(tx); 1436 zfs_dirent_unlock(dl); 1437 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1438 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 1439 goto top; 1440 } 1441 ZFS_EXIT(zfsvfs); 1442 return (error); 1443 } 1444 1445 /* 1446 * Create new node. 1447 */ 1448 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 1449 1450 /* 1451 * Now put new name in parent dir. 1452 */ 1453 (void) zfs_link_create(dl, zp, tx, ZNEW); 1454 1455 *vpp = ZTOV(zp); 1456 1457 seq = zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname); 1458 dmu_tx_commit(tx); 1459 1460 zfs_dirent_unlock(dl); 1461 1462 zil_commit(zilog, seq, 0); 1463 1464 ZFS_EXIT(zfsvfs); 1465 return (0); 1466 } 1467 1468 /* 1469 * Remove a directory subdir entry. If the current working 1470 * directory is the same as the subdir to be removed, the 1471 * remove will fail. 1472 * 1473 * IN: dvp - vnode of directory to remove from. 1474 * name - name of directory to be removed. 1475 * cwd - vnode of current working directory. 1476 * cr - credentials of caller. 1477 * 1478 * RETURN: 0 if success 1479 * error code if failure 1480 * 1481 * Timestamps: 1482 * dvp - ctime|mtime updated 1483 */ 1484 static int 1485 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) 1486 { 1487 znode_t *dzp = VTOZ(dvp); 1488 znode_t *zp; 1489 vnode_t *vp; 1490 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1491 zilog_t *zilog = zfsvfs->z_log; 1492 uint64_t seq = 0; 1493 zfs_dirlock_t *dl; 1494 dmu_tx_t *tx; 1495 int error; 1496 1497 ZFS_ENTER(zfsvfs); 1498 1499 top: 1500 zp = NULL; 1501 1502 /* 1503 * Attempt to lock directory; fail if entry doesn't exist. 1504 */ 1505 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) { 1506 ZFS_EXIT(zfsvfs); 1507 return (error); 1508 } 1509 1510 vp = ZTOV(zp); 1511 1512 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1513 goto out; 1514 } 1515 1516 if (vp->v_type != VDIR) { 1517 error = ENOTDIR; 1518 goto out; 1519 } 1520 1521 if (vp == cwd) { 1522 error = EINVAL; 1523 goto out; 1524 } 1525 1526 vnevent_rmdir(vp); 1527 1528 /* 1529 * Grab a lock on the parent pointer make sure we play well 1530 * with the treewalk and directory rename code. 
	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		dmu_tx_abort(tx);
		rw_exit(&zp->z_parent_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
			goto top;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, 0, NULL);

	if (error == 0)
		seq = zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	zil_commit(zilog, seq, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
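/*
 * Sketch of the directory cookie space used by zfs_readdir() below:
 * offsets 0 and 1 are synthesized for "." and "..", offset 2 is ".zfs"
 * at the filesystem root, and everything past the synthetic entries is a
 * serialized zap cursor.  Because the low 4 bits of a zap cookie are
 * always zero, the synthetic offsets can never collide with a real
 * entry's cookie.
 */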
1664 */ 1665 iovp = uio->uio_iov; 1666 bytes_wanted = iovp->iov_len; 1667 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 1668 bufsize = bytes_wanted; 1669 outbuf = kmem_alloc(bufsize, KM_SLEEP); 1670 odp = (struct dirent64 *)outbuf; 1671 } else { 1672 bufsize = bytes_wanted; 1673 odp = (struct dirent64 *)iovp->iov_base; 1674 } 1675 1676 /* 1677 * Transform to file-system independent format 1678 */ 1679 outcount = 0; 1680 while (outcount < bytes_wanted) { 1681 /* 1682 * Special case `.', `..', and `.zfs'. 1683 */ 1684 if (offset == 0) { 1685 (void) strcpy(zap.za_name, "."); 1686 zap.za_first_integer = zp->z_id; 1687 this_reclen = DIRENT64_RECLEN(1); 1688 } else if (offset == 1) { 1689 (void) strcpy(zap.za_name, ".."); 1690 zap.za_first_integer = zp->z_phys->zp_parent; 1691 this_reclen = DIRENT64_RECLEN(2); 1692 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1693 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 1694 zap.za_first_integer = ZFSCTL_INO_ROOT; 1695 this_reclen = 1696 DIRENT64_RECLEN(sizeof (ZFS_CTLDIR_NAME) - 1); 1697 } else { 1698 /* 1699 * Grab next entry. 1700 */ 1701 if (error = zap_cursor_retrieve(&zc, &zap)) { 1702 if ((*eofp = (error == ENOENT)) != 0) 1703 break; 1704 else 1705 goto update; 1706 } 1707 1708 if (zap.za_integer_length != 8 || 1709 zap.za_num_integers != 1) { 1710 cmn_err(CE_WARN, "zap_readdir: bad directory " 1711 "entry, obj = %lld, offset = %lld\n", 1712 (u_longlong_t)zp->z_id, 1713 (u_longlong_t)offset); 1714 error = ENXIO; 1715 goto update; 1716 } 1717 this_reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 1718 } 1719 1720 /* 1721 * Will this entry fit in the buffer? 1722 */ 1723 if (outcount + this_reclen > bufsize) { 1724 /* 1725 * Did we manage to fit anything in the buffer? 1726 */ 1727 if (!outcount) { 1728 error = EINVAL; 1729 goto update; 1730 } 1731 break; 1732 } 1733 /* 1734 * Add this entry: 1735 */ 1736 odp->d_ino = (ino64_t)zap.za_first_integer; 1737 odp->d_reclen = (ushort_t)this_reclen; 1738 /* NOTE: d_off is the offset for the *next* entry */ 1739 next = &(odp->d_off); 1740 (void) strncpy(odp->d_name, zap.za_name, 1741 DIRENT64_NAMELEN(this_reclen)); 1742 outcount += this_reclen; 1743 odp = (dirent64_t *)((intptr_t)odp + this_reclen); 1744 1745 ASSERT(outcount <= bufsize); 1746 1747 /* Prefetch znode */ 1748 if (prefetch) 1749 dmu_prefetch(os, zap.za_first_integer, 0, 0); 1750 1751 /* 1752 * Move to the next entry, fill in the previous offset. 1753 */ 1754 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1755 zap_cursor_advance(&zc); 1756 offset = zap_cursor_serialize(&zc); 1757 } else { 1758 offset += 1; 1759 } 1760 *next = offset; 1761 } 1762 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1763 1764 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 1765 iovp->iov_base += outcount; 1766 iovp->iov_len -= outcount; 1767 uio->uio_resid -= outcount; 1768 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 1769 /* 1770 * Reset the pointer. 
1771 */ 1772 offset = uio->uio_loffset; 1773 } 1774 1775 update: 1776 zap_cursor_fini(&zc); 1777 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 1778 kmem_free(outbuf, bufsize); 1779 1780 if (error == ENOENT) 1781 error = 0; 1782 1783 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 1784 1785 uio->uio_loffset = offset; 1786 ZFS_EXIT(zfsvfs); 1787 return (error); 1788 } 1789 1790 static int 1791 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr) 1792 { 1793 znode_t *zp = VTOZ(vp); 1794 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1795 1796 /* 1797 * Regardless of whether this is required for standards conformance, 1798 * this is the logical behavior when fsync() is called on a file with 1799 * dirty pages. We use B_ASYNC since the ZIL transactions are already 1800 * going to be pushed out as part of the zil_commit(). 1801 */ 1802 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) && 1803 (vp->v_type == VREG) && !(IS_SWAPVP(vp))) 1804 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr); 1805 1806 ZFS_ENTER(zfsvfs); 1807 zil_commit(zfsvfs->z_log, zp->z_last_itx, FSYNC); 1808 ZFS_EXIT(zfsvfs); 1809 return (0); 1810 } 1811 1812 /* 1813 * Get the requested file attributes and place them in the provided 1814 * vattr structure. 1815 * 1816 * IN: vp - vnode of file. 1817 * vap - va_mask identifies requested attributes. 1818 * flags - [UNUSED] 1819 * cr - credentials of caller. 1820 * 1821 * OUT: vap - attribute values. 1822 * 1823 * RETURN: 0 (always succeeds) 1824 */ 1825 /* ARGSUSED */ 1826 static int 1827 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) 1828 { 1829 znode_t *zp = VTOZ(vp); 1830 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1831 znode_phys_t *pzp = zp->z_phys; 1832 int error; 1833 1834 ZFS_ENTER(zfsvfs); 1835 1836 /* 1837 * Return all attributes. It's cheaper to provide the answer 1838 * than to determine whether we were asked the question. 1839 */ 1840 mutex_enter(&zp->z_lock); 1841 1842 vap->va_type = vp->v_type; 1843 vap->va_mode = pzp->zp_mode & MODEMASK; 1844 vap->va_uid = zp->z_phys->zp_uid; 1845 vap->va_gid = zp->z_phys->zp_gid; 1846 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 1847 vap->va_nodeid = zp->z_id; 1848 vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */ 1849 vap->va_size = pzp->zp_size; 1850 vap->va_rdev = vp->v_rdev; 1851 vap->va_seq = zp->z_seq; 1852 1853 ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); 1854 ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); 1855 ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); 1856 1857 /* 1858 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 1859 * Also, if we are the owner don't bother, since owner should 1860 * always be allowed to read basic attributes of file. 1861 */ 1862 if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) && 1863 (zp->z_phys->zp_uid != crgetuid(cr))) { 1864 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) { 1865 mutex_exit(&zp->z_lock); 1866 ZFS_EXIT(zfsvfs); 1867 return (error); 1868 } 1869 } 1870 1871 mutex_exit(&zp->z_lock); 1872 1873 dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); 1874 1875 if (zp->z_blksz == 0) { 1876 /* 1877 * Block size hasn't been set; suggest maximal I/O transfers. 1878 */ 1879 vap->va_blksize = zfsvfs->z_max_blksz; 1880 } 1881 1882 ZFS_EXIT(zfsvfs); 1883 return (0); 1884 } 1885 1886 /* 1887 * Set the file attributes to the values contained in the 1888 * vattr structure. 1889 * 1890 * IN: vp - vnode of file to be modified. 1891 * vap - new attribute values. 1892 * flags - ATTR_UTIME set if non-default time values provided. 
/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 *	IN:	vp	- vnode of file to be modified.
 *		vap	- new attribute values.
 *		flags	- ATTR_UTIME set if non-default time values provided.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
 */
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	struct znode	*zp = VTOZ(vp);
	znode_phys_t	*pzp = zp->z_phys;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	uint64_t	seq = 0;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	int		trim_mask = FALSE;
	uint64_t	new_mode;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;

	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	if (mask & AT_SIZE && vp->v_type == VDIR)
		return (EISDIR);

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);

top:
	attrzp = NULL;

	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		do {
			if (err == ERESTART)
				txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME))
		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, cr);
				trim_mask = TRUE;
				saved_mask = vap->va_mask;
			} else {
				need_policy = TRUE;
			}
		} else {
			need_policy = TRUE;
		}
	}
	if (mask & AT_MODE)
		need_policy = TRUE;

	if (need_policy) {
		mutex_enter(&zp->z_lock);
		oldva.va_mode = pzp->zp_mode;
		oldva.va_uid = zp->z_phys->zp_uid;
		oldva.va_gid = zp->z_phys->zp_gid;
		mutex_exit(&zp->z_lock);

		/*
		 * If trim_mask is set then take ownership
		 * has been granted.  In that case remove
		 * UID|GID from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */
		if (trim_mask)
			vap->va_mask &= ~(AT_UID|AT_GID);

		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask)
			vap->va_mask |= (saved_mask & (AT_UID|AT_GID));
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (zp->z_phys->zp_acl.z_acl_extern_obj)
			dmu_tx_hold_write(tx,
			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
		else
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
	}

	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
		if (err) {
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		dmu_tx_hold_bonus(tx, attrzp->z_id);
	}

	err = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (err) {
		if (attrzp)
			VN_RELE(ZTOV(attrzp));
		dmu_tx_abort(tx);
		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
			goto top;
		}
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
		ASSERT3U(err, ==, 0);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
		if (attrzp) {
			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
		}
	}

	if (mask & AT_GID) {
		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
		if (attrzp)
			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
	}

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);

	if (mask != 0)
		seq = zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);

	mutex_exit(&zp->z_lock);

	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	dmu_tx_commit(tx);

	zil_commit(zilog, seq, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}
2162 * XXX - z_parent_lock can overlap with map or grow locks 2163 */ 2164 typedef struct zfs_zlock { 2165 krwlock_t *zl_rwlock; /* lock we acquired */ 2166 znode_t *zl_znode; /* znode we held */ 2167 struct zfs_zlock *zl_next; /* next in list */ 2168 } zfs_zlock_t; 2169 2170 static int 2171 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2172 { 2173 zfs_zlock_t *zl; 2174 znode_t *zp = tdzp; 2175 uint64_t rootid = zp->z_zfsvfs->z_root; 2176 uint64_t *oidp = &zp->z_id; 2177 krwlock_t *rwlp = &szp->z_parent_lock; 2178 krw_t rw = RW_WRITER; 2179 2180 /* 2181 * First pass write-locks szp and compares to zp->z_id. 2182 * Later passes read-lock zp and compare to zp->z_parent. 2183 */ 2184 do { 2185 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2186 zl->zl_rwlock = rwlp; 2187 zl->zl_znode = NULL; 2188 zl->zl_next = *zlpp; 2189 *zlpp = zl; 2190 2191 rw_enter(rwlp, rw); 2192 2193 if (*oidp == szp->z_id) /* We're a descendant of szp */ 2194 return (EINVAL); 2195 2196 if (*oidp == rootid) /* We've hit the top */ 2197 return (0); 2198 2199 if (rw == RW_READER) { /* i.e. not the first pass */ 2200 int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); 2201 if (error) 2202 return (error); 2203 zl->zl_znode = zp; 2204 } 2205 oidp = &zp->z_phys->zp_parent; 2206 rwlp = &zp->z_parent_lock; 2207 rw = RW_READER; 2208 2209 } while (zp->z_id != sdzp->z_id); 2210 2211 return (0); 2212 } 2213 2214 /* 2215 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2216 */ 2217 static void 2218 zfs_rename_unlock(zfs_zlock_t **zlpp) 2219 { 2220 zfs_zlock_t *zl; 2221 2222 while ((zl = *zlpp) != NULL) { 2223 if (zl->zl_znode != NULL) 2224 VN_RELE(ZTOV(zl->zl_znode)); 2225 rw_exit(zl->zl_rwlock); 2226 *zlpp = zl->zl_next; 2227 kmem_free(zl, sizeof (*zl)); 2228 } 2229 } 2230 2231 /* 2232 * Move an entry from the provided source directory to the target 2233 * directory. Change the entry name as indicated. 2234 * 2235 * IN: sdvp - Source directory containing the "old entry". 2236 * snm - Old entry name. 2237 * tdvp - Target directory to contain the "new entry". 2238 * tnm - New entry name. 2239 * cr - credentials of caller. 2240 * 2241 * RETURN: 0 if success 2242 * error code if failure 2243 * 2244 * Timestamps: 2245 * sdvp,tdvp - ctime|mtime updated 2246 */ 2247 static int 2248 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr) 2249 { 2250 znode_t *tdzp, *szp, *tzp; 2251 znode_t *sdzp = VTOZ(sdvp); 2252 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; 2253 zilog_t *zilog = zfsvfs->z_log; 2254 uint64_t seq = 0; 2255 vnode_t *realvp; 2256 zfs_dirlock_t *sdl, *tdl; 2257 dmu_tx_t *tx; 2258 zfs_zlock_t *zl; 2259 int cmp, serr, terr, error; 2260 2261 ZFS_ENTER(zfsvfs); 2262 2263 /* 2264 * Make sure we have the real vp for the target directory. 2265 */ 2266 if (VOP_REALVP(tdvp, &realvp) == 0) 2267 tdvp = realvp; 2268 2269 if (tdvp->v_vfsp != sdvp->v_vfsp) { 2270 ZFS_EXIT(zfsvfs); 2271 return (EXDEV); 2272 } 2273 2274 tdzp = VTOZ(tdvp); 2275 top: 2276 szp = NULL; 2277 tzp = NULL; 2278 zl = NULL; 2279 2280 /* 2281 * This is to prevent the creation of links into attribute space 2282 * by renaming a linked file into/out of an attribute directory. 2283 * See the comment in zfs_link() for why this is considered bad. 2284 */ 2285 if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != 2286 (sdzp->z_phys->zp_flags & ZFS_XATTR)) { 2287 ZFS_EXIT(zfsvfs); 2288 return (EINVAL); 2289 } 2290 2291 /* 2292 * Lock source and target directory entries. To prevent deadlock, 2293 * a lock ordering must be defined.
We lock the directory with 2294 * the smallest object id first, or if it's a tie, the one with 2295 * the lexically first name. 2296 */ 2297 if (sdzp->z_id < tdzp->z_id) { 2298 cmp = -1; 2299 } else if (sdzp->z_id > tdzp->z_id) { 2300 cmp = 1; 2301 } else { 2302 cmp = strcmp(snm, tnm); 2303 if (cmp == 0) { 2304 /* 2305 * POSIX: "If the old argument and the new argument 2306 * both refer to links to the same existing file, 2307 * the rename() function shall return successfully 2308 * and perform no other action." 2309 */ 2310 ZFS_EXIT(zfsvfs); 2311 return (0); 2312 } 2313 } 2314 if (cmp < 0) { 2315 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); 2316 terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); 2317 } else { 2318 terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0); 2319 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS); 2320 } 2321 2322 if (serr) { 2323 /* 2324 * Source entry invalid or not there. 2325 */ 2326 if (!terr) { 2327 zfs_dirent_unlock(tdl); 2328 if (tzp) 2329 VN_RELE(ZTOV(tzp)); 2330 } 2331 if (strcmp(snm, "..") == 0) 2332 serr = EINVAL; 2333 ZFS_EXIT(zfsvfs); 2334 return (serr); 2335 } 2336 if (terr) { 2337 zfs_dirent_unlock(sdl); 2338 VN_RELE(ZTOV(szp)); 2339 if (strcmp(tnm, "..") == 0) 2340 terr = EINVAL; 2341 ZFS_EXIT(zfsvfs); 2342 return (terr); 2343 } 2344 2345 /* 2346 * Must have write access at the source to remove the old entry 2347 * and write access at the target to create the new entry. 2348 * Note that if target and source are the same, this can be 2349 * done in a single check. 2350 */ 2351 2352 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 2353 goto out; 2354 2355 if (ZTOV(szp)->v_type == VDIR) { 2356 /* 2357 * Check to make sure rename is valid. 2358 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2359 */ 2360 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 2361 goto out; 2362 } 2363 2364 /* 2365 * Does target exist? 2366 */ 2367 if (tzp) { 2368 /* 2369 * Source and target must be the same type. 2370 */ 2371 if (ZTOV(szp)->v_type == VDIR) { 2372 if (ZTOV(tzp)->v_type != VDIR) { 2373 error = ENOTDIR; 2374 goto out; 2375 } 2376 } else { 2377 if (ZTOV(tzp)->v_type == VDIR) { 2378 error = EISDIR; 2379 goto out; 2380 } 2381 } 2382 /* 2383 * POSIX dictates that when the source and target 2384 * entries refer to the same file object, rename 2385 * must do nothing and exit without error. 
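* For example, if "a" and "b" are names for the same file, then rename("a", "b") returns 0 and leaves both names in place.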
2386 */ 2387 if (szp->z_id == tzp->z_id) { 2388 error = 0; 2389 goto out; 2390 } 2391 } 2392 2393 vnevent_rename_src(ZTOV(szp)); 2394 if (tzp) 2395 vnevent_rename_dest(ZTOV(tzp)); 2396 2397 tx = dmu_tx_create(zfsvfs->z_os); 2398 dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ 2399 dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ 2400 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 2401 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 2402 if (sdzp != tdzp) 2403 dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ 2404 if (tzp) 2405 dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ 2406 dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); 2407 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2408 if (error) { 2409 dmu_tx_abort(tx); 2410 if (zl != NULL) 2411 zfs_rename_unlock(&zl); 2412 zfs_dirent_unlock(sdl); 2413 zfs_dirent_unlock(tdl); 2414 VN_RELE(ZTOV(szp)); 2415 if (tzp) 2416 VN_RELE(ZTOV(tzp)); 2417 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2418 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 2419 goto top; 2420 } 2421 ZFS_EXIT(zfsvfs); 2422 return (error); 2423 } 2424 2425 if (tzp) /* Attempt to remove the existing target */ 2426 error = zfs_link_destroy(tdl, tzp, tx, 0, NULL); 2427 2428 if (error == 0) { 2429 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 2430 if (error == 0) { 2431 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 2432 ASSERT(error == 0); 2433 seq = zfs_log_rename(zilog, tx, TX_RENAME, 2434 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 2435 } 2436 } 2437 2438 dmu_tx_commit(tx); 2439 out: 2440 if (zl != NULL) 2441 zfs_rename_unlock(&zl); 2442 2443 zfs_dirent_unlock(sdl); 2444 zfs_dirent_unlock(tdl); 2445 2446 VN_RELE(ZTOV(szp)); 2447 if (tzp) 2448 VN_RELE(ZTOV(tzp)); 2449 2450 zil_commit(zilog, seq, 0); 2451 2452 ZFS_EXIT(zfsvfs); 2453 return (error); 2454 } 2455 2456 /* 2457 * Insert the indicated symbolic reference entry into the directory. 2458 * 2459 * IN: dvp - Directory to contain new symbolic link. 2460 * name - Name for new symlink entry. 2461 * vap - Attributes of new entry. 2462 * link - Target path of new symlink. 2463 * cr - credentials of caller. 2464 * 2465 * RETURN: 0 if success 2466 * error code if failure 2467 * 2468 * Timestamps: 2469 * dvp - ctime|mtime updated 2470 */ 2471 static int 2472 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr) 2473 { 2474 znode_t *zp, *dzp = VTOZ(dvp); 2475 zfs_dirlock_t *dl; 2476 dmu_tx_t *tx; 2477 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2478 zilog_t *zilog = zfsvfs->z_log; 2479 uint64_t seq = 0; 2480 uint64_t zoid; 2481 int len = strlen(link); 2482 int error; 2483 2484 ASSERT(vap->va_type == VLNK); 2485 2486 ZFS_ENTER(zfsvfs); 2487 top: 2488 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 2489 ZFS_EXIT(zfsvfs); 2490 return (error); 2491 } 2492 2493 if (len > MAXPATHLEN) { 2494 ZFS_EXIT(zfsvfs); 2495 return (ENAMETOOLONG); 2496 } 2497 2498 /* 2499 * Attempt to lock directory; fail if entry already exists.
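* (With the ZNEW flag, zfs_dirent_lock() is expected to fail with EEXIST when the name is already present, so a successful return here means the entry is ours to create.)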
2500 */ 2501 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) { 2502 ZFS_EXIT(zfsvfs); 2503 return (error); 2504 } 2505 2506 tx = dmu_tx_create(zfsvfs->z_os); 2507 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 2508 dmu_tx_hold_bonus(tx, dzp->z_id); 2509 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 2510 if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) 2511 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); 2512 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2513 if (error) { 2514 dmu_tx_abort(tx); 2515 zfs_dirent_unlock(dl); 2516 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2517 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 2518 goto top; 2519 } 2520 ZFS_EXIT(zfsvfs); 2521 return (error); 2522 } 2523 2524 dmu_buf_will_dirty(dzp->z_dbuf, tx); 2525 2526 /* 2527 * Create a new object for the symlink. 2528 * Put the link content into bonus buffer if it will fit; 2529 * otherwise, store it just like any other file data. 2530 */ 2531 zoid = 0; 2532 if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { 2533 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len); 2534 if (len != 0) 2535 bcopy(link, zp->z_phys + 1, len); 2536 } else { 2537 dmu_buf_t *dbp; 2538 2539 zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0); 2540 2541 /* 2542 * Nothing can access the znode yet so no locking needed 2543 * for growing the znode's blocksize. 2544 */ 2545 zfs_grow_blocksize(zp, len, tx); 2546 2547 VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); 2548 dmu_buf_will_dirty(dbp, tx); 2549 2550 ASSERT3U(len, <=, dbp->db_size); 2551 bcopy(link, dbp->db_data, len); 2552 dmu_buf_rele(dbp, FTAG); 2553 } 2554 zp->z_phys->zp_size = len; 2555 2556 /* 2557 * Insert the new object into the directory. 2558 */ 2559 (void) zfs_link_create(dl, zp, tx, ZNEW); 2560 out: 2561 if (error == 0) 2562 seq = zfs_log_symlink(zilog, tx, TX_SYMLINK, 2563 dzp, zp, name, link); 2564 2565 dmu_tx_commit(tx); 2566 2567 zfs_dirent_unlock(dl); 2568 2569 VN_RELE(ZTOV(zp)); 2570 2571 zil_commit(zilog, seq, 0); 2572 2573 ZFS_EXIT(zfsvfs); 2574 return (error); 2575 } 2576 2577 /* 2578 * Return, in the buffer contained in the provided uio structure, 2579 * the symbolic path referred to by vp. 2580 * 2581 * IN: vp - vnode of symbolic link. 2582 * uio - structure to contain the link path. 2583 * cr - credentials of caller. 2584 * 2585 * OUT: uio - structure to contain the link path. 2586 * 2587 * RETURN: 0 if success 2588 * error code if failure 2589 * 2590 * Timestamps: 2591 * vp - atime updated 2592 */ 2593 /* ARGSUSED */ 2594 static int 2595 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) 2596 { 2597 znode_t *zp = VTOZ(vp); 2598 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2599 size_t bufsz; 2600 int error; 2601 2602 ZFS_ENTER(zfsvfs); 2603 2604 bufsz = (size_t)zp->z_phys->zp_size; 2605 if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { 2606 error = uiomove(zp->z_phys + 1, 2607 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 2608 } else { 2609 dmu_buf_t *dbp; 2610 error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); 2611 if (error) { 2612 ZFS_EXIT(zfsvfs); 2613 return (error); 2614 } 2615 error = uiomove(dbp->db_data, 2616 MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); 2617 dmu_buf_rele(dbp, FTAG); 2618 } 2619 2620 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2621 ZFS_EXIT(zfsvfs); 2622 return (error); 2623 } 2624 2625 /* 2626 * Insert a new entry into directory tdvp referencing svp. 2627 * 2628 * IN: tdvp - Directory to contain new entry. 2629 * svp - vnode of new entry. 2630 * name - name of new entry.
2631 * cr - credentials of caller. 2632 * 2633 * RETURN: 0 if success 2634 * error code if failure 2635 * 2636 * Timestamps: 2637 * tdvp - ctime|mtime updated 2638 * svp - ctime updated 2639 */ 2640 /* ARGSUSED */ 2641 static int 2642 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr) 2643 { 2644 znode_t *dzp = VTOZ(tdvp); 2645 znode_t *tzp, *szp; 2646 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2647 zilog_t *zilog = zfsvfs->z_log; 2648 uint64_t seq = 0; 2649 zfs_dirlock_t *dl; 2650 dmu_tx_t *tx; 2651 vnode_t *realvp; 2652 int error; 2653 2654 ASSERT(tdvp->v_type == VDIR); 2655 2656 ZFS_ENTER(zfsvfs); 2657 2658 if (VOP_REALVP(svp, &realvp) == 0) 2659 svp = realvp; 2660 2661 if (svp->v_vfsp != tdvp->v_vfsp) { 2662 ZFS_EXIT(zfsvfs); 2663 return (EXDEV); 2664 } 2665 2666 szp = VTOZ(svp); 2667 top: 2668 /* 2669 * We do not support links between attributes and non-attributes 2670 * because of the potential security risk of creating links 2671 * into "normal" file space in order to circumvent restrictions 2672 * imposed in attribute space. 2673 */ 2674 if ((szp->z_phys->zp_flags & ZFS_XATTR) != 2675 (dzp->z_phys->zp_flags & ZFS_XATTR)) { 2676 ZFS_EXIT(zfsvfs); 2677 return (EINVAL); 2678 } 2679 2680 /* 2681 * POSIX dictates that we return EPERM here. 2682 * Better choices include ENOTSUP or EISDIR. 2683 */ 2684 if (svp->v_type == VDIR) { 2685 ZFS_EXIT(zfsvfs); 2686 return (EPERM); 2687 } 2688 2689 if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) && 2690 secpolicy_basic_link(cr) != 0) { 2691 ZFS_EXIT(zfsvfs); 2692 return (EPERM); 2693 } 2694 2695 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) { 2696 ZFS_EXIT(zfsvfs); 2697 return (error); 2698 } 2699 2700 /* 2701 * Attempt to lock directory; fail if entry already exists. 2702 */ 2703 if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) { 2704 ZFS_EXIT(zfsvfs); 2705 return (error); 2706 } 2707 2708 tx = dmu_tx_create(zfsvfs->z_os); 2709 dmu_tx_hold_bonus(tx, szp->z_id); 2710 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 2711 error = dmu_tx_assign(tx, zfsvfs->z_assign); 2712 if (error) { 2713 dmu_tx_abort(tx); 2714 zfs_dirent_unlock(dl); 2715 if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2716 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 2717 goto top; 2718 } 2719 ZFS_EXIT(zfsvfs); 2720 return (error); 2721 } 2722 2723 error = zfs_link_create(dl, szp, tx, 0); 2724 2725 if (error == 0) 2726 seq = zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name); 2727 2728 dmu_tx_commit(tx); 2729 2730 zfs_dirent_unlock(dl); 2731 2732 zil_commit(zilog, seq, 0); 2733 2734 ZFS_EXIT(zfsvfs); 2735 return (error); 2736 } 2737 2738 /* 2739 * zfs_null_putapage() is used when the file system has been force 2740 * unmounted. It just drops the pages. 2741 */ 2742 /* ARGSUSED */ 2743 static int 2744 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 2745 size_t *lenp, int flags, cred_t *cr) 2746 { 2747 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 2748 return (0); 2749 } 2750 2751 /* ARGSUSED */ 2752 static int 2753 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 2754 size_t *lenp, int flags, cred_t *cr) 2755 { 2756 znode_t *zp = VTOZ(vp); 2757 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2758 zilog_t *zilog = zfsvfs->z_log; 2759 dmu_tx_t *tx; 2760 rl_t *rl; 2761 u_offset_t off; 2762 ssize_t len; 2763 caddr_t va; 2764 int err; 2765 2766 top: 2767 off = pp->p_offset; 2768 rl = zfs_range_lock(zp, off, PAGESIZE, RL_WRITER); 2769 /* 2770 * Can't push pages past end-of-file. 
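* (The RL_WRITER range lock taken above should keep a concurrent truncate from moving zp_size out from under this check.)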
2771 */ 2772 if (off >= zp->z_phys->zp_size) { 2773 zfs_range_unlock(zp, rl); 2774 return (EIO); 2775 } 2776 len = MIN(PAGESIZE, zp->z_phys->zp_size - off); 2777 2778 tx = dmu_tx_create(zfsvfs->z_os); 2779 dmu_tx_hold_write(tx, zp->z_id, off, len); 2780 dmu_tx_hold_bonus(tx, zp->z_id); 2781 err = dmu_tx_assign(tx, zfsvfs->z_assign); 2782 if (err != 0) { 2783 dmu_tx_abort(tx); 2784 zfs_range_unlock(zp, rl); 2785 if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 2786 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 2787 goto top; 2788 } 2789 goto out; 2790 } 2791 2792 va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); 2793 2794 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 2795 2796 ppmapout(va); 2797 2798 zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 2799 (void) zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL); 2800 dmu_tx_commit(tx); 2801 2802 zfs_range_unlock(zp, rl); 2803 2804 pvn_write_done(pp, B_WRITE | flags); 2805 if (offp) 2806 *offp = off; 2807 if (lenp) 2808 *lenp = len; 2809 2810 out: 2811 return (err); 2812 } 2813 2814 /* 2815 * Copy the portion of the file indicated from pages into the file. 2816 * The pages are stored in a page list attached to the file's vnode. 2817 * 2818 * IN: vp - vnode of file to push page data to. 2819 * off - position in file to put data. 2820 * len - amount of data to write. 2821 * flags - flags to control the operation. 2822 * cr - credentials of caller. 2823 * 2824 * RETURN: 0 if success 2825 * error code if failure 2826 * 2827 * Timestamps: 2828 * vp - ctime|mtime updated 2829 */ 2830 static int 2831 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 2832 { 2833 znode_t *zp = VTOZ(vp); 2834 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2835 page_t *pp; 2836 size_t io_len; 2837 u_offset_t io_off; 2838 uint64_t filesz; 2839 int error = 0; 2840 2841 ZFS_ENTER(zfsvfs); 2842 2843 ASSERT(zp->z_dbuf_held && zp->z_phys); 2844 2845 if (len == 0) { 2846 /* 2847 * Search the entire vp list for pages >= off. 2848 */ 2849 error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage, 2850 flags, cr); 2851 goto out; 2852 } 2853 2854 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ 2855 if (off > filesz) { 2856 /* past end of file */ 2857 ZFS_EXIT(zfsvfs); 2858 return (0); 2859 } 2860 2861 len = MIN(len, filesz - off); 2862 2863 for (io_off = off; io_off < off + len; io_off += io_len) { 2864 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 2865 pp = page_lookup(vp, io_off, 2866 (flags & (B_INVAL | B_FREE)) ? 2867 SE_EXCL : SE_SHARED); 2868 } else { 2869 pp = page_lookup_nowait(vp, io_off, 2870 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2871 } 2872 2873 if (pp != NULL && pvn_getdirty(pp, flags)) { 2874 int err; 2875 2876 /* 2877 * Found a dirty page to push 2878 */ 2879 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 2880 if (err) 2881 error = err; 2882 } else { 2883 io_len = PAGESIZE; 2884 } 2885 } 2886 out: 2887 zil_commit(zfsvfs->z_log, UINT64_MAX, (flags & B_ASYNC) ?
0 : FDSYNC); 2888 ZFS_EXIT(zfsvfs); 2889 return (error); 2890 } 2891 2892 void 2893 zfs_inactive(vnode_t *vp, cred_t *cr) 2894 { 2895 znode_t *zp = VTOZ(vp); 2896 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2897 int error; 2898 2899 rw_enter(&zfsvfs->z_um_lock, RW_READER); 2900 if (zfsvfs->z_unmounted2) { 2901 ASSERT(zp->z_dbuf_held == 0); 2902 2903 if (vn_has_cached_data(vp)) { 2904 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, 2905 B_INVAL, cr); 2906 } 2907 2908 mutex_enter(&zp->z_lock); 2909 vp->v_count = 0; /* count arrives as 1 */ 2910 if (zp->z_dbuf == NULL) { 2911 mutex_exit(&zp->z_lock); 2912 zfs_znode_free(zp); 2913 } else { 2914 mutex_exit(&zp->z_lock); 2915 } 2916 rw_exit(&zfsvfs->z_um_lock); 2917 VFS_RELE(zfsvfs->z_vfs); 2918 return; 2919 } 2920 2921 /* 2922 * Attempt to push any data in the page cache. If this fails 2923 * we will get kicked out later in zfs_zinactive(). 2924 */ 2925 if (vn_has_cached_data(vp)) { 2926 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, 2927 cr); 2928 } 2929 2930 if (zp->z_atime_dirty && zp->z_reap == 0) { 2931 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 2932 2933 dmu_tx_hold_bonus(tx, zp->z_id); 2934 error = dmu_tx_assign(tx, TXG_WAIT); 2935 if (error) { 2936 dmu_tx_abort(tx); 2937 } else { 2938 dmu_buf_will_dirty(zp->z_dbuf, tx); 2939 mutex_enter(&zp->z_lock); 2940 zp->z_atime_dirty = 0; 2941 mutex_exit(&zp->z_lock); 2942 dmu_tx_commit(tx); 2943 } 2944 } 2945 2946 zfs_zinactive(zp); 2947 rw_exit(&zfsvfs->z_um_lock); 2948 } 2949 2950 /* 2951 * Bounds-check the seek operation. 2952 * 2953 * IN: vp - vnode seeking within 2954 * ooff - old file offset 2955 * noffp - pointer to new file offset 2956 * 2957 * RETURN: 0 if success 2958 * EINVAL if new offset invalid 2959 */ 2960 /* ARGSUSED */ 2961 static int 2962 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 2963 { 2964 if (vp->v_type == VDIR) 2965 return (0); 2966 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 2967 } 2968 2969 /* 2970 * Pre-filter the generic locking function to trap attempts to place 2971 * a mandatory lock on a memory mapped file. 2972 */ 2973 static int 2974 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 2975 flk_callback_t *flk_cbp, cred_t *cr) 2976 { 2977 znode_t *zp = VTOZ(vp); 2978 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2979 int error; 2980 2981 ZFS_ENTER(zfsvfs); 2982 2983 /* 2984 * We are following the UFS semantics with respect to mapcnt 2985 * here: If we see that the file is mapped already, then we will 2986 * return an error, but we don't worry about races between this 2987 * function and zfs_map(). 2988 */ 2989 if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { 2990 ZFS_EXIT(zfsvfs); 2991 return (EAGAIN); 2992 } 2993 error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr); 2994 ZFS_EXIT(zfsvfs); 2995 return (error); 2996 } 2997 2998 /* 2999 * If we can't find a page in the cache, we will create a new page 3000 * and fill it with file data. For efficiency, we may try to fill 3001 * multiple pages at once (klustering). 3002 */ 3003 static int 3004 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 3005 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 3006 { 3007 znode_t *zp = VTOZ(vp); 3008 page_t *pp, *cur_pp; 3009 objset_t *os = zp->z_zfsvfs->z_os; 3010 caddr_t va; 3011 u_offset_t io_off, total; 3012 uint64_t oid = zp->z_id; 3013 size_t io_len; 3014 uint64_t filesz; 3015 int err; 3016 3017 /* 3018 * If we are only asking for a single page don't bother klustering. 
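* A kluster fills a block's worth of pages in one pass -- e.g. sixteen 8K pages for a 128K block -- rather than taking a separate fault for each page.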
3019 */ 3020 filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ 3021 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE || off > filesz) { 3022 io_off = off; 3023 io_len = PAGESIZE; 3024 pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); 3025 } else { 3026 /* 3027 * Try to fill a kluster of pages (a block's worth). 3028 */ 3029 size_t klen; 3030 u_offset_t koff; 3031 3032 if (!ISP2(zp->z_blksz)) { 3033 /* Only one block in the file. */ 3034 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 3035 koff = 0; 3036 } else { 3037 klen = plsz; 3038 koff = P2ALIGN(off, (u_offset_t)klen); 3039 } 3040 ASSERT(koff <= filesz); 3041 if (koff + klen > filesz) 3042 klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; 3043 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 3044 &io_len, koff, klen, 0); 3045 } 3046 if (pp == NULL) { 3047 /* 3048 * Some other thread entered the page before us. 3049 * Return to zfs_getpage to retry the lookup. 3050 */ 3051 *pl = NULL; 3052 return (0); 3053 } 3054 3055 /* 3056 * Fill the pages in the kluster. 3057 */ 3058 cur_pp = pp; 3059 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 3060 ASSERT(io_off == cur_pp->p_offset); 3061 va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1); 3062 err = dmu_read(os, oid, io_off, PAGESIZE, va); 3063 ppmapout(va); 3064 if (err) { 3065 /* On error, toss the entire kluster */ 3066 pvn_read_done(pp, B_ERROR); 3067 return (err); 3068 } 3069 cur_pp = cur_pp->p_next; 3070 } 3071 3072 /* 3073 * Fill in the page list array from the kluster. If 3074 * there are too many pages in the kluster, return 3075 * as many pages as possible starting from the desired 3076 * offset `off'. 3077 * NOTE: the page list will always be null terminated. 3078 */ 3079 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 3080 3081 return (0); 3082 } 3083 3084 /* 3085 * Return pointers to the pages for the file region [off, off + len] 3086 * in the pl array. If plsz is greater than len, this function may 3087 * also return page pointers from before or after the specified 3088 * region (i.e. some region [off', off' + plsz]). These additional 3089 * pages are only returned if they are already in the cache, or were 3090 * created as part of a klustered read. 3091 * 3092 * IN: vp - vnode of file to get data from. 3093 * off - position in file to get data from. 3094 * len - amount of data to retrieve. 3095 * plsz - length of provided page list. 3096 * seg - segment to obtain pages for. 3097 * addr - virtual address of fault. 3098 * rw - mode of created pages. 3099 * cr - credentials of caller. 3100 * 3101 * OUT: protp - protection mode of created pages. 3102 * pl - list of pages created. 3103 * 3104 * RETURN: 0 if success 3105 * error code if failure 3106 * 3107 * Timestamps: 3108 * vp - atime updated 3109 */ 3110 /* ARGSUSED */ 3111 static int 3112 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 3113 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 3114 enum seg_rw rw, cred_t *cr) 3115 { 3116 znode_t *zp = VTOZ(vp); 3117 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3118 page_t *pp, **pl0 = pl; 3119 rl_t *rl; 3120 int cnt = 0, need_unlock = 0, err = 0; 3121 3122 ZFS_ENTER(zfsvfs); 3123 3124 if (protp) 3125 *protp = PROT_ALL; 3126 3127 ASSERT(zp->z_dbuf_held && zp->z_phys); 3128 3129 /* no faultahead (for now) */ 3130 if (pl == NULL) { 3131 ZFS_EXIT(zfsvfs); 3132 return (0); 3133 } 3134 3135 /* 3136 * Make sure nobody restructures the file in the middle of the getpage.
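* Taking the range lock as RL_READER still admits concurrent readers; restructuring operations such as a truncate or a block-size change take the range lock as RL_WRITER and are held off until the getpage completes.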
3137 */ 3138 rl = zfs_range_lock(zp, off, len, RL_READER); 3139 3140 /* can't fault past EOF */ 3141 if (off >= zp->z_phys->zp_size) { 3142 zfs_range_unlock(zp, rl); 3143 ZFS_EXIT(zfsvfs); 3144 return (EFAULT); 3145 } 3146 3147 /* 3148 * If we already own the lock, then we must be page faulting 3149 * in the middle of a write to this file (i.e., we are writing 3150 * to this file using data from a mapped region of the file). 3151 */ 3152 if (!rw_owner(&zp->z_map_lock)) { 3153 rw_enter(&zp->z_map_lock, RW_WRITER); 3154 need_unlock = TRUE; 3155 } 3156 3157 /* 3158 * Loop through the requested range [off, off + len] looking 3159 * for pages. If we don't find a page, we will need to create 3160 * a new page and fill it with data from the file. 3161 */ 3162 while (len > 0) { 3163 if (plsz < PAGESIZE) 3164 break; 3165 if (pp = page_lookup(vp, off, SE_SHARED)) { 3166 *pl++ = pp; 3167 off += PAGESIZE; 3168 addr += PAGESIZE; 3169 len -= PAGESIZE; 3170 plsz -= PAGESIZE; 3171 } else { 3172 err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); 3173 /* 3174 * klustering may have changed our region 3175 * to be block aligned. 3176 */ 3177 if (((pp = *pl) != 0) && (off != pp->p_offset)) { 3178 int delta = off - pp->p_offset; 3179 len += delta; 3180 off -= delta; 3181 addr -= delta; 3182 } 3183 while (*pl) { 3184 pl++; 3185 cnt++; 3186 off += PAGESIZE; 3187 addr += PAGESIZE; 3188 plsz -= PAGESIZE; 3189 if (len > PAGESIZE) 3190 len -= PAGESIZE; 3191 else 3192 len = 0; 3193 } 3194 if (err) { 3195 /* 3196 * Release any pages we have locked. 3197 */ 3198 while (pl > pl0) 3199 page_unlock(*--pl); 3200 goto out; 3201 } 3202 } 3203 } 3204 3205 /* 3206 * Fill out the page array with any pages already in the cache. 3207 */ 3208 while (plsz > 0) { 3209 pp = page_lookup_nowait(vp, off, SE_SHARED); 3210 if (pp == NULL) 3211 break; 3212 *pl++ = pp; 3213 off += PAGESIZE; 3214 plsz -= PAGESIZE; 3215 } 3216 3217 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3218 out: 3219 *pl = NULL; 3220 3221 if (need_unlock) 3222 rw_exit(&zp->z_map_lock); 3223 zfs_range_unlock(zp, rl); 3224 3225 ZFS_EXIT(zfsvfs); 3226 return (err); 3227 } 3228 3229 /* 3230 * Request a memory map for a section of a file. This code interacts 3231 * with common code and the VM system as follows: 3232 * 3233 * common code calls mmap(), which ends up in smmap_common() 3234 * 3235 * this calls VOP_MAP(), which takes you into (say) zfs 3236 * 3237 * zfs_map() calls as_map(), passing segvn_create() as the callback 3238 * 3239 * segvn_create() creates the new segment and calls VOP_ADDMAP() 3240 * 3241 * zfs_addmap() updates z_mapcnt 3242 */ 3243 static int 3244 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 3245 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 3246 { 3247 znode_t *zp = VTOZ(vp); 3248 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3249 segvn_crargs_t vn_a; 3250 int error; 3251 3252 ZFS_ENTER(zfsvfs); 3253 3254 if (vp->v_flag & VNOMAP) { 3255 ZFS_EXIT(zfsvfs); 3256 return (ENOSYS); 3257 } 3258 3259 if (off < 0 || len > MAXOFFSET_T - off) { 3260 ZFS_EXIT(zfsvfs); 3261 return (ENXIO); 3262 } 3263 3264 if (vp->v_type != VREG) { 3265 ZFS_EXIT(zfsvfs); 3266 return (ENODEV); 3267 } 3268 3269 /* 3270 * If file is locked, disallow mapping. 
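* This is the converse of the zfs_frlock() check above: there, an existing mapping forbids mandatory locking; here, an existing mandatory lock forbids a new mapping. Whichever is established first wins.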
3271 */ 3272 if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { 3273 ZFS_EXIT(zfsvfs); 3274 return (EAGAIN); 3275 } 3276 3277 as_rangelock(as); 3278 if ((flags & MAP_FIXED) == 0) { 3279 map_addr(addrp, len, off, 1, flags); 3280 if (*addrp == NULL) { 3281 as_rangeunlock(as); 3282 ZFS_EXIT(zfsvfs); 3283 return (ENOMEM); 3284 } 3285 } else { 3286 /* 3287 * User specified address - blow away any previous mappings 3288 */ 3289 (void) as_unmap(as, *addrp, len); 3290 } 3291 3292 vn_a.vp = vp; 3293 vn_a.offset = (u_offset_t)off; 3294 vn_a.type = flags & MAP_TYPE; 3295 vn_a.prot = prot; 3296 vn_a.maxprot = maxprot; 3297 vn_a.cred = cr; 3298 vn_a.amp = NULL; 3299 vn_a.flags = flags & ~MAP_TYPE; 3300 vn_a.szc = 0; 3301 vn_a.lgrp_mem_policy_flags = 0; 3302 3303 error = as_map(as, *addrp, len, segvn_create, &vn_a); 3304 3305 as_rangeunlock(as); 3306 ZFS_EXIT(zfsvfs); 3307 return (error); 3308 } 3309 3310 /* ARGSUSED */ 3311 static int 3312 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 3313 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 3314 { 3315 uint64_t pages = btopr(len); 3316 3317 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 3318 return (0); 3319 } 3320 3321 /* 3322 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 3323 * more accurate mtime for the associated file. Since we don't have a way of 3324 * detecting when the data was actually modified, we have to resort to 3325 * heuristics. If an explicit msync() is done, then we mark the mtime when the 3326 * last page is pushed. The problem occurs when the msync() call is omitted, 3327 * which is by far the most common case: 3328 * 3329 * open() 3330 * mmap() 3331 * <modify memory> 3332 * munmap() 3333 * close() 3334 * <time lapse> 3335 * putpage() via fsflush 3336 * 3337 * If we wait for fsflush to come along, we can end up with a modification 3338 * time that is some arbitrary point in the future. In order to prevent this in the 3339 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 3340 * torn down. 3341 */ 3342 /* ARGSUSED */ 3343 static int 3344 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 3345 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 3346 { 3347 uint64_t pages = btopr(len); 3348 3349 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 3350 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 3351 3352 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 3353 vn_has_cached_data(vp)) 3354 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr); 3355 3356 return (0); 3357 } 3358 3359 /* 3360 * Free or allocate space in a file. Currently, this function only 3361 * supports the `F_FREESP' command. However, this command is somewhat 3362 * misnamed, as its functionality includes the ability to allocate as 3363 * well as free space. 3364 * 3365 * IN: vp - vnode of file to free data in. 3366 * cmd - action to take (only F_FREESP supported). 3367 * bfp - section of file to free/alloc. 3368 * flag - current file open mode flags. 3369 * offset - current file offset. 3370 * cr - credentials of caller [UNUSED].
3371 * 3372 * RETURN: 0 if success 3373 * error code if failure 3374 * 3375 * Timestamps: 3376 * vp - ctime|mtime updated 3377 */ 3378 /* ARGSUSED */ 3379 static int 3380 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 3381 offset_t offset, cred_t *cr, caller_context_t *ct) 3382 { 3383 znode_t *zp = VTOZ(vp); 3384 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3385 uint64_t off, len; 3386 int error; 3387 3388 ZFS_ENTER(zfsvfs); 3389 3390 top: 3391 if (cmd != F_FREESP) { 3392 ZFS_EXIT(zfsvfs); 3393 return (EINVAL); 3394 } 3395 3396 if (error = convoff(vp, bfp, 0, offset)) { 3397 ZFS_EXIT(zfsvfs); 3398 return (error); 3399 } 3400 3401 if (bfp->l_len < 0) { 3402 ZFS_EXIT(zfsvfs); 3403 return (EINVAL); 3404 } 3405 3406 off = bfp->l_start; 3407 len = bfp->l_len; /* 0 means from off to end of file */ 3408 3409 do { 3410 if (error == ERESTART) 3411 txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0); 3412 error = zfs_freesp(zp, off, len, flag, TRUE); 3413 } while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT); 3414 3415 ZFS_EXIT(zfsvfs); 3416 return (error); 3417 } 3418 3419 static int 3420 zfs_fid(vnode_t *vp, fid_t *fidp) 3421 { 3422 znode_t *zp = VTOZ(vp); 3423 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3424 uint32_t gen = (uint32_t)zp->z_phys->zp_gen; 3425 uint64_t object = zp->z_id; 3426 zfid_short_t *zfid; 3427 int size, i; 3428 3429 ZFS_ENTER(zfsvfs); 3430 3431 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 3432 if (fidp->fid_len < size) { 3433 fidp->fid_len = size; 3434 ZFS_EXIT(zfsvfs); 3435 return (ENOSPC); 3436 } 3437 3438 zfid = (zfid_short_t *)fidp; 3439 3440 zfid->zf_len = size; 3441 3442 for (i = 0; i < sizeof (zfid->zf_object); i++) 3443 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 3444 3445 /* Must have a non-zero generation number to distinguish from .zfs */ 3446 if (gen == 0) 3447 gen = 1; 3448 for (i = 0; i < sizeof (zfid->zf_gen); i++) 3449 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 3450 3451 if (size == LONG_FID_LEN) { 3452 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 3453 zfid_long_t *zlfid; 3454 3455 zlfid = (zfid_long_t *)fidp; 3456 3457 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 3458 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 3459 3460 /* XXX - this should be the generation number for the objset */ 3461 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 3462 zlfid->zf_setgen[i] = 0; 3463 } 3464 3465 ZFS_EXIT(zfsvfs); 3466 return (0); 3467 } 3468 3469 static int 3470 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 3471 { 3472 znode_t *zp, *xzp; 3473 zfsvfs_t *zfsvfs; 3474 zfs_dirlock_t *dl; 3475 int error; 3476 3477 switch (cmd) { 3478 case _PC_LINK_MAX: 3479 *valp = ULONG_MAX; 3480 return (0); 3481 3482 case _PC_FILESIZEBITS: 3483 *valp = 64; 3484 return (0); 3485 3486 case _PC_XATTR_EXISTS: 3487 zp = VTOZ(vp); 3488 zfsvfs = zp->z_zfsvfs; 3489 ZFS_ENTER(zfsvfs); 3490 *valp = 0; 3491 error = zfs_dirent_lock(&dl, zp, "", &xzp, 3492 ZXATTR | ZEXISTS | ZSHARED); 3493 if (error == 0) { 3494 zfs_dirent_unlock(dl); 3495 if (!zfs_dirempty(xzp)) 3496 *valp = 1; 3497 VN_RELE(ZTOV(xzp)); 3498 } else if (error == ENOENT) { 3499 /* 3500 * If there aren't extended attributes, it's the 3501 * same as having zero of them. 
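* (ENOENT just means that no attribute directory was ever created, so the *valp of zero set above stands and the call succeeds.)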
3502 */ 3503 error = 0; 3504 } 3505 ZFS_EXIT(zfsvfs); 3506 return (error); 3507 3508 case _PC_ACL_ENABLED: 3509 *valp = _ACL_ACE_ENABLED; 3510 return (0); 3511 3512 case _PC_MIN_HOLE_SIZE: 3513 *valp = (ulong_t)SPA_MINBLOCKSIZE; 3514 return (0); 3515 3516 default: 3517 return (fs_pathconf(vp, cmd, valp, cr)); 3518 } 3519 } 3520 3521 /*ARGSUSED*/ 3522 static int 3523 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) 3524 { 3525 znode_t *zp = VTOZ(vp); 3526 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3527 int error; 3528 3529 ZFS_ENTER(zfsvfs); 3530 error = zfs_getacl(zp, vsecp, cr); 3531 ZFS_EXIT(zfsvfs); 3532 3533 return (error); 3534 } 3535 3536 /*ARGSUSED*/ 3537 static int 3538 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr) 3539 { 3540 znode_t *zp = VTOZ(vp); 3541 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3542 int error; 3543 3544 ZFS_ENTER(zfsvfs); 3545 error = zfs_setacl(zp, vsecp, cr); 3546 ZFS_EXIT(zfsvfs); 3547 return (error); 3548 } 3549 3550 /* 3551 * Predeclare these here so that the compiler assumes that 3552 * this is an "old style" function declaration that does 3553 * not include arguments => we won't get type mismatch errors 3554 * in the initializations that follow. 3555 */ 3556 static int zfs_inval(); 3557 static int zfs_isdir(); 3558 3559 static int 3560 zfs_inval() 3561 { 3562 return (EINVAL); 3563 } 3564 3565 static int 3566 zfs_isdir() 3567 { 3568 return (EISDIR); 3569 } 3570 /* 3571 * Directory vnode operations template 3572 */ 3573 vnodeops_t *zfs_dvnodeops; 3574 const fs_operation_def_t zfs_dvnodeops_template[] = { 3575 VOPNAME_OPEN, zfs_open, 3576 VOPNAME_CLOSE, zfs_close, 3577 VOPNAME_READ, zfs_isdir, 3578 VOPNAME_WRITE, zfs_isdir, 3579 VOPNAME_IOCTL, zfs_ioctl, 3580 VOPNAME_GETATTR, zfs_getattr, 3581 VOPNAME_SETATTR, zfs_setattr, 3582 VOPNAME_ACCESS, zfs_access, 3583 VOPNAME_LOOKUP, zfs_lookup, 3584 VOPNAME_CREATE, zfs_create, 3585 VOPNAME_REMOVE, zfs_remove, 3586 VOPNAME_LINK, zfs_link, 3587 VOPNAME_RENAME, zfs_rename, 3588 VOPNAME_MKDIR, zfs_mkdir, 3589 VOPNAME_RMDIR, zfs_rmdir, 3590 VOPNAME_READDIR, zfs_readdir, 3591 VOPNAME_SYMLINK, zfs_symlink, 3592 VOPNAME_FSYNC, zfs_fsync, 3593 VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, 3594 VOPNAME_FID, zfs_fid, 3595 VOPNAME_SEEK, zfs_seek, 3596 VOPNAME_PATHCONF, zfs_pathconf, 3597 VOPNAME_GETSECATTR, zfs_getsecattr, 3598 VOPNAME_SETSECATTR, zfs_setsecattr, 3599 NULL, NULL 3600 }; 3601 3602 /* 3603 * Regular file vnode operations template 3604 */ 3605 vnodeops_t *zfs_fvnodeops; 3606 const fs_operation_def_t zfs_fvnodeops_template[] = { 3607 VOPNAME_OPEN, zfs_open, 3608 VOPNAME_CLOSE, zfs_close, 3609 VOPNAME_READ, zfs_read, 3610 VOPNAME_WRITE, zfs_write, 3611 VOPNAME_IOCTL, zfs_ioctl, 3612 VOPNAME_GETATTR, zfs_getattr, 3613 VOPNAME_SETATTR, zfs_setattr, 3614 VOPNAME_ACCESS, zfs_access, 3615 VOPNAME_LOOKUP, zfs_lookup, 3616 VOPNAME_RENAME, zfs_rename, 3617 VOPNAME_FSYNC, zfs_fsync, 3618 VOPNAME_INACTIVE, (fs_generic_func_p)zfs_inactive, 3619 VOPNAME_FID, zfs_fid, 3620 VOPNAME_SEEK, zfs_seek, 3621 VOPNAME_FRLOCK, zfs_frlock, 3622 VOPNAME_SPACE, zfs_space, 3623 VOPNAME_GETPAGE, zfs_getpage, 3624 VOPNAME_PUTPAGE, zfs_putpage, 3625 VOPNAME_MAP, (fs_generic_func_p) zfs_map, 3626 VOPNAME_ADDMAP, (fs_generic_func_p) zfs_addmap, 3627 VOPNAME_DELMAP, zfs_delmap, 3628 VOPNAME_PATHCONF, zfs_pathconf, 3629 VOPNAME_GETSECATTR, zfs_getsecattr, 3630 VOPNAME_SETSECATTR, zfs_setsecattr, 3631 VOPNAME_VNEVENT, fs_vnevent_support, 3632 NULL, NULL 3633 }; 3634 3635 /* 3636 * Symbolic link vnode operations template 
3637 */ 3638 vnodeops_t *zfs_symvnodeops; 3639 const fs_operation_def_t zfs_symvnodeops_template[] = { 3640 VOPNAME_GETATTR, zfs_getattr, 3641 VOPNAME_SETATTR, zfs_setattr, 3642 VOPNAME_ACCESS, zfs_access, 3643 VOPNAME_RENAME, zfs_rename, 3644 VOPNAME_READLINK, zfs_readlink, 3645 VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, 3646 VOPNAME_FID, zfs_fid, 3647 VOPNAME_PATHCONF, zfs_pathconf, 3648 VOPNAME_VNEVENT, fs_vnevent_support, 3649 NULL, NULL 3650 }; 3651 3652 /* 3653 * Extended attribute directory vnode operations template 3654 * This template is identical to the directory vnodes 3655 * operation template except for restricted operations: 3656 * VOP_MKDIR() 3657 * VOP_SYMLINK() 3658 * Note that there are other restrictions embedded in: 3659 * zfs_create() - restrict type to VREG 3660 * zfs_link() - no links into/out of attribute space 3661 * zfs_rename() - no moves into/out of attribute space 3662 */ 3663 vnodeops_t *zfs_xdvnodeops; 3664 const fs_operation_def_t zfs_xdvnodeops_template[] = { 3665 VOPNAME_OPEN, zfs_open, 3666 VOPNAME_CLOSE, zfs_close, 3667 VOPNAME_IOCTL, zfs_ioctl, 3668 VOPNAME_GETATTR, zfs_getattr, 3669 VOPNAME_SETATTR, zfs_setattr, 3670 VOPNAME_ACCESS, zfs_access, 3671 VOPNAME_LOOKUP, zfs_lookup, 3672 VOPNAME_CREATE, zfs_create, 3673 VOPNAME_REMOVE, zfs_remove, 3674 VOPNAME_LINK, zfs_link, 3675 VOPNAME_RENAME, zfs_rename, 3676 VOPNAME_MKDIR, zfs_inval, 3677 VOPNAME_RMDIR, zfs_rmdir, 3678 VOPNAME_READDIR, zfs_readdir, 3679 VOPNAME_SYMLINK, zfs_inval, 3680 VOPNAME_FSYNC, zfs_fsync, 3681 VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, 3682 VOPNAME_FID, zfs_fid, 3683 VOPNAME_SEEK, zfs_seek, 3684 VOPNAME_PATHCONF, zfs_pathconf, 3685 VOPNAME_GETSECATTR, zfs_getsecattr, 3686 VOPNAME_SETSECATTR, zfs_setsecattr, 3687 VOPNAME_VNEVENT, fs_vnevent_support, 3688 NULL, NULL 3689 }; 3690 3691 /* 3692 * Error vnode operations template 3693 */ 3694 vnodeops_t *zfs_evnodeops; 3695 const fs_operation_def_t zfs_evnodeops_template[] = { 3696 VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive, 3697 VOPNAME_PATHCONF, zfs_pathconf, 3698 NULL, NULL 3699 }; 3700
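/*
 * A note on how the templates above are consumed (a sketch, not part
 * of this file): at module initialization each template is turned into
 * a real operations vector, roughly
 *
 *	(void) vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
 *	    &zfs_dvnodeops);
 *
 * and each new znode's vnode is then pointed at zfs_dvnodeops,
 * zfs_fvnodeops, zfs_symvnodeops, etc. according to its v_type.
 */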