1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 28 * All rights reserved. 
29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/cred.h> 35 #include <sys/buf.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/uio.h> 39 #include <sys/stat.h> 40 #include <sys/errno.h> 41 #include <sys/sysmacros.h> 42 #include <sys/statvfs.h> 43 #include <sys/kmem.h> 44 #include <sys/kstat.h> 45 #include <sys/dirent.h> 46 #include <sys/cmn_err.h> 47 #include <sys/debug.h> 48 #include <sys/vtrace.h> 49 #include <sys/mode.h> 50 #include <sys/acl.h> 51 #include <sys/nbmlock.h> 52 #include <sys/policy.h> 53 #include <sys/sdt.h> 54 55 #include <rpc/types.h> 56 #include <rpc/auth.h> 57 #include <rpc/svc.h> 58 59 #include <nfs/nfs.h> 60 #include <nfs/export.h> 61 62 #include <vm/hat.h> 63 #include <vm/as.h> 64 #include <vm/seg.h> 65 #include <vm/seg_map.h> 66 #include <vm/seg_kmem.h> 67 68 #include <sys/strsubr.h> 69 70 /* 71 * These are the interface routines for the server side of the 72 * Network File System. See the NFS version 2 protocol specification 73 * for a description of this interface. 74 */ 75 76 static int sattr_to_vattr(struct nfssattr *, struct vattr *); 77 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *, 78 cred_t *); 79 80 /* 81 * Some "over the wire" UNIX file types. These are encoded 82 * into the mode. This needs to be fixed in the next rev. 83 */ 84 #define IFMT 0170000 /* type of file */ 85 #define IFCHR 0020000 /* character special */ 86 #define IFBLK 0060000 /* block special */ 87 #define IFSOCK 0140000 /* socket */ 88 89 u_longlong_t nfs2_srv_caller_id; 90 91 /* 92 * Get file attributes. 93 * Returns the current attributes of the file with the given fhandle. 
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle to a held vnode; stale handle -> error */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.  rfs4_delegated_getattr() is used (rather than
	 * VOP_GETATTR directly) so that attributes reflect any outstanding
	 * NFSv4 delegation on the file.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}

/*
 * Dispatch helper: return the file handle embedded in a GETATTR request.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;	/* 1 while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on read-only exports or read-only vnodes. */
	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Identify this caller to VOP monitors; don't block on delegations. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the size change is
			 * [min(old, new), |old - new|); check it for a
			 * conflicting nbmand lock.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE shortcut (the kludge
		 * above); everyone else falls through to VOP_SETATTR with
		 * AT_SIZE still set, which performs normal access checks.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}

/*
 * Dispatch helper: return the file handle embedded in a SETATTR request.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	/*
	 * Trusted Extension doesn't support NFSv2.  MOUNT
	 * will reject v2 clients.  Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/*
	 * Do not allow lookup beyond the root of the export.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}

/*
 * Dispatch helper: return the file handle embedded in a LOOKUP request.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
463 */ 464 /* ARGSUSED */ 465 void 466 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi, 467 struct svc_req *req, cred_t *cr) 468 { 469 int error; 470 struct iovec iov; 471 struct uio uio; 472 vnode_t *vp; 473 struct vattr va; 474 475 vp = nfs_fhtovp(fhp, exi); 476 if (vp == NULL) { 477 rl->rl_data = NULL; 478 rl->rl_status = NFSERR_STALE; 479 return; 480 } 481 482 va.va_mask = AT_MODE; 483 484 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 485 486 if (error) { 487 VN_RELE(vp); 488 rl->rl_data = NULL; 489 rl->rl_status = puterrno(error); 490 return; 491 } 492 493 if (MANDLOCK(vp, va.va_mode)) { 494 VN_RELE(vp); 495 rl->rl_data = NULL; 496 rl->rl_status = NFSERR_ACCES; 497 return; 498 } 499 500 /* 501 * XNFS and RFC1094 require us to return ENXIO if argument 502 * is not a link. BUGID 1138002. 503 */ 504 if (vp->v_type != VLNK) { 505 VN_RELE(vp); 506 rl->rl_data = NULL; 507 rl->rl_status = NFSERR_NXIO; 508 return; 509 } 510 511 /* 512 * Allocate data for pathname. This will be freed by rfs_rlfree. 513 */ 514 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP); 515 516 /* 517 * Set up io vector to read sym link data 518 */ 519 iov.iov_base = rl->rl_data; 520 iov.iov_len = NFS_MAXPATHLEN; 521 uio.uio_iov = &iov; 522 uio.uio_iovcnt = 1; 523 uio.uio_segflg = UIO_SYSSPACE; 524 uio.uio_extflg = UIO_COPY_CACHED; 525 uio.uio_loffset = (offset_t)0; 526 uio.uio_resid = NFS_MAXPATHLEN; 527 528 /* 529 * Do the readlink. 530 */ 531 error = VOP_READLINK(vp, &uio, cr, NULL); 532 533 VN_RELE(vp); 534 535 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid); 536 537 /* 538 * XNFS and RFC1094 require us to return ENXIO if argument 539 * is not a link. UFS returns EINVAL if this is the case, 540 * so we do the mapping here. BUGID 1138002. 
541 */ 542 if (error == EINVAL) 543 rl->rl_status = NFSERR_NXIO; 544 else 545 rl->rl_status = puterrno(error); 546 547 } 548 void * 549 rfs_readlink_getfh(fhandle_t *fhp) 550 { 551 return (fhp); 552 } 553 /* 554 * Free data allocated by rfs_readlink 555 */ 556 void 557 rfs_rlfree(struct nfsrdlnres *rl) 558 { 559 if (rl->rl_data != NULL) 560 kmem_free(rl->rl_data, NFS_MAXPATHLEN); 561 } 562 563 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *); 564 565 /* 566 * Read data. 567 * Returns some data read from the file at the given fhandle. 568 */ 569 /* ARGSUSED */ 570 void 571 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr, 572 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 573 { 574 vnode_t *vp; 575 int error; 576 struct vattr va; 577 struct iovec iov; 578 struct uio uio; 579 mblk_t *mp; 580 int alloc_err = 0; 581 int in_crit = 0; 582 caller_context_t ct; 583 584 vp = nfs_fhtovp(&ra->ra_fhandle, exi); 585 if (vp == NULL) { 586 rr->rr_data = NULL; 587 rr->rr_status = NFSERR_STALE; 588 return; 589 } 590 591 if (vp->v_type != VREG) { 592 VN_RELE(vp); 593 rr->rr_data = NULL; 594 rr->rr_status = NFSERR_ISDIR; 595 return; 596 } 597 598 ct.cc_sysid = 0; 599 ct.cc_pid = 0; 600 ct.cc_caller_id = nfs2_srv_caller_id; 601 ct.cc_flags = CC_DONTBLOCK; 602 603 /* 604 * Enter the critical region before calling VOP_RWLOCK 605 * to avoid a deadlock with write requests. 
606 */ 607 if (nbl_need_check(vp)) { 608 nbl_start_crit(vp, RW_READER); 609 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count, 610 0, NULL)) { 611 nbl_end_crit(vp); 612 VN_RELE(vp); 613 rr->rr_data = NULL; 614 rr->rr_status = NFSERR_ACCES; 615 return; 616 } 617 in_crit = 1; 618 } 619 620 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct); 621 622 /* check if a monitor detected a delegation conflict */ 623 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { 624 VN_RELE(vp); 625 /* mark as wouldblock so response is dropped */ 626 curthread->t_flag |= T_WOULDBLOCK; 627 628 rr->rr_data = NULL; 629 return; 630 } 631 632 va.va_mask = AT_ALL; 633 634 error = VOP_GETATTR(vp, &va, 0, cr, &ct); 635 636 if (error) { 637 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 638 if (in_crit) 639 nbl_end_crit(vp); 640 641 VN_RELE(vp); 642 rr->rr_data = NULL; 643 rr->rr_status = puterrno(error); 644 645 return; 646 } 647 648 /* 649 * This is a kludge to allow reading of files created 650 * with no read permission. The owner of the file 651 * is always allowed to read it. 652 */ 653 if (crgetuid(cr) != va.va_uid) { 654 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct); 655 656 if (error) { 657 /* 658 * Exec is the same as read over the net because 659 * of demand loading. 
660 */ 661 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct); 662 } 663 if (error) { 664 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 665 if (in_crit) 666 nbl_end_crit(vp); 667 VN_RELE(vp); 668 rr->rr_data = NULL; 669 rr->rr_status = puterrno(error); 670 671 return; 672 } 673 } 674 675 if (MANDLOCK(vp, va.va_mode)) { 676 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 677 if (in_crit) 678 nbl_end_crit(vp); 679 680 VN_RELE(vp); 681 rr->rr_data = NULL; 682 rr->rr_status = NFSERR_ACCES; 683 684 return; 685 } 686 687 rr->rr_ok.rrok_wlist_len = 0; 688 rr->rr_ok.rrok_wlist = NULL; 689 690 if ((u_offset_t)ra->ra_offset >= va.va_size) { 691 rr->rr_count = 0; 692 rr->rr_data = NULL; 693 /* 694 * In this case, status is NFS_OK, but there is no data 695 * to encode. So set rr_mp to NULL. 696 */ 697 rr->rr_mp = NULL; 698 goto done; 699 } 700 701 if (ra->ra_wlist) { 702 mp = NULL; 703 rr->rr_mp = NULL; 704 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist); 705 } else { 706 /* 707 * mp will contain the data to be sent out in the read reply. 708 * This will be freed after the reply has been sent out (by the 709 * driver). 710 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so 711 * that the call to xdrmblk_putmblk() never fails. 
712 */ 713 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG, 714 &alloc_err); 715 ASSERT(mp != NULL); 716 ASSERT(alloc_err == 0); 717 718 rr->rr_mp = mp; 719 720 /* 721 * Set up io vector 722 */ 723 iov.iov_base = (caddr_t)mp->b_datap->db_base; 724 iov.iov_len = ra->ra_count; 725 } 726 727 uio.uio_iov = &iov; 728 uio.uio_iovcnt = 1; 729 uio.uio_segflg = UIO_SYSSPACE; 730 uio.uio_extflg = UIO_COPY_CACHED; 731 uio.uio_loffset = (offset_t)ra->ra_offset; 732 uio.uio_resid = ra->ra_count; 733 734 error = VOP_READ(vp, &uio, 0, cr, &ct); 735 736 if (error) { 737 if (mp) 738 freeb(mp); 739 740 /* 741 * check if a monitor detected a delegation conflict and 742 * mark as wouldblock so response is dropped 743 */ 744 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) 745 curthread->t_flag |= T_WOULDBLOCK; 746 else 747 rr->rr_status = puterrno(error); 748 749 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 750 if (in_crit) 751 nbl_end_crit(vp); 752 753 VN_RELE(vp); 754 rr->rr_data = NULL; 755 756 return; 757 } 758 759 /* 760 * Get attributes again so we can send the latest access 761 * time to the client side for his cache. 
762 */ 763 va.va_mask = AT_ALL; 764 765 error = VOP_GETATTR(vp, &va, 0, cr, &ct); 766 767 if (error) { 768 if (mp) 769 freeb(mp); 770 771 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 772 if (in_crit) 773 nbl_end_crit(vp); 774 775 VN_RELE(vp); 776 rr->rr_data = NULL; 777 rr->rr_status = puterrno(error); 778 779 return; 780 } 781 782 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid); 783 784 if (mp) { 785 rr->rr_data = (char *)mp->b_datap->db_base; 786 } else { 787 if (ra->ra_wlist) { 788 rr->rr_data = (caddr_t)iov.iov_base; 789 if (!rdma_setup_read_data2(ra, rr)) { 790 rr->rr_data = NULL; 791 rr->rr_status = puterrno(NFSERR_INVAL); 792 } 793 } 794 } 795 done: 796 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct); 797 if (in_crit) 798 nbl_end_crit(vp); 799 800 acl_perm(vp, exi, &va, cr); 801 802 /* check for overflows */ 803 error = vattr_to_nattr(&va, &rr->rr_attr); 804 805 VN_RELE(vp); 806 807 rr->rr_status = puterrno(error); 808 } 809 810 /* 811 * Free data allocated by rfs_read 812 */ 813 void 814 rfs_rdfree(struct nfsrdresult *rr) 815 { 816 mblk_t *mp; 817 818 if (rr->rr_status == NFS_OK) { 819 mp = rr->rr_mp; 820 if (mp != NULL) 821 freeb(mp); 822 } 823 } 824 825 void * 826 rfs_read_getfh(struct nfsreadargs *ra) 827 { 828 return (&ra->ra_fhandle); 829 } 830 831 #define MAX_IOVECS 12 832 833 #ifdef DEBUG 834 static int rfs_write_sync_hits = 0; 835 static int rfs_write_sync_misses = 0; 836 #endif 837 838 /* 839 * Write data to file. 840 * Returns attributes of a file after writing some data to it. 841 * 842 * Any changes made here, especially in error handling might have 843 * to also be done in rfs_write (which clusters write requests). 
844 */ 845 void 846 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns, 847 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 848 { 849 int error; 850 vnode_t *vp; 851 rlim64_t rlimit; 852 struct vattr va; 853 struct uio uio; 854 struct iovec iov[MAX_IOVECS]; 855 mblk_t *m; 856 struct iovec *iovp; 857 int iovcnt; 858 cred_t *savecred; 859 int in_crit = 0; 860 caller_context_t ct; 861 862 vp = nfs_fhtovp(&wa->wa_fhandle, exi); 863 if (vp == NULL) { 864 ns->ns_status = NFSERR_STALE; 865 return; 866 } 867 868 if (rdonly(exi, req)) { 869 VN_RELE(vp); 870 ns->ns_status = NFSERR_ROFS; 871 return; 872 } 873 874 if (vp->v_type != VREG) { 875 VN_RELE(vp); 876 ns->ns_status = NFSERR_ISDIR; 877 return; 878 } 879 880 ct.cc_sysid = 0; 881 ct.cc_pid = 0; 882 ct.cc_caller_id = nfs2_srv_caller_id; 883 ct.cc_flags = CC_DONTBLOCK; 884 885 va.va_mask = AT_UID|AT_MODE; 886 887 error = VOP_GETATTR(vp, &va, 0, cr, &ct); 888 889 if (error) { 890 VN_RELE(vp); 891 ns->ns_status = puterrno(error); 892 893 return; 894 } 895 896 if (crgetuid(cr) != va.va_uid) { 897 /* 898 * This is a kludge to allow writes of files created 899 * with read only permission. The owner of the file 900 * is always allowed to write it. 901 */ 902 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct); 903 904 if (error) { 905 VN_RELE(vp); 906 ns->ns_status = puterrno(error); 907 return; 908 } 909 } 910 911 /* 912 * Can't access a mandatory lock file. This might cause 913 * the NFS service thread to block forever waiting for a 914 * lock to be released that will never be released. 915 */ 916 if (MANDLOCK(vp, va.va_mode)) { 917 VN_RELE(vp); 918 ns->ns_status = NFSERR_ACCES; 919 return; 920 } 921 922 /* 923 * We have to enter the critical region before calling VOP_RWLOCK 924 * to avoid a deadlock with ufs. 
925 */ 926 if (nbl_need_check(vp)) { 927 nbl_start_crit(vp, RW_READER); 928 in_crit = 1; 929 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset, 930 wa->wa_count, 0, NULL)) { 931 error = EACCES; 932 goto out; 933 } 934 } 935 936 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct); 937 938 /* check if a monitor detected a delegation conflict */ 939 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { 940 VN_RELE(vp); 941 /* mark as wouldblock so response is dropped */ 942 curthread->t_flag |= T_WOULDBLOCK; 943 return; 944 } 945 946 if (wa->wa_data || wa->wa_rlist) { 947 /* Do the RDMA thing if necessary */ 948 if (wa->wa_rlist) { 949 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3); 950 iov[0].iov_len = wa->wa_count; 951 } else { 952 iov[0].iov_base = wa->wa_data; 953 iov[0].iov_len = wa->wa_count; 954 } 955 uio.uio_iov = iov; 956 uio.uio_iovcnt = 1; 957 uio.uio_segflg = UIO_SYSSPACE; 958 uio.uio_extflg = UIO_COPY_DEFAULT; 959 uio.uio_loffset = (offset_t)wa->wa_offset; 960 uio.uio_resid = wa->wa_count; 961 /* 962 * The limit is checked on the client. We 963 * should allow any size writes here. 964 */ 965 uio.uio_llimit = curproc->p_fsz_ctl; 966 rlimit = uio.uio_llimit - wa->wa_offset; 967 if (rlimit < (rlim64_t)uio.uio_resid) 968 uio.uio_resid = (uint_t)rlimit; 969 970 /* 971 * for now we assume no append mode 972 */ 973 /* 974 * We're changing creds because VM may fault and we need 975 * the cred of the current thread to be used if quota 976 * checking is enabled. 
977 */ 978 savecred = curthread->t_cred; 979 curthread->t_cred = cr; 980 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct); 981 curthread->t_cred = savecred; 982 } else { 983 iovcnt = 0; 984 for (m = wa->wa_mblk; m != NULL; m = m->b_cont) 985 iovcnt++; 986 if (iovcnt <= MAX_IOVECS) { 987 #ifdef DEBUG 988 rfs_write_sync_hits++; 989 #endif 990 iovp = iov; 991 } else { 992 #ifdef DEBUG 993 rfs_write_sync_misses++; 994 #endif 995 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP); 996 } 997 mblk_to_iov(wa->wa_mblk, iovcnt, iovp); 998 uio.uio_iov = iovp; 999 uio.uio_iovcnt = iovcnt; 1000 uio.uio_segflg = UIO_SYSSPACE; 1001 uio.uio_extflg = UIO_COPY_DEFAULT; 1002 uio.uio_loffset = (offset_t)wa->wa_offset; 1003 uio.uio_resid = wa->wa_count; 1004 /* 1005 * The limit is checked on the client. We 1006 * should allow any size writes here. 1007 */ 1008 uio.uio_llimit = curproc->p_fsz_ctl; 1009 rlimit = uio.uio_llimit - wa->wa_offset; 1010 if (rlimit < (rlim64_t)uio.uio_resid) 1011 uio.uio_resid = (uint_t)rlimit; 1012 1013 /* 1014 * For now we assume no append mode. 1015 */ 1016 /* 1017 * We're changing creds because VM may fault and we need 1018 * the cred of the current thread to be used if quota 1019 * checking is enabled. 1020 */ 1021 savecred = curthread->t_cred; 1022 curthread->t_cred = cr; 1023 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct); 1024 curthread->t_cred = savecred; 1025 1026 if (iovp != iov) 1027 kmem_free(iovp, sizeof (*iovp) * iovcnt); 1028 } 1029 1030 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct); 1031 1032 if (!error) { 1033 /* 1034 * Get attributes again so we send the latest mod 1035 * time to the client side for his cache. 
1036 */ 1037 va.va_mask = AT_ALL; /* now we want everything */ 1038 1039 error = VOP_GETATTR(vp, &va, 0, cr, &ct); 1040 1041 /* check for overflows */ 1042 if (!error) { 1043 acl_perm(vp, exi, &va, cr); 1044 error = vattr_to_nattr(&va, &ns->ns_attr); 1045 } 1046 } 1047 1048 out: 1049 if (in_crit) 1050 nbl_end_crit(vp); 1051 VN_RELE(vp); 1052 1053 /* check if a monitor detected a delegation conflict */ 1054 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) 1055 /* mark as wouldblock so response is dropped */ 1056 curthread->t_flag |= T_WOULDBLOCK; 1057 else 1058 ns->ns_status = puterrno(error); 1059 1060 } 1061 1062 struct rfs_async_write { 1063 struct nfswriteargs *wa; 1064 struct nfsattrstat *ns; 1065 struct svc_req *req; 1066 cred_t *cr; 1067 kthread_t *thread; 1068 struct rfs_async_write *list; 1069 }; 1070 1071 struct rfs_async_write_list { 1072 fhandle_t *fhp; 1073 kcondvar_t cv; 1074 struct rfs_async_write *list; 1075 struct rfs_async_write_list *next; 1076 }; 1077 1078 static struct rfs_async_write_list *rfs_async_write_head = NULL; 1079 static kmutex_t rfs_async_write_lock; 1080 static int rfs_write_async = 1; /* enables write clustering if == 1 */ 1081 1082 #define MAXCLIOVECS 42 1083 #define RFSWRITE_INITVAL (enum nfsstat) -1 1084 1085 #ifdef DEBUG 1086 static int rfs_write_hits = 0; 1087 static int rfs_write_misses = 0; 1088 #endif 1089 1090 /* 1091 * Write data to file. 1092 * Returns attributes of a file after writing some data to it. 
1093 */ 1094 void 1095 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns, 1096 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 1097 { 1098 int error; 1099 vnode_t *vp; 1100 rlim64_t rlimit; 1101 struct vattr va; 1102 struct uio uio; 1103 struct rfs_async_write_list *lp; 1104 struct rfs_async_write_list *nlp; 1105 struct rfs_async_write *rp; 1106 struct rfs_async_write *nrp; 1107 struct rfs_async_write *trp; 1108 struct rfs_async_write *lrp; 1109 int data_written; 1110 int iovcnt; 1111 mblk_t *m; 1112 struct iovec *iovp; 1113 struct iovec *niovp; 1114 struct iovec iov[MAXCLIOVECS]; 1115 int count; 1116 int rcount; 1117 uint_t off; 1118 uint_t len; 1119 struct rfs_async_write nrpsp; 1120 struct rfs_async_write_list nlpsp; 1121 ushort_t t_flag; 1122 cred_t *savecred; 1123 int in_crit = 0; 1124 caller_context_t ct; 1125 1126 if (!rfs_write_async) { 1127 rfs_write_sync(wa, ns, exi, req, cr); 1128 return; 1129 } 1130 1131 /* 1132 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0 1133 * is considered an OK. 1134 */ 1135 ns->ns_status = RFSWRITE_INITVAL; 1136 1137 nrp = &nrpsp; 1138 nrp->wa = wa; 1139 nrp->ns = ns; 1140 nrp->req = req; 1141 nrp->cr = cr; 1142 nrp->thread = curthread; 1143 1144 ASSERT(curthread->t_schedflag & TS_DONT_SWAP); 1145 1146 /* 1147 * Look to see if there is already a cluster started 1148 * for this file. 1149 */ 1150 mutex_enter(&rfs_async_write_lock); 1151 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) { 1152 if (bcmp(&wa->wa_fhandle, lp->fhp, 1153 sizeof (fhandle_t)) == 0) 1154 break; 1155 } 1156 1157 /* 1158 * If lp is non-NULL, then there is already a cluster 1159 * started. We need to place ourselves in the cluster 1160 * list in the right place as determined by starting 1161 * offset. Conflicts with non-blocking mandatory locked 1162 * regions will be checked when the cluster is processed. 
1163 */ 1164 if (lp != NULL) { 1165 rp = lp->list; 1166 trp = NULL; 1167 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) { 1168 trp = rp; 1169 rp = rp->list; 1170 } 1171 nrp->list = rp; 1172 if (trp == NULL) 1173 lp->list = nrp; 1174 else 1175 trp->list = nrp; 1176 while (nrp->ns->ns_status == RFSWRITE_INITVAL) 1177 cv_wait(&lp->cv, &rfs_async_write_lock); 1178 mutex_exit(&rfs_async_write_lock); 1179 1180 return; 1181 } 1182 1183 /* 1184 * No cluster started yet, start one and add ourselves 1185 * to the list of clusters. 1186 */ 1187 nrp->list = NULL; 1188 1189 nlp = &nlpsp; 1190 nlp->fhp = &wa->wa_fhandle; 1191 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL); 1192 nlp->list = nrp; 1193 nlp->next = NULL; 1194 1195 if (rfs_async_write_head == NULL) { 1196 rfs_async_write_head = nlp; 1197 } else { 1198 lp = rfs_async_write_head; 1199 while (lp->next != NULL) 1200 lp = lp->next; 1201 lp->next = nlp; 1202 } 1203 mutex_exit(&rfs_async_write_lock); 1204 1205 /* 1206 * Convert the file handle common to all of the requests 1207 * in this cluster to a vnode. 1208 */ 1209 vp = nfs_fhtovp(&wa->wa_fhandle, exi); 1210 if (vp == NULL) { 1211 mutex_enter(&rfs_async_write_lock); 1212 if (rfs_async_write_head == nlp) 1213 rfs_async_write_head = nlp->next; 1214 else { 1215 lp = rfs_async_write_head; 1216 while (lp->next != nlp) 1217 lp = lp->next; 1218 lp->next = nlp->next; 1219 } 1220 t_flag = curthread->t_flag & T_WOULDBLOCK; 1221 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1222 rp->ns->ns_status = NFSERR_STALE; 1223 rp->thread->t_flag |= t_flag; 1224 } 1225 cv_broadcast(&nlp->cv); 1226 mutex_exit(&rfs_async_write_lock); 1227 1228 return; 1229 } 1230 1231 /* 1232 * Can only write regular files. Attempts to write any 1233 * other file types fail with EISDIR. 
1234 */ 1235 if (vp->v_type != VREG) { 1236 VN_RELE(vp); 1237 mutex_enter(&rfs_async_write_lock); 1238 if (rfs_async_write_head == nlp) 1239 rfs_async_write_head = nlp->next; 1240 else { 1241 lp = rfs_async_write_head; 1242 while (lp->next != nlp) 1243 lp = lp->next; 1244 lp->next = nlp->next; 1245 } 1246 t_flag = curthread->t_flag & T_WOULDBLOCK; 1247 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1248 rp->ns->ns_status = NFSERR_ISDIR; 1249 rp->thread->t_flag |= t_flag; 1250 } 1251 cv_broadcast(&nlp->cv); 1252 mutex_exit(&rfs_async_write_lock); 1253 1254 return; 1255 } 1256 1257 /* 1258 * Enter the critical region before calling VOP_RWLOCK, to avoid a 1259 * deadlock with ufs. 1260 */ 1261 if (nbl_need_check(vp)) { 1262 nbl_start_crit(vp, RW_READER); 1263 in_crit = 1; 1264 } 1265 1266 ct.cc_sysid = 0; 1267 ct.cc_pid = 0; 1268 ct.cc_caller_id = nfs2_srv_caller_id; 1269 ct.cc_flags = CC_DONTBLOCK; 1270 1271 /* 1272 * Lock the file for writing. This operation provides 1273 * the delay which allows clusters to grow. 1274 */ 1275 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct); 1276 1277 /* check if a monitor detected a delegation conflict */ 1278 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { 1279 VN_RELE(vp); 1280 /* mark as wouldblock so response is dropped */ 1281 curthread->t_flag |= T_WOULDBLOCK; 1282 mutex_enter(&rfs_async_write_lock); 1283 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1284 if (rp->ns->ns_status == RFSWRITE_INITVAL) { 1285 rp->ns->ns_status = puterrno(error); 1286 rp->thread->t_flag |= T_WOULDBLOCK; 1287 } 1288 } 1289 cv_broadcast(&nlp->cv); 1290 mutex_exit(&rfs_async_write_lock); 1291 1292 return; 1293 } 1294 1295 /* 1296 * Disconnect this cluster from the list of clusters. 1297 * The cluster that is being dealt with must be fixed 1298 * in size after this point, so there is no reason 1299 * to leave it on the list so that new requests can 1300 * find it. 
1301 * 1302 * The algorithm is that the first write request will 1303 * create a cluster, convert the file handle to a 1304 * vnode pointer, and then lock the file for writing. 1305 * This request is not likely to be clustered with 1306 * any others. However, the next request will create 1307 * a new cluster and be blocked in VOP_RWLOCK while 1308 * the first request is being processed. This delay 1309 * will allow more requests to be clustered in this 1310 * second cluster. 1311 */ 1312 mutex_enter(&rfs_async_write_lock); 1313 if (rfs_async_write_head == nlp) 1314 rfs_async_write_head = nlp->next; 1315 else { 1316 lp = rfs_async_write_head; 1317 while (lp->next != nlp) 1318 lp = lp->next; 1319 lp->next = nlp->next; 1320 } 1321 mutex_exit(&rfs_async_write_lock); 1322 1323 /* 1324 * Step through the list of requests in this cluster. 1325 * We need to check permissions to make sure that all 1326 * of the requests have sufficient permission to write 1327 * the file. A cluster can be composed of requests 1328 * from different clients and different users on each 1329 * client. 1330 * 1331 * As a side effect, we also calculate the size of the 1332 * byte range that this cluster encompasses. 1333 */ 1334 rp = nlp->list; 1335 off = rp->wa->wa_offset; 1336 len = (uint_t)0; 1337 do { 1338 if (rdonly(exi, rp->req)) { 1339 rp->ns->ns_status = NFSERR_ROFS; 1340 t_flag = curthread->t_flag & T_WOULDBLOCK; 1341 rp->thread->t_flag |= t_flag; 1342 continue; 1343 } 1344 1345 va.va_mask = AT_UID|AT_MODE; 1346 1347 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct); 1348 1349 if (!error) { 1350 if (crgetuid(rp->cr) != va.va_uid) { 1351 /* 1352 * This is a kludge to allow writes of files 1353 * created with read only permission. The 1354 * owner of the file is always allowed to 1355 * write it. 
1356 */ 1357 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct); 1358 } 1359 if (!error && MANDLOCK(vp, va.va_mode)) 1360 error = EACCES; 1361 } 1362 1363 /* 1364 * Check for a conflict with a nbmand-locked region. 1365 */ 1366 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset, 1367 rp->wa->wa_count, 0, NULL)) { 1368 error = EACCES; 1369 } 1370 1371 if (error) { 1372 rp->ns->ns_status = puterrno(error); 1373 t_flag = curthread->t_flag & T_WOULDBLOCK; 1374 rp->thread->t_flag |= t_flag; 1375 continue; 1376 } 1377 if (len < rp->wa->wa_offset + rp->wa->wa_count - off) 1378 len = rp->wa->wa_offset + rp->wa->wa_count - off; 1379 } while ((rp = rp->list) != NULL); 1380 1381 /* 1382 * Step through the cluster attempting to gather as many 1383 * requests which are contiguous as possible. These 1384 * contiguous requests are handled via one call to VOP_WRITE 1385 * instead of different calls to VOP_WRITE. We also keep 1386 * track of the fact that any data was written. 1387 */ 1388 rp = nlp->list; 1389 data_written = 0; 1390 do { 1391 /* 1392 * Skip any requests which are already marked as having an 1393 * error. 1394 */ 1395 if (rp->ns->ns_status != RFSWRITE_INITVAL) { 1396 rp = rp->list; 1397 continue; 1398 } 1399 1400 /* 1401 * Count the number of iovec's which are required 1402 * to handle this set of requests. One iovec is 1403 * needed for each data buffer, whether addressed 1404 * by wa_data or by the b_rptr pointers in the 1405 * mblk chains. 
1406 */ 1407 iovcnt = 0; 1408 lrp = rp; 1409 for (;;) { 1410 if (lrp->wa->wa_data || lrp->wa->wa_rlist) 1411 iovcnt++; 1412 else { 1413 m = lrp->wa->wa_mblk; 1414 while (m != NULL) { 1415 iovcnt++; 1416 m = m->b_cont; 1417 } 1418 } 1419 if (lrp->list == NULL || 1420 lrp->list->ns->ns_status != RFSWRITE_INITVAL || 1421 lrp->wa->wa_offset + lrp->wa->wa_count != 1422 lrp->list->wa->wa_offset) { 1423 lrp = lrp->list; 1424 break; 1425 } 1426 lrp = lrp->list; 1427 } 1428 1429 if (iovcnt <= MAXCLIOVECS) { 1430 #ifdef DEBUG 1431 rfs_write_hits++; 1432 #endif 1433 niovp = iov; 1434 } else { 1435 #ifdef DEBUG 1436 rfs_write_misses++; 1437 #endif 1438 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP); 1439 } 1440 /* 1441 * Put together the scatter/gather iovecs. 1442 */ 1443 iovp = niovp; 1444 trp = rp; 1445 count = 0; 1446 do { 1447 if (trp->wa->wa_data || trp->wa->wa_rlist) { 1448 if (trp->wa->wa_rlist) { 1449 iovp->iov_base = 1450 (char *)((trp->wa->wa_rlist)-> 1451 u.c_daddr3); 1452 iovp->iov_len = trp->wa->wa_count; 1453 } else { 1454 iovp->iov_base = trp->wa->wa_data; 1455 iovp->iov_len = trp->wa->wa_count; 1456 } 1457 iovp++; 1458 } else { 1459 m = trp->wa->wa_mblk; 1460 rcount = trp->wa->wa_count; 1461 while (m != NULL) { 1462 iovp->iov_base = (caddr_t)m->b_rptr; 1463 iovp->iov_len = (m->b_wptr - m->b_rptr); 1464 rcount -= iovp->iov_len; 1465 if (rcount < 0) 1466 iovp->iov_len += rcount; 1467 iovp++; 1468 if (rcount <= 0) 1469 break; 1470 m = m->b_cont; 1471 } 1472 } 1473 count += trp->wa->wa_count; 1474 trp = trp->list; 1475 } while (trp != lrp); 1476 1477 uio.uio_iov = niovp; 1478 uio.uio_iovcnt = iovcnt; 1479 uio.uio_segflg = UIO_SYSSPACE; 1480 uio.uio_extflg = UIO_COPY_DEFAULT; 1481 uio.uio_loffset = (offset_t)rp->wa->wa_offset; 1482 uio.uio_resid = count; 1483 /* 1484 * The limit is checked on the client. We 1485 * should allow any size writes here. 
1486 */ 1487 uio.uio_llimit = curproc->p_fsz_ctl; 1488 rlimit = uio.uio_llimit - rp->wa->wa_offset; 1489 if (rlimit < (rlim64_t)uio.uio_resid) 1490 uio.uio_resid = (uint_t)rlimit; 1491 1492 /* 1493 * For now we assume no append mode. 1494 */ 1495 1496 /* 1497 * We're changing creds because VM may fault 1498 * and we need the cred of the current 1499 * thread to be used if quota * checking is 1500 * enabled. 1501 */ 1502 savecred = curthread->t_cred; 1503 curthread->t_cred = cr; 1504 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct); 1505 curthread->t_cred = savecred; 1506 1507 /* check if a monitor detected a delegation conflict */ 1508 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) 1509 /* mark as wouldblock so response is dropped */ 1510 curthread->t_flag |= T_WOULDBLOCK; 1511 1512 if (niovp != iov) 1513 kmem_free(niovp, sizeof (*niovp) * iovcnt); 1514 1515 if (!error) { 1516 data_written = 1; 1517 /* 1518 * Get attributes again so we send the latest mod 1519 * time to the client side for his cache. 1520 */ 1521 va.va_mask = AT_ALL; /* now we want everything */ 1522 1523 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct); 1524 1525 if (!error) 1526 acl_perm(vp, exi, &va, rp->cr); 1527 } 1528 1529 /* 1530 * Fill in the status responses for each request 1531 * which was just handled. Also, copy the latest 1532 * attributes in to the attribute responses if 1533 * appropriate. 1534 */ 1535 t_flag = curthread->t_flag & T_WOULDBLOCK; 1536 do { 1537 rp->thread->t_flag |= t_flag; 1538 /* check for overflows */ 1539 if (!error) { 1540 error = vattr_to_nattr(&va, &rp->ns->ns_attr); 1541 } 1542 rp->ns->ns_status = puterrno(error); 1543 rp = rp->list; 1544 } while (rp != lrp); 1545 } while (rp != NULL); 1546 1547 /* 1548 * If any data was written at all, then we need to flush 1549 * the data and metadata to stable storage. 
1550 */ 1551 if (data_written) { 1552 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct); 1553 1554 if (!error) { 1555 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct); 1556 } 1557 } 1558 1559 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct); 1560 1561 if (in_crit) 1562 nbl_end_crit(vp); 1563 VN_RELE(vp); 1564 1565 t_flag = curthread->t_flag & T_WOULDBLOCK; 1566 mutex_enter(&rfs_async_write_lock); 1567 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1568 if (rp->ns->ns_status == RFSWRITE_INITVAL) { 1569 rp->ns->ns_status = puterrno(error); 1570 rp->thread->t_flag |= t_flag; 1571 } 1572 } 1573 cv_broadcast(&nlp->cv); 1574 mutex_exit(&rfs_async_write_lock); 1575 1576 } 1577 1578 void * 1579 rfs_write_getfh(struct nfswriteargs *wa) 1580 { 1581 return (&wa->wa_fhandle); 1582 } 1583 1584 /* 1585 * Create a file. 1586 * Creates a file with given attributes and returns those attributes 1587 * and an fhandle for the new file. 1588 */ 1589 void 1590 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr, 1591 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 1592 { 1593 int error; 1594 int lookuperr; 1595 int in_crit = 0; 1596 struct vattr va; 1597 vnode_t *vp; 1598 vnode_t *realvp; 1599 vnode_t *dvp; 1600 char *name = args->ca_da.da_name; 1601 vnode_t *tvp = NULL; 1602 int mode; 1603 int lookup_ok; 1604 bool_t trunc; 1605 1606 /* 1607 * Disallow NULL paths 1608 */ 1609 if (name == NULL || *name == '\0') { 1610 dr->dr_status = NFSERR_ACCES; 1611 return; 1612 } 1613 1614 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi); 1615 if (dvp == NULL) { 1616 dr->dr_status = NFSERR_STALE; 1617 return; 1618 } 1619 1620 error = sattr_to_vattr(args->ca_sa, &va); 1621 if (error) { 1622 dr->dr_status = puterrno(error); 1623 return; 1624 } 1625 1626 /* 1627 * Must specify the mode. 
1628 */ 1629 if (!(va.va_mask & AT_MODE)) { 1630 VN_RELE(dvp); 1631 dr->dr_status = NFSERR_INVAL; 1632 return; 1633 } 1634 1635 /* 1636 * This is a completely gross hack to make mknod 1637 * work over the wire until we can wack the protocol 1638 */ 1639 if ((va.va_mode & IFMT) == IFCHR) { 1640 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV) 1641 va.va_type = VFIFO; /* xtra kludge for named pipe */ 1642 else { 1643 va.va_type = VCHR; 1644 /* 1645 * uncompress the received dev_t 1646 * if the top half is zero indicating a request 1647 * from an `older style' OS. 1648 */ 1649 if ((va.va_size & 0xffff0000) == 0) 1650 va.va_rdev = nfsv2_expdev(va.va_size); 1651 else 1652 va.va_rdev = (dev_t)va.va_size; 1653 } 1654 va.va_mask &= ~AT_SIZE; 1655 } else if ((va.va_mode & IFMT) == IFBLK) { 1656 va.va_type = VBLK; 1657 /* 1658 * uncompress the received dev_t 1659 * if the top half is zero indicating a request 1660 * from an `older style' OS. 1661 */ 1662 if ((va.va_size & 0xffff0000) == 0) 1663 va.va_rdev = nfsv2_expdev(va.va_size); 1664 else 1665 va.va_rdev = (dev_t)va.va_size; 1666 va.va_mask &= ~AT_SIZE; 1667 } else if ((va.va_mode & IFMT) == IFSOCK) { 1668 va.va_type = VSOCK; 1669 } else 1670 va.va_type = VREG; 1671 va.va_mode &= ~IFMT; 1672 va.va_mask |= AT_TYPE; 1673 1674 /* 1675 * Why was the choice made to use VWRITE as the mode to the 1676 * call to VOP_CREATE ? This results in a bug. When a client 1677 * opens a file that already exists and is RDONLY, the second 1678 * open fails with an EACESS because of the mode. 1679 * bug ID 1054648. 1680 */ 1681 lookup_ok = 0; 1682 mode = VWRITE; 1683 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) { 1684 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr, 1685 NULL, NULL, NULL); 1686 if (!error) { 1687 struct vattr at; 1688 1689 lookup_ok = 1; 1690 at.va_mask = AT_MODE; 1691 error = VOP_GETATTR(tvp, &at, 0, cr, NULL); 1692 if (!error) 1693 mode = (at.va_mode & S_IWUSR) ? 
VWRITE : VREAD; 1694 VN_RELE(tvp); 1695 tvp = NULL; 1696 } 1697 } 1698 1699 if (!lookup_ok) { 1700 if (rdonly(exi, req)) { 1701 error = EROFS; 1702 } else if (va.va_type != VREG && va.va_type != VFIFO && 1703 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) { 1704 error = EPERM; 1705 } else { 1706 error = 0; 1707 } 1708 } 1709 1710 /* 1711 * If file size is being modified on an already existing file 1712 * make sure that there are no conflicting non-blocking mandatory 1713 * locks in the region being manipulated. Return EACCES if there 1714 * are conflicting locks. 1715 */ 1716 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) { 1717 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr, 1718 NULL, NULL, NULL); 1719 1720 if (!lookuperr && 1721 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) { 1722 VN_RELE(tvp); 1723 curthread->t_flag |= T_WOULDBLOCK; 1724 goto out; 1725 } 1726 1727 if (!lookuperr && nbl_need_check(tvp)) { 1728 /* 1729 * The file exists. Now check if it has any 1730 * conflicting non-blocking mandatory locks 1731 * in the region being changed. 1732 */ 1733 struct vattr bva; 1734 u_offset_t offset; 1735 ssize_t length; 1736 1737 nbl_start_crit(tvp, RW_READER); 1738 in_crit = 1; 1739 1740 bva.va_mask = AT_SIZE; 1741 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL); 1742 if (!error) { 1743 if (va.va_size < bva.va_size) { 1744 offset = va.va_size; 1745 length = bva.va_size - va.va_size; 1746 } else { 1747 offset = bva.va_size; 1748 length = va.va_size - bva.va_size; 1749 } 1750 if (length) { 1751 if (nbl_conflict(tvp, NBL_WRITE, 1752 offset, length, 0, NULL)) { 1753 error = EACCES; 1754 } 1755 } 1756 } 1757 if (error) { 1758 nbl_end_crit(tvp); 1759 VN_RELE(tvp); 1760 in_crit = 0; 1761 } 1762 } else if (tvp != NULL) { 1763 VN_RELE(tvp); 1764 } 1765 } 1766 1767 if (!error) { 1768 /* 1769 * If filesystem is shared with nosuid the remove any 1770 * setuid/setgid bits on create. 
1771 */ 1772 if (va.va_type == VREG && 1773 exi->exi_export.ex_flags & EX_NOSUID) 1774 va.va_mode &= ~(VSUID | VSGID); 1775 1776 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0, 1777 NULL, NULL); 1778 1779 if (!error) { 1780 1781 if ((va.va_mask & AT_SIZE) && (va.va_size == 0)) 1782 trunc = TRUE; 1783 else 1784 trunc = FALSE; 1785 1786 if (rfs4_check_delegated(FWRITE, vp, trunc)) { 1787 VN_RELE(vp); 1788 curthread->t_flag |= T_WOULDBLOCK; 1789 goto out; 1790 } 1791 va.va_mask = AT_ALL; 1792 1793 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 1794 1795 /* check for overflows */ 1796 if (!error) { 1797 acl_perm(vp, exi, &va, cr); 1798 error = vattr_to_nattr(&va, &dr->dr_attr); 1799 if (!error) { 1800 error = makefh(&dr->dr_fhandle, vp, 1801 exi); 1802 } 1803 } 1804 /* 1805 * Force modified metadata out to stable storage. 1806 * 1807 * if a underlying vp exists, pass it to VOP_FSYNC 1808 */ 1809 if (VOP_REALVP(vp, &realvp, NULL) == 0) 1810 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL); 1811 else 1812 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL); 1813 VN_RELE(vp); 1814 } 1815 1816 if (in_crit) { 1817 nbl_end_crit(tvp); 1818 VN_RELE(tvp); 1819 } 1820 } 1821 1822 /* 1823 * Force modified data and metadata out to stable storage. 1824 */ 1825 (void) VOP_FSYNC(dvp, 0, cr, NULL); 1826 1827 out: 1828 1829 VN_RELE(dvp); 1830 1831 dr->dr_status = puterrno(error); 1832 1833 } 1834 void * 1835 rfs_create_getfh(struct nfscreatargs *args) 1836 { 1837 return (args->ca_da.da_fhandle); 1838 } 1839 1840 /* 1841 * Remove a file. 1842 * Remove named file from parent directory. 
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;	/* 1 while we hold the nbmand crit region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	/* Translate the directory file handle; takes a hold on vp. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.  We must look the target up first to have a vnode
	 * to run the delegation and nbmand checks against.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Fail with EACCES if an nbmand lock conflicts with the remove. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}

/* Return the file handle a remove request dispatches on. */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * rename a file
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* destination directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* 1 while we hold the nbmand crit region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The destination must live in the same export as the source;
	 * cross-export renames are refused below with NFSERR_XDEV.
	 * NOTE(review): the cast takes the fid embedded in the file
	 * handle starting at fh_xlen — confirm against fhandle_t layout.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Fail with EACCES if an nbmand lock conflicts with the rename. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached vnode path current for the new name. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/* Return the file handle a rename request dispatches on (the source dir). */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}

/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;	/* existing file being linked to */
	vnode_t *tovp;		/* directory that receives the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The link target directory must live in the same export as the
	 * source file; otherwise refuse with NFSERR_XDEV below.
	 * NOTE(review): the cast takes the fid embedded in the file
	 * handle starting at fh_xlen — confirm against fhandle_t layout.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/* Return the file handle a link request dispatches on (the source file). */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}

/*
 * Symbolically link to a file.
 * Create a file (to) with the given attributes which is a symbolic link
 * to the given path name (to).
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;	/* directory in which the symlink is created */
	vnode_t *svp;	/* the new symlink itself, for the fsync below */
	int lerror;

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/* The client must supply a mode for the new symlink. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
	    NULL, 0);

	/*
	 * Force new data and metadata out to stable storage.
	 * We have to look the new link up to get a vnode to fsync;
	 * a lookup failure here does not fail the request.
	 */
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr, NULL, NULL, NULL);

	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr, NULL);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	*status = puterrno(error);

}
/* Return the file handle a symlink request dispatches on (the parent dir). */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply a mode for the new directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): 'vp' here is the PARENT directory,
			 * while 'va' holds the new directory's (dvp's)
			 * attributes just fetched above.  Confirm that
			 * acl_perm() is intended to consult the parent's
			 * ACL rather than the new directory's.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
/* Return the file handle a mkdir request dispatches on (the parent dir). */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;	/* the parent directory */


	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process). That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are. We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty. A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
/* Return the file handle an rmdir request dispatches on (the parent dir). */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read entries from a directory, returning at most rda_count bytes of
 * dirent data (capped at NFS_MAXDATA) starting at cookie rda_offset.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Take the vnode's rwlock as a reader for the duration of the read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries. This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof; an unchanged resid means nothing was
		 * read, which we report as EOF with zero bytes.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this. It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
/* Return the file handle a readdir request dispatches on. */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
/* Free the dirent buffer allocated by rfs_readdir. */
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}

/*
 * Return filesystem statistics (transfer size, block size, block counts)
 * for the filesystem containing the file handle's vnode.
 */
/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

}
/* Return the file handle a statfs request dispatches on. */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}

/*
 * Convert over-the-wire NFSv2 settable attributes (sattr) into a vattr,
 * setting va_mask bits only for fields the client actually supplied
 * (all-ones values mean "not set").  Returns 0 or EOVERFLOW if a time
 * value cannot be represented on a 32-bit kernel.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short. When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

/* Map vtype_t values to over-the-wire NFSv2 file types; 0 = no mapping. */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones. Return an error if there is an overflow.
2631 */ 2632 int 2633 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na) 2634 { 2635 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD); 2636 na->na_type = vt_to_nf[vap->va_type]; 2637 2638 if (vap->va_mode == (unsigned short) -1) 2639 na->na_mode = (uint32_t)-1; 2640 else 2641 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode; 2642 2643 if (vap->va_uid == (unsigned short)(-1)) 2644 na->na_uid = (uint32_t)(-1); 2645 else if (vap->va_uid == UID_NOBODY) 2646 na->na_uid = (uint32_t)NFS_UID_NOBODY; 2647 else 2648 na->na_uid = vap->va_uid; 2649 2650 if (vap->va_gid == (unsigned short)(-1)) 2651 na->na_gid = (uint32_t)-1; 2652 else if (vap->va_gid == GID_NOBODY) 2653 na->na_gid = (uint32_t)NFS_GID_NOBODY; 2654 else 2655 na->na_gid = vap->va_gid; 2656 2657 /* 2658 * Do we need to check fsid for overflow? It is 64-bit in the 2659 * vattr, but are bigger than 32 bit values supported? 2660 */ 2661 na->na_fsid = vap->va_fsid; 2662 2663 na->na_nodeid = vap->va_nodeid; 2664 2665 /* 2666 * Check to make sure that the nodeid is representable over the 2667 * wire without losing bits. 2668 */ 2669 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid) 2670 return (EFBIG); 2671 na->na_nlink = vap->va_nlink; 2672 2673 /* 2674 * Check for big files here, instead of at the caller. See 2675 * comments in cstat for large special file explanation. 2676 */ 2677 if (vap->va_size > (u_longlong_t)MAXOFF32_T) { 2678 if ((vap->va_type == VREG) || (vap->va_type == VDIR)) 2679 return (EFBIG); 2680 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) { 2681 /* UNKNOWN_SIZE | OVERFLOW */ 2682 na->na_size = MAXOFF32_T; 2683 } else 2684 na->na_size = vap->va_size; 2685 } else 2686 na->na_size = vap->va_size; 2687 2688 /* 2689 * If the vnode times overflow the 32-bit times that NFS2 2690 * uses on the wire then return an error. 
2691 */ 2692 if (!NFS_VAP_TIME_OK(vap)) { 2693 return (EOVERFLOW); 2694 } 2695 na->na_atime.tv_sec = vap->va_atime.tv_sec; 2696 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2697 2698 na->na_mtime.tv_sec = vap->va_mtime.tv_sec; 2699 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2700 2701 na->na_ctime.tv_sec = vap->va_ctime.tv_sec; 2702 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000; 2703 2704 /* 2705 * If the dev_t will fit into 16 bits then compress 2706 * it, otherwise leave it alone. See comments in 2707 * nfs_client.c. 2708 */ 2709 if (getminor(vap->va_rdev) <= SO4_MAXMIN && 2710 getmajor(vap->va_rdev) <= SO4_MAXMAJ) 2711 na->na_rdev = nfsv2_cmpdev(vap->va_rdev); 2712 else 2713 (void) cmpldev(&na->na_rdev, vap->va_rdev); 2714 2715 na->na_blocks = vap->va_nblocks; 2716 na->na_blocksize = vap->va_blksize; 2717 2718 /* 2719 * This bit of ugliness is a *TEMPORARY* hack to preserve the 2720 * over-the-wire protocols for named-pipe vnodes. It remaps the 2721 * VFIFO type to the special over-the-wire type. (see note in nfs.h) 2722 * 2723 * BUYER BEWARE: 2724 * If you are porting the NFS to a non-Sun server, you probably 2725 * don't want to include the following block of code. The 2726 * over-the-wire special file types will be changing with the 2727 * NFS Protocol Revision. 2728 */ 2729 if (vap->va_type == VFIFO) 2730 NA_SETFIFO(na); 2731 return (0); 2732 } 2733 2734 /* 2735 * acl v2 support: returns approximate permission. 2736 * default: returns minimal permission (more restrictive) 2737 * aclok: returns maximal permission (less restrictive) 2738 * This routine changes the permissions that are alaredy in *va. 2739 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES, 2740 * CLASS_OBJ is always the same as GROUP_OBJ entry. 
2741 */ 2742 static void 2743 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr) 2744 { 2745 vsecattr_t vsa; 2746 int aclcnt; 2747 aclent_t *aclentp; 2748 mode_t mask_perm; 2749 mode_t grp_perm; 2750 mode_t other_perm; 2751 mode_t other_orig; 2752 int error; 2753 2754 /* dont care default acl */ 2755 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT); 2756 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL); 2757 2758 if (!error) { 2759 aclcnt = vsa.vsa_aclcnt; 2760 if (aclcnt > MIN_ACL_ENTRIES) { 2761 /* non-trivial ACL */ 2762 aclentp = vsa.vsa_aclentp; 2763 if (exi->exi_export.ex_flags & EX_ACLOK) { 2764 /* maximal permissions */ 2765 grp_perm = 0; 2766 other_perm = 0; 2767 for (; aclcnt > 0; aclcnt--, aclentp++) { 2768 switch (aclentp->a_type) { 2769 case USER_OBJ: 2770 break; 2771 case USER: 2772 grp_perm |= 2773 aclentp->a_perm << 3; 2774 other_perm |= aclentp->a_perm; 2775 break; 2776 case GROUP_OBJ: 2777 grp_perm |= 2778 aclentp->a_perm << 3; 2779 break; 2780 case GROUP: 2781 other_perm |= aclentp->a_perm; 2782 break; 2783 case OTHER_OBJ: 2784 other_orig = aclentp->a_perm; 2785 break; 2786 case CLASS_OBJ: 2787 mask_perm = aclentp->a_perm; 2788 break; 2789 default: 2790 break; 2791 } 2792 } 2793 grp_perm &= mask_perm << 3; 2794 other_perm &= mask_perm; 2795 other_perm |= other_orig; 2796 2797 } else { 2798 /* minimal permissions */ 2799 grp_perm = 070; 2800 other_perm = 07; 2801 for (; aclcnt > 0; aclcnt--, aclentp++) { 2802 switch (aclentp->a_type) { 2803 case USER_OBJ: 2804 break; 2805 case USER: 2806 case CLASS_OBJ: 2807 grp_perm &= 2808 aclentp->a_perm << 3; 2809 other_perm &= 2810 aclentp->a_perm; 2811 break; 2812 case GROUP_OBJ: 2813 grp_perm &= 2814 aclentp->a_perm << 3; 2815 break; 2816 case GROUP: 2817 other_perm &= 2818 aclentp->a_perm; 2819 break; 2820 case OTHER_OBJ: 2821 other_perm &= 2822 aclentp->a_perm; 2823 break; 2824 default: 2825 break; 2826 } 2827 } 2828 } 2829 /* copy to va */ 2830 va->va_mode &= ~077; 2831 va->va_mode |= 
grp_perm | other_perm; 2832 } 2833 if (vsa.vsa_aclcnt) 2834 kmem_free(vsa.vsa_aclentp, 2835 vsa.vsa_aclcnt * sizeof (aclent_t)); 2836 } 2837 } 2838 2839 void 2840 rfs_srvrinit(void) 2841 { 2842 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL); 2843 nfs2_srv_caller_id = fs_new_caller_id(); 2844 } 2845 2846 void 2847 rfs_srvrfini(void) 2848 { 2849 mutex_destroy(&rfs_async_write_lock); 2850 } 2851 2852 static int 2853 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr) 2854 { 2855 struct clist *wcl; 2856 int data_len, avail_len, num; 2857 uint32_t count = rr->rr_count; 2858 2859 data_len = num = avail_len = 0; 2860 2861 wcl = ra->ra_wlist; 2862 while (wcl != NULL) { 2863 if (wcl->c_dmemhandle.mrc_rmr == 0) 2864 break; 2865 2866 avail_len += wcl->c_len; 2867 if (wcl->c_len < count) { 2868 data_len += wcl->c_len; 2869 } else { 2870 /* Can make the rest chunks all 0-len */ 2871 data_len += count; 2872 wcl->c_len = count; 2873 } 2874 count -= wcl->c_len; 2875 num ++; 2876 wcl = wcl->c_next; 2877 } 2878 2879 /* 2880 * MUST fail if there are still more data 2881 */ 2882 if (count > 0) { 2883 DTRACE_PROBE2(nfss__e__read__wlist__fail, 2884 int, data_len, int, count); 2885 return (FALSE); 2886 } 2887 2888 wcl = ra->ra_wlist; 2889 rr->rr_count = data_len; 2890 rr->rr_ok.rrok_wlist_len = data_len; 2891 rr->rr_ok.rrok_wlist = wcl; 2892 2893 return (TRUE); 2894 } 2895