/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
		    cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
		"rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
			"rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
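	 * rfs4_delegated_getattr() is used here instead of a bare
	 * VOP_GETATTR() so that the attributes returned take into
	 * account any NFSv4 write delegation outstanding on the file.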
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		"rfs_getattr_end:(%S)", "done");
}
fhandle_t *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
		"rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
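	 * (VOP_SPACE is called below with F_FREESP and l_len == 0,
	 * i.e. the region from the new size to end of file.)
	 *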
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also(2), check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
				"vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
				"vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
			"vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
			"vop_setattr_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		"rfs_setattr_end:(%S)", "done");
}
fhandle_t *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
		"rfs_lookup_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
				"rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Don't allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		"rfs_lookup_end:(%S)", "done");
}
fhandle_t *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
		"rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
		"vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
		"vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		"rfs_readlink_end:(%S)", "done");
}
fhandle_t *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
		"rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
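	 * Anyone other than the owner must still pass the
	 * VOP_ACCESS() checks below, where read or execute
	 * permission is sufficient.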
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				"vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				"vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
				"vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
				"vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
		"vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
		"vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		"rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

fhandle_t *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
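	 * Dropping the reply forces the client to retransmit the
	 * request; by then the delegation should have been recalled.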
	 */
	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_write_end:(%S)", "delegated");
		return;
	}

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for his cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
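 *
 * Requests are gathered into per-file clusters (see the structures
 * above) so that writes to contiguous byte ranges can be issued
 * with a single VOP_WRITE call.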
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
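	 * If this fails, every request queued on the cluster must
	 * be failed with NFSERR_STALE and its thread awakened.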
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_wrlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_wrlock_end");

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
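	 * The range (off, len) computed here is used for the
	 * VOP_PUTPAGE call once all of the requests have been
	 * processed.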
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
					"vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
					"vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
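		 * iov_len is clamped against wa_count so that no more
		 * than the advertised byte count of a request is
		 * gathered from its mblk chain.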
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "async");

		/*
		 * Check to see if the v4 side of the server has
		 * delegated this file.  If so, then we mark thread
		 * as wouldblock so the response is dropped.
		 */
		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
			curthread->t_flag |= T_WOULDBLOCK;
			error = EACCES; /* just to have an error */
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_write_end:(%S)", "delegated");
		} else {
			/*
			 * We're changing creds because VM may fault
			 * and we need the cred of the current
			 * thread to be used if quota checking is
			 * enabled.
			 */
			savecred = curthread->t_cred;
			curthread->t_cred = cr;
			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
			curthread->t_cred = savecred;
			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
				"vop_write_end:");
		}

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
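	 * NFS version 2 writes are required to be on stable storage
	 * before the reply is sent, which is why the written range
	 * is pushed with VOP_PUTPAGE and committed with VOP_FSYNC.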
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
			"vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
			"vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
				"vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
				"vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "async");
}

fhandle_t *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
		"rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can wack the protocol
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
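			 * The device number travels in the sa_size
			 * field because the version 2 sattr has no
			 * field for rdev.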
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If file size is being modified on an already existing file
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove any
		 * setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
			"vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
			"vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
				VN_RELE(tvp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		"rfs_create_end:(%S)", "done");
}
fhandle_t *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
		"rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
		"vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
		"vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		"rfs_remove_end:(%S)", "done");
}

fhandle_t *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * rename a file
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
		"rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "from stale");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
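	 * The source must be looked up first because the reservation
	 * checks below are made against the source vnode itself, not
	 * against the directory.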

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
	    "rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "not dir");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */
	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */
	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0,
	    NULL, cr) == 0) {
		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
	    "vop_rename_start:");
	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
	    "vop_rename_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
	    "rfs_rename_end:(%S)", "done");
}
fhandle_t *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
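
/*
 * Note that rfs_rename() above and rfs_link() below resolve the "to"
 * file handle by hand with checkexport() and insist that it belong to
 * the same export (to_exi == exi) before going any further.  NFS
 * version 2 cannot express an operation that spans exported
 * filesystems, so a mismatch becomes NFSERR_XDEV (or NFSERR_ACCES if
 * the fsid is not exported at all) instead of ever passing one
 * export's vnode to another filesystem.
 */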

/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
	    "rfs_link_start:");

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "from stale");
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "to stale");
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "not dir");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
	    "vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
	    "vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
	    "rfs_link_end:(%S)", "done");
}
fhandle_t *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
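
/*
 * The two fsyncs in rfs_link() above are deliberately different: the
 * target directory is synced with flag 0 because a new directory
 * entry was written to it, while the source vnode is synced with
 * FNODSYNC since only its metadata (the link count) changed and
 * there is no file data to force out.
 */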

/*
 * Symbolically link to a file.
 * Create a file (from) with the given attributes which is a symbolic
 * link to the given target path (tnm).
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;
	vnode_t *svp;
	int lerror;

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
	    "rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
	    "vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
	    "vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
	    "vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
	    "vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
	    "rfs_symlink_end:(%S)", "done");
}
fhandle_t *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
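
/*
 * rfs_symlink() above must look the new link up again before it can
 * sync it: this revision of VOP_SYMLINK() does not hand back the new
 * vnode, so the only way to reach it is a VOP_LOOKUP() of the name
 * just created.  The lookup result is kept in a separate error
 * variable (lerror) so that a failed lookup cannot clobber the status
 * of the symlink operation itself.
 */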

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;
	vnode_t *vp;
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
	    "rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
	    "vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
	    "vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		    "vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		    "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(dvp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
	    "rfs_mkdir_end:(%S)", "done");
}
fhandle_t *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
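
/*
 * As in rfs_symlink() and rfs_mkdir() above, the create-style
 * handlers all insist that AT_MODE be present in the converted
 * attributes.  An NFS version 2 sattr carries every field on every
 * call and flags the unused ones with (uint32_t)-1, so a missing mode
 * means the client sent the "don't set" sentinel where a real mode
 * was required; the server answers NFSERR_INVAL rather than guessing.
 */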

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
	    "rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
	    "vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
	    "vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map EEXIST to NFSERR_NOTEMPTY before sending it
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
	    "rfs_rmdir_end:(%S)", "done");
}
fhandle_t *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
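
/*
 * rfs_readdir() below fills a single kernel buffer with directory
 * entries and leaves the encoding to the XDR layer.  The client's
 * rda_count is clamped to NFS_MAXDATA, the buffer lives until
 * rfs_rddirfree() releases it, and the number of bytes actually
 * produced is recovered from the residual count:
 *
 *	rd_size = rda_count - uio_resid
 *
 * e.g. a 4096-byte request that leaves uio_resid at 1024 yields
 * rd_size = 3072.  EOF is reported when VOP_READDIR() says so, or
 * trivially when nothing could be read at all.
 */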

/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
	    "rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "notdir");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
	    "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
	    "vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
	    "vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
	    "vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
	    "vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
	    "vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
	    "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
	    "vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
	    "rfs_readdir_end:(%S)", "done");
}
fhandle_t *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}
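
/*
 * rfs_statfs() below reports sizes in units of the statvfs fragment
 * size (f_frsize), because f_blocks, f_bfree, and f_bavail are all
 * counted in fragments.  On a filesystem with 8K blocks and 1K
 * fragments, for example, fs_bsize comes back as 1024 and fs_blocks
 * counts 1K units.
 */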

/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
	    "rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		    "rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
	    "rfs_statfs_end:(%S)", "done");
}
fhandle_t *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
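
/*
 * A worked example of the broken-client check at the top of
 * sattr_to_vattr() below: a client that kept the mode in a short
 * transmits an "unset" mode as 0x0000ffff (the short -1 assigned to a
 * 32-bit field with no sign extension), while a correct client sends
 * 0xffffffff.  Comparing against both (uint32_t)((ushort_t)-1) and
 * (uint32_t)-1 accepts either form as "don't set the mode".
 */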

static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short)-1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)(-1);
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire, then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
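
/*
 * A worked example for acl_perm() below.  Take a non-trivial ACL:
 *
 *	user::rwx, user:ann:rwx, group::r-x, mask:r-x, other:---
 *
 * A client doing mode-bit permission checks will place ann in the
 * "other" class, so the mode bits reported over the wire have to
 * stand in for her ACL entry.  With EX_ACLOK set (maximal), the USER
 * and GROUP entries are OR-ed into the group and other bits and then
 * limited by the mask, giving group = r-x and other = r-x, so ann is
 * not spuriously denied.  Without EX_ACLOK (minimal), the entries are
 * AND-ed in instead, giving group = r-x and other = ---, which may
 * deny ann on the client but never grants more than the ACL does.
 */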

/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * the CLASS_OBJ entry is always the same as the GROUP_OBJ entry.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;
	mode_t grp_perm;
	mode_t other_perm;
	mode_t other_orig;
	int error;

	/* don't care about the default ACL */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}

void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}