1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 29 * All rights reserved. 
30 */ 31 32 #pragma ident "%Z%%M% %I% %E% SMI" 33 34 #include <sys/param.h> 35 #include <sys/types.h> 36 #include <sys/systm.h> 37 #include <sys/cred.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/uio.h> 42 #include <sys/stat.h> 43 #include <sys/errno.h> 44 #include <sys/sysmacros.h> 45 #include <sys/statvfs.h> 46 #include <sys/kmem.h> 47 #include <sys/kstat.h> 48 #include <sys/dirent.h> 49 #include <sys/cmn_err.h> 50 #include <sys/debug.h> 51 #include <sys/vtrace.h> 52 #include <sys/mode.h> 53 #include <sys/acl.h> 54 #include <sys/nbmlock.h> 55 #include <sys/policy.h> 56 57 #include <rpc/types.h> 58 #include <rpc/auth.h> 59 #include <rpc/svc.h> 60 61 #include <nfs/nfs.h> 62 #include <nfs/export.h> 63 64 #include <vm/hat.h> 65 #include <vm/as.h> 66 #include <vm/seg.h> 67 #include <vm/seg_map.h> 68 #include <vm/seg_kmem.h> 69 70 #include <sys/strsubr.h> 71 72 /* 73 * These are the interface routines for the server side of the 74 * Network File System. See the NFS version 2 protocol specification 75 * for a description of this interface. 76 */ 77 78 static int sattr_to_vattr(struct nfssattr *, struct vattr *); 79 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *, 80 cred_t *); 81 82 /* 83 * Some "over the wire" UNIX file types. These are encoded 84 * into the mode. This needs to be fixed in the next rev. 85 */ 86 #define IFMT 0170000 /* type of file */ 87 #define IFCHR 0020000 /* character special */ 88 #define IFBLK 0060000 /* block special */ 89 #define IFSOCK 0140000 /* socket */ 90 91 /* 92 * Get file attributes. 93 * Returns the current attributes of the file with the given fhandle. 
 */
/*
 * NFSPROC_GETATTR handler: translate the wire file handle to a vnode,
 * fetch its attributes and encode them into the reply.  `req' is unused.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr)
{
    int error;
    vnode_t *vp;
    struct vattr va;

    TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
        "rfs_getattr_start:");

    /* Map the file handle to a held vnode; NULL means a stale handle. */
    vp = nfs_fhtovp(fhp, exi);
    if (vp == NULL) {
        ns->ns_status = NFSERR_STALE;
        TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
            "rfs_getattr_end:(%S)", "stale");
        return;
    }

    /*
     * Do the getattr.  rfs4_delegated_getattr() is used rather than a
     * bare VOP_GETATTR() so the result reflects any NFSv4 delegation
     * the server may have granted on this file.
     */
    va.va_mask = AT_ALL;	/* we want all the attributes */
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
        "vop_getattr_start:");
    error = rfs4_delegated_getattr(vp, &va, 0, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
        "vop_getattr_end:");

    /* check for overflows */
    if (!error) {
        /*
         * acl_perm() adjusts the reported permission bits for this
         * export (defined elsewhere in this file); vattr_to_nattr()
         * converts to the over-the-wire v2 attribute format and
         * reports overflow of the 32-bit wire fields.
         */
        acl_perm(vp, exi, &va, cr);
        error = vattr_to_nattr(&va, &ns->ns_attr);
    }

    VN_RELE(vp);

    ns->ns_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
        "rfs_getattr_end:(%S)", "done");
}

/*
 * Return the file handle from the GETATTR arguments, for use by the
 * dispatcher (the argument is itself the handle).
 */
fhandle_t *
rfs_getattr_getfh(fhandle_t *fhp)
{
    return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
    int error;
    int flag;			/* flags passed through to VOP_SETATTR */
    int in_crit = 0;		/* 1 while inside the NBMAND critical region */
    vnode_t *vp;
    struct vattr va;		/* attributes requested by the client */
    struct vattr bva;		/* attributes of the file before the change */
    struct flock64 bf;

    TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
        "rfs_setattr_start:");

    vp = nfs_fhtovp(&args->saa_fh, exi);
    if (vp == NULL) {
        ns->ns_status = NFSERR_STALE;
        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
            "rfs_setattr_end:(%S)", "stale");
        return;
    }

    /* Reject writes to read-only exports or read-only filesystems. */
    if (rdonly(exi, req) || vn_is_readonly(vp)) {
        VN_RELE(vp);
        ns->ns_status = NFSERR_ROFS;
        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
            "rfs_setattr_end:(%S)", "rofs");
        return;
    }

    /* Convert the wire sattr into a vattr, setting va.va_mask. */
    error = sattr_to_vattr(&args->saa_sa, &va);
    if (error) {
        VN_RELE(vp);
        ns->ns_status = puterrno(error);
        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
            "rfs_setattr_end:(%S)", "sattr");
        return;
    }

    /*
     * If the client is requesting a change to the mtime,
     * but the nanosecond field is set to 1 billion, then
     * this is a flag to the server that it should set the
     * atime and mtime fields to the server's current time.
     * The 1 billion number actually came from the client
     * as 1 million, but the units in the over the wire
     * request are microseconds instead of nanoseconds.
     *
     * This is an overload of the protocol and should be
     * documented in the NFS Version 2 protocol specification.
     */
    if (va.va_mask & AT_MTIME) {
        if (va.va_mtime.tv_nsec == 1000000000) {
            gethrestime(&va.va_mtime);
            va.va_atime = va.va_mtime;
            va.va_mask |= AT_ATIME;
            flag = 0;
        } else
            flag = ATTR_UTIME;
    } else
        flag = 0;

    /*
     * If the filesystem is exported with nosuid, then mask off
     * the setuid and setgid bits.
     */
    if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
        (exi->exi_export.ex_flags & EX_NOSUID))
        va.va_mode &= ~(VSUID | VSGID);

    /*
     * We need to specially handle size changes because it is
     * possible for the client to create a file with modes
     * which indicate read-only, but with the file opened for
     * writing.  If the client then tries to set the size of
     * the file, then the normal access checking done in
     * VOP_SETATTR would prevent the client from doing so,
     * although it should be legal for it to do so.  To get
     * around this, we do the access checking for ourselves
     * and then use VOP_SPACE which doesn't do the access
     * checking which VOP_SETATTR does.  VOP_SPACE can only
     * operate on VREG files, let VOP_SETATTR handle the other
     * extremely rare cases.
     * Also the client should not be allowed to change the
     * size of the file if there is a conflicting non-blocking
     * mandatory lock in the region of change.
     *
     * Also(2), check to see if the v4 side of the server has
     * delegated this file.  If so, then we set T_WOULDBLOCK
     * so that the dispatch function doesn't send a reply, forcing
     * the client to retransmit its request.
     */
    if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
        /* If delegated, mark as wouldblock so response is dropped */
        if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
            VN_RELE(vp);
            curthread->t_flag |= T_WOULDBLOCK;
            TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                "rfs_setattr_end:(%S)", "delegated");
            return;
        }
        if (nbl_need_check(vp)) {
            nbl_start_crit(vp, RW_READER);
            in_crit = 1;
        }

        /* Fetch the current owner and size to validate the resize. */
        bva.va_mask = AT_UID | AT_SIZE;
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &bva, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");
        if (error) {
            if (in_crit)
                nbl_end_crit(vp);
            VN_RELE(vp);
            ns->ns_status = puterrno(error);
            TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                "rfs_setattr_end:(%S)", "getattr");
            return;
        }

        /*
         * The region affected by the resize is the span between the
         * old and new sizes; refuse if a non-blocking mandatory lock
         * conflicts with it.
         */
        if (in_crit) {
            u_offset_t offset;
            ssize_t length;

            if (va.va_size < bva.va_size) {
                offset = va.va_size;
                length = bva.va_size - va.va_size;
            } else {
                offset = bva.va_size;
                length = va.va_size - bva.va_size;
            }
            if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
                error = EACCES;
            }
        }

        /*
         * Only the owner takes the VOP_SPACE shortcut; for anyone
         * else AT_SIZE stays in va_mask and VOP_SETATTR below will
         * perform (and access-check) the truncation.
         */
        if (crgetuid(cr) == bva.va_uid && !error &&
            va.va_size != bva.va_size) {
            va.va_mask &= ~AT_SIZE;
            bf.l_type = F_WRLCK;
            bf.l_whence = 0;
            bf.l_start = (off64_t)va.va_size;
            bf.l_len = 0;
            bf.l_sysid = 0;
            bf.l_pid = 0;
            TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
                "vop_space_start:");
            error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
                (offset_t)va.va_size, cr, NULL);
            TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
                "vop_space_end:");
        }
        if (in_crit)
            nbl_end_crit(vp);
    } else
        error = 0;

    /*
     * Do the setattr.  Skipped when va_mask is empty (e.g. the only
     * request was a size change already handled via VOP_SPACE).
     */
    if (!error && va.va_mask) {
        TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
            "vop_setattr_start:");
        error = VOP_SETATTR(vp, &va, flag, cr, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
            "vop_setattr_end:");
    }

    if (!error) {
        va.va_mask = AT_ALL;	/* get everything */
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = rfs4_delegated_getattr(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");

        /* check for overflows */
        if (!error) {
            acl_perm(vp, exi, &va, cr);
            error = vattr_to_nattr(&va, &ns->ns_attr);
        }
    }

    /*
     * Force modified metadata out to stable storage.
     */
    (void) VOP_FSYNC(vp, FNODSYNC, cr);

    VN_RELE(vp);

    ns->ns_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
        "rfs_setattr_end:(%S)", "done");
}

/* Return the file handle embedded in the SETATTR arguments. */
fhandle_t *
rfs_setattr_getfh(struct nfssaargs *args)
{
    return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
    int error;
    vnode_t *dvp;		/* directory being searched */
    vnode_t *vp;		/* result of the lookup */
    struct vattr va;
    fhandle_t *fhp = da->da_fhandle;
    struct sec_ol sec = {0, 0};
    bool_t publicfh_flag = FALSE, auth_weak = FALSE;

    TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
        "rfs_lookup_start:");

    /*
     * Disallow NULL paths
     */
    if (da->da_name == NULL || *da->da_name == '\0') {
        dr->dr_status = NFSERR_ACCES;
        TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
            "rfs_lookup_end:(%S)", "access");
        return;
    }

    /*
     * Allow lookups from the root - the default
     * location of the public filehandle.
     */
    if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
        dvp = rootdir;
        VN_HOLD(dvp);
    } else {
        dvp = nfs_fhtovp(fhp, exi);
        if (dvp == NULL) {
            dr->dr_status = NFSERR_STALE;
            TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
                "rfs_lookup_end:(%S)", "stale");
            return;
        }
    }

    /*
     * Do not allow lookup beyond the root.
     * If the filehandle matches a filehandle of the exi,
     * then the ".." refers beyond the root of an exported filesystem.
     */
    if (strcmp(da->da_name, "..") == 0 &&
        EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
        VN_RELE(dvp);
        dr->dr_status = NFSERR_NOENT;
        TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
            "rfs_lookup_end:(%S)", "noent");
        return;
    }

    /*
     * If the public filehandle is used then allow
     * a multi-component lookup, i.e. evaluate
     * a pathname and follow symbolic links if
     * necessary.
     *
     * This may result in a vnode in another filesystem
     * which is OK as long as the filesystem is exported.
     */
    if (PUBLIC_FH2(fhp)) {
        publicfh_flag = TRUE;
        error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
            &sec);
    } else {
        /*
         * Do a normal single component lookup.
         */
        TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
            "vop_lookup_start:");
        error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
            "vop_lookup_end:");
    }

    if (!error) {
        va.va_mask = AT_ALL;	/* we want everything */
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = rfs4_delegated_getattr(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");
        /* check for overflows */
        if (!error) {
            acl_perm(vp, exi, &va, cr);
            error = vattr_to_nattr(&va, &dr->dr_attr);
            if (!error) {
                /*
                 * SEC_QUERY means the client asked for the
                 * export's security flavors (WebNFS overloaded
                 * lookup); build the special overloaded handle.
                 */
                if (sec.sec_flags & SEC_QUERY)
                    error = makefh_ol(&dr->dr_fhandle, exi,
                        sec.sec_index);
                else {
                    error = makefh(&dr->dr_fhandle, vp,
                        exi);
                    if (!error && publicfh_flag &&
                        !chk_clnt_sec(exi, req))
                        auth_weak = TRUE;
                }
            }
        }
        VN_RELE(vp);
    }

    VN_RELE(dvp);

    /*
     * If publicfh_flag is true then we have called rfs_publicfh_mclookup
     * and have obtained a new exportinfo in exi which needs to be
     * released.  Note that the original exportinfo pointed to by exi
     * will be released by the caller, common_dispatch.
     */
    if (publicfh_flag && exi != NULL)
        exi_rele(exi);

    /*
     * If it's public fh, no 0x81, and client's flavor is
     * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
     * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
     */
    if (auth_weak)
        dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
    else
        dr->dr_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
        "rfs_lookup_end:(%S)", "done");
}

/* Return the directory file handle embedded in the LOOKUP arguments. */
fhandle_t *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
    return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr)
{
    int error;
    struct iovec iov;
    struct uio uio;
    vnode_t *vp;
    struct vattr va;

    TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
        "rfs_readlink_start:");

    vp = nfs_fhtovp(fhp, exi);
    if (vp == NULL) {
        rl->rl_data = NULL;
        rl->rl_status = NFSERR_STALE;
        TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
            "rfs_readlink_end:(%S)", "stale");
        return;
    }

    /* Only the mode is needed, for the mandatory-locking check below. */
    va.va_mask = AT_MODE;
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
        "vop_getattr_start:");
    error = VOP_GETATTR(vp, &va, 0, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
        "vop_getattr_end:");

    if (error) {
        VN_RELE(vp);
        rl->rl_data = NULL;
        rl->rl_status = puterrno(error);
        TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
            "rfs_readlink_end:(%S)", "getattr error");
        return;
    }

    /*
     * Refuse objects subject to mandatory locking; a blocked lock
     * could stall this service thread indefinitely.
     */
    if (MANDLOCK(vp, va.va_mode)) {
        VN_RELE(vp);
        rl->rl_data = NULL;
        rl->rl_status = NFSERR_ACCES;
        TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
            "rfs_readlink_end:(%S)", "access");
        return;
    }

    /*
     * XNFS and RFC1094 require us to return ENXIO if argument
     * is not a link.  BUGID 1138002.
     */
    if (vp->v_type != VLNK) {
        VN_RELE(vp);
        rl->rl_data = NULL;
        rl->rl_status = NFSERR_NXIO;
        TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
            "rfs_readlink_end:(%S)", "nxio");
        return;
    }

    /*
     * Allocate data for pathname.  This will be freed by rfs_rlfree.
     */
    rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

    /*
     * Set up io vector to read sym link data
     */
    iov.iov_base = rl->rl_data;
    iov.iov_len = NFS_MAXPATHLEN;
    uio.uio_iov = &iov;
    uio.uio_iovcnt = 1;
    uio.uio_segflg = UIO_SYSSPACE;
    uio.uio_extflg = UIO_COPY_CACHED;
    uio.uio_loffset = (offset_t)0;
    uio.uio_resid = NFS_MAXPATHLEN;

    /*
     * Do the readlink.
     */
    TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
        "vop_readlink_start:");
    error = VOP_READLINK(vp, &uio, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
        "vop_readlink_end:");

#if 0 /* notyet */
    /*
     * Don't do this.  It causes local disk writes when just
     * reading the file and the overhead is deemed larger
     * than the benefit.
     */
    /*
     * Force modified metadata out to stable storage.
     */
    (void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

    VN_RELE(vp);

    /* Bytes actually read = buffer size minus what was left over. */
    rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

    /*
     * XNFS and RFC1094 require us to return ENXIO if argument
     * is not a link.  UFS returns EINVAL if this is the case,
     * so we do the mapping here.  BUGID 1138002.
     */
    if (error == EINVAL)
        rl->rl_status = NFSERR_NXIO;
    else
        rl->rl_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
        "rfs_readlink_end:(%S)", "done");
}

/* Return the file handle from the READLINK arguments. */
fhandle_t *
rfs_readlink_getfh(fhandle_t *fhp)
{
    return (fhp);
}

/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
    if (rl->rl_data != NULL)
        kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
    vnode_t *vp;
    int error;
    struct vattr va;
    struct iovec iov;
    struct uio uio;
    mblk_t *mp;			/* reply data buffer, freed by the driver */
    int alloc_err = 0;
    int in_crit = 0;		/* 1 while inside the NBMAND critical region */

    TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
        "rfs_read_start:");

    vp = nfs_fhtovp(&ra->ra_fhandle, exi);
    if (vp == NULL) {
        rr->rr_data = NULL;
        rr->rr_status = NFSERR_STALE;
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "stale");
        return;
    }

    if (vp->v_type != VREG) {
        VN_RELE(vp);
        rr->rr_data = NULL;
        rr->rr_status = NFSERR_ISDIR;
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "isdir");
        return;
    }

    /*
     * Check to see if the v4 side of the server has delegated
     * this file.  If so, then we mark thread as wouldblock so
     * the response is dropped.
     */
    if (rfs4_check_delegated(FREAD, vp, FALSE)) {
        VN_RELE(vp);
        curthread->t_flag |= T_WOULDBLOCK;
        rr->rr_data = NULL;
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "delegated");
        return;
    }

    /*
     * Enter the critical region before calling VOP_RWLOCK
     * to avoid a deadlock with write requests.
     */
    if (nbl_need_check(vp)) {
        nbl_start_crit(vp, RW_READER);
        if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
            0)) {
            nbl_end_crit(vp);
            VN_RELE(vp);
            rr->rr_data = NULL;
            rr->rr_status = NFSERR_ACCES;
            TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                "rfs_read_end:(%S)", " csf access error");
            return;
        }
        in_crit = 1;
    }

    TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
        "vop_rwlock_start:");
    (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
    TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
        "vop_rwlock_end:");

    va.va_mask = AT_ALL;
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
        "vop_getattr_start:");
    error = VOP_GETATTR(vp, &va, 0, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
        "vop_getattr_end:");

    if (error) {
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
        if (in_crit)
            nbl_end_crit(vp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");
        VN_RELE(vp);
        rr->rr_data = NULL;
        rr->rr_status = puterrno(error);
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "getattr error");
        return;
    }

    /*
     * This is a kludge to allow reading of files created
     * with no read permission.  The owner of the file
     * is always allowed to read it.
     */
    if (crgetuid(cr) != va.va_uid) {
        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
            "vop_access_start:");
        error = VOP_ACCESS(vp, VREAD, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
            "vop_access_end:");
        if (error) {
            /*
             * Exec is the same as read over the net because
             * of demand loading.
             */
            TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
                "vop_access_start:");
            error = VOP_ACCESS(vp, VEXEC, 0, cr);
            TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
                "vop_access_end:");
        }
        if (error) {
            TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                "vop_rwunlock_start:");
            VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
            if (in_crit)
                nbl_end_crit(vp);
            TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                "vop_rwunlock_end:");
            VN_RELE(vp);
            rr->rr_data = NULL;
            rr->rr_status = puterrno(error);
            TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                "rfs_read_end:(%S)", "access error");
            return;
        }
    }

    /* Refuse mandatory-locked files; see comment in rfs_readlink(). */
    if (MANDLOCK(vp, va.va_mode)) {
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
        if (in_crit)
            nbl_end_crit(vp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");
        VN_RELE(vp);
        rr->rr_data = NULL;
        rr->rr_status = NFSERR_ACCES;
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "mand lock");
        return;
    }

    /* Read starting at or past EOF returns success with zero bytes. */
    if ((u_offset_t)ra->ra_offset >= va.va_size) {
        rr->rr_count = 0;
        rr->rr_data = NULL;
        /*
         * In this case, status is NFS_OK, but there is no data
         * to encode.  So set rr_mp to NULL.
         */
        rr->rr_mp = NULL;
        goto done;
    }

    /*
     * mp will contain the data to be sent out in the read reply.
     * This will be freed after the reply has been sent out (by the
     * driver).
     * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
     * that the call to xdrmblk_putmblk() never fails.
     */
    mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
        &alloc_err);
    /* allocb_wait() sleeps rather than fail, so these always hold. */
    ASSERT(mp != NULL);
    ASSERT(alloc_err == 0);

    rr->rr_mp = mp;

    /*
     * Set up io vector
     */
    iov.iov_base = (caddr_t)mp->b_datap->db_base;
    iov.iov_len = ra->ra_count;
    uio.uio_iov = &iov;
    uio.uio_iovcnt = 1;
    uio.uio_segflg = UIO_SYSSPACE;
    uio.uio_extflg = UIO_COPY_CACHED;
    uio.uio_loffset = (offset_t)ra->ra_offset;
    uio.uio_resid = ra->ra_count;

    TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
        "vop_read_start:");
    error = VOP_READ(vp, &uio, 0, cr, NULL);
    TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
        "vop_read_end:");

    if (error) {
        freeb(mp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
        if (in_crit)
            nbl_end_crit(vp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");
        VN_RELE(vp);
        rr->rr_data = NULL;
        rr->rr_status = puterrno(error);
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "read error");
        return;
    }

    /*
     * Get attributes again so we can send the latest access
     * time to the client side for his cache.
     */
    va.va_mask = AT_ALL;
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
        "vop_getattr_start:");
    error = VOP_GETATTR(vp, &va, 0, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
        "vop_getattr_end:");
    if (error) {
        freeb(mp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
        if (in_crit)
            nbl_end_crit(vp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");
        VN_RELE(vp);
        rr->rr_data = NULL;
        rr->rr_status = puterrno(error);
        /*
         * NOTE(review): the trace label below says "read error"
         * although this is the post-read getattr failure path —
         * probably a copy-paste of the block above; confirm before
         * changing the trace string.
         */
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "read error");
        return;
    }

    rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

    rr->rr_data = (char *)mp->b_datap->db_base;

done:
    TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
        "vop_rwunlock_start:");
    VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
    if (in_crit)
        nbl_end_crit(vp);
    TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
        "vop_rwunlock_end:");

    acl_perm(vp, exi, &va, cr);

    /* check for overflows */
    error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
    /*
     * Don't do this.  It causes local disk writes when just
     * reading the file and the overhead is deemed larger
     * than the benefit.
     */
    /*
     * Force modified metadata out to stable storage.
     */
    (void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

    VN_RELE(vp);

    rr->rr_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
        "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
    mblk_t *mp;

    /* Only a successful read attached an mblk to the result. */
    if (rr->rr_status == NFS_OK) {
        mp = rr->rr_mp;
        if (mp != NULL)
            freeb(mp);
    }
}

/* Return the file handle embedded in the READ arguments. */
fhandle_t *
rfs_read_getfh(struct nfsreadargs *ra)
{
    return (&ra->ra_fhandle);
}

/* Max iovecs kept on the stack before falling back to kmem_alloc(). */
#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;	/* writes served from stack iovecs */
static int rfs_write_sync_misses = 0;	/* writes needing allocated iovecs */
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
    int error;
    vnode_t *vp;
    rlim64_t rlimit;
    struct vattr va;
    struct uio uio;
    struct iovec iov[MAX_IOVECS];
    mblk_t *m;
    struct iovec *iovp;
    int iovcnt;
    cred_t *savecred;
    int in_crit = 0;		/* 1 while inside the NBMAND critical region */

    TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
        "rfs_write_start:(%S)", "sync");

    vp = nfs_fhtovp(&wa->wa_fhandle, exi);
    if (vp == NULL) {
        ns->ns_status = NFSERR_STALE;
        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "stale");
        return;
    }

    if (rdonly(exi, req)) {
        VN_RELE(vp);
        ns->ns_status = NFSERR_ROFS;
        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "rofs");
        return;
    }

    if (vp->v_type != VREG) {
        VN_RELE(vp);
        ns->ns_status = NFSERR_ISDIR;
        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "isdir");
        return;
    }

    /*
     * Check to see if the v4 side of the server has delegated
     * this file.  If so, then we mark thread as wouldblock so
     * the response is dropped.
     */
    if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
        VN_RELE(vp);
        curthread->t_flag |= T_WOULDBLOCK;
        /*
         * NOTE(review): TR_RFS_READ_END is used here even though
         * this is a write path (the format string says write) —
         * looks like a copy-paste; confirm against the trace-point
         * definitions before changing.
         */
        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_write_end:(%S)", "delegated");
        return;
    }

    /* Owner and mode are needed for the access and MANDLOCK checks. */
    va.va_mask = AT_UID|AT_MODE;
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
        "vop_getattr_start:");
    error = VOP_GETATTR(vp, &va, 0, cr);
    TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
        "vop_getattr_end:");

    if (error) {
        VN_RELE(vp);
        ns->ns_status = puterrno(error);
        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "getattr error");
        return;
    }

    if (crgetuid(cr) != va.va_uid) {
        /*
         * This is a kludge to allow writes of files created
         * with read only permission.  The owner of the file
         * is always allowed to write it.
         */
        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
            "vop_access_start:");
        error = VOP_ACCESS(vp, VWRITE, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
            "vop_access_end:");
        if (error) {
            VN_RELE(vp);
            ns->ns_status = puterrno(error);
            TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                "rfs_write_end:(%S)", "access error");
            return;
        }
    }

    /*
     * Can't access a mandatory lock file.  This might cause
     * the NFS service thread to block forever waiting for a
     * lock to be released that will never be released.
     */
    if (MANDLOCK(vp, va.va_mode)) {
        VN_RELE(vp);
        ns->ns_status = NFSERR_ACCES;
        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "mand lock");
        return;
    }

    /*
     * We have to enter the critical region before calling VOP_RWLOCK
     * to avoid a deadlock with ufs.
     */
    if (nbl_need_check(vp)) {
        nbl_start_crit(vp, RW_READER);
        in_crit = 1;
        if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
            wa->wa_count, 0)) {
            error = EACCES;
            goto out;
        }
    }

    TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
        "vop_rwlock_start:");
    (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
    TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
        "vop_rwlock_end:");

    /*
     * wa_data non-NULL means the request data was already copied into
     * a contiguous buffer; otherwise it is still an mblk chain and we
     * build an iovec per mblk below.
     */
    if (wa->wa_data) {
        iov[0].iov_base = wa->wa_data;
        iov[0].iov_len = wa->wa_count;
        uio.uio_iov = iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_DEFAULT;
        uio.uio_loffset = (offset_t)wa->wa_offset;
        uio.uio_resid = wa->wa_count;
        /*
         * The limit is checked on the client.  We
         * should allow any size writes here.
         */
        uio.uio_llimit = curproc->p_fsz_ctl;
        rlimit = uio.uio_llimit - wa->wa_offset;
        if (rlimit < (rlim64_t)uio.uio_resid)
            uio.uio_resid = (uint_t)rlimit;

        /*
         * for now we assume no append mode
         */
        TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
            "vop_write_start:(%S)", "sync");
        /*
         * We're changing creds because VM may fault and we need
         * the cred of the current thread to be used if quota
         * checking is enabled.
         */
        savecred = curthread->t_cred;
        curthread->t_cred = cr;
        error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
        curthread->t_cred = savecred;
        TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
            "vop_write_end:");
    } else {
        iovcnt = 0;
        for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
            iovcnt++;
        if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
            rfs_write_sync_hits++;
#endif
            iovp = iov;
        } else {
#ifdef DEBUG
            rfs_write_sync_misses++;
#endif
            iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
        }
        mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
        uio.uio_iov = iovp;
        uio.uio_iovcnt = iovcnt;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_DEFAULT;
        uio.uio_loffset = (offset_t)wa->wa_offset;
        uio.uio_resid = wa->wa_count;
        /*
         * The limit is checked on the client.  We
         * should allow any size writes here.
         */
        uio.uio_llimit = curproc->p_fsz_ctl;
        rlimit = uio.uio_llimit - wa->wa_offset;
        if (rlimit < (rlim64_t)uio.uio_resid)
            uio.uio_resid = (uint_t)rlimit;

        /*
         * For now we assume no append mode.
         */
        TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
            "vop_write_start:(%S)", "iov sync");
        /*
         * We're changing creds because VM may fault and we need
         * the cred of the current thread to be used if quota
         * checking is enabled.
         */
        savecred = curthread->t_cred;
        curthread->t_cred = cr;
        error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
        curthread->t_cred = savecred;
        TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
            "vop_write_end:");

        if (iovp != iov)
            kmem_free(iovp, sizeof (*iovp) * iovcnt);
    }

    TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
        "vop_rwunlock_start:");
    VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
    TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
        "vop_rwunlock_end:");

    if (!error) {
        /*
         * Get attributes again so we send the latest mod
         * time to the client side for his cache.
         */
        va.va_mask = AT_ALL;	/* now we want everything */
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");
        /* check for overflows */
        if (!error) {
            acl_perm(vp, exi, &va, cr);
            error = vattr_to_nattr(&va, &ns->ns_attr);
        }
    }

out:
    if (in_crit)
        nbl_end_crit(vp);
    VN_RELE(vp);

    ns->ns_status = puterrno(error);

    TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
        "rfs_write_end:(%S)", "sync");
}

/*
 * One queued WRITE request in a per-file cluster; the issuing service
 * thread sleeps until the cluster leader fills in ns->ns_status.
 */
struct rfs_async_write {
    struct nfswriteargs *wa;
    struct nfsattrstat *ns;
    struct svc_req *req;
    cred_t *cr;
    kthread_t *thread;		/* thread that queued this request */
    struct rfs_async_write *list;	/* next request, ordered by offset */
};

/*
 * Head of one write cluster: all pending WRITEs to the file named by
 * fhp.  Waiters block on cv under rfs_async_write_lock.
 */
struct rfs_async_write_list {
    fhandle_t *fhp;
    kcondvar_t cv;
    struct rfs_async_write *list;
    struct rfs_async_write_list *next;	/* next cluster (other files) */
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs a clustered write builds on the stack. */
#define	MAXCLIOVECS	42
/* Sentinel meaning "not yet processed"; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* clustered writes that fit the stack */
static int rfs_write_misses = 0;	/* clustered writes needing allocation */
#endif
1218 1219 /* 1220 * Write data to file. 1221 * Returns attributes of a file after writing some data to it. 1222 */ 1223 void 1224 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns, 1225 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 1226 { 1227 int error; 1228 vnode_t *vp; 1229 rlim64_t rlimit; 1230 struct vattr va; 1231 struct uio uio; 1232 struct rfs_async_write_list *lp; 1233 struct rfs_async_write_list *nlp; 1234 struct rfs_async_write *rp; 1235 struct rfs_async_write *nrp; 1236 struct rfs_async_write *trp; 1237 struct rfs_async_write *lrp; 1238 int data_written; 1239 int iovcnt; 1240 mblk_t *m; 1241 struct iovec *iovp; 1242 struct iovec *niovp; 1243 struct iovec iov[MAXCLIOVECS]; 1244 int count; 1245 int rcount; 1246 uint_t off; 1247 uint_t len; 1248 struct rfs_async_write nrpsp; 1249 struct rfs_async_write_list nlpsp; 1250 ushort_t t_flag; 1251 cred_t *savecred; 1252 int in_crit = 0; 1253 1254 if (!rfs_write_async) { 1255 rfs_write_sync(wa, ns, exi, req, cr); 1256 return; 1257 } 1258 1259 TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, 1260 "rfs_write_start:(%S)", "async"); 1261 1262 /* 1263 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0 1264 * is considered an OK. 1265 */ 1266 ns->ns_status = RFSWRITE_INITVAL; 1267 1268 nrp = &nrpsp; 1269 nrp->wa = wa; 1270 nrp->ns = ns; 1271 nrp->req = req; 1272 nrp->cr = cr; 1273 nrp->thread = curthread; 1274 1275 ASSERT(curthread->t_schedflag & TS_DONT_SWAP); 1276 1277 /* 1278 * Look to see if there is already a cluster started 1279 * for this file. 1280 */ 1281 mutex_enter(&rfs_async_write_lock); 1282 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) { 1283 if (bcmp(&wa->wa_fhandle, lp->fhp, 1284 sizeof (fhandle_t)) == 0) 1285 break; 1286 } 1287 1288 /* 1289 * If lp is non-NULL, then there is already a cluster 1290 * started. We need to place ourselves in the cluster 1291 * list in the right place as determined by starting 1292 * offset. 
Conflicts with non-blocking mandatory locked 1293 * regions will be checked when the cluster is processed. 1294 */ 1295 if (lp != NULL) { 1296 rp = lp->list; 1297 trp = NULL; 1298 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) { 1299 trp = rp; 1300 rp = rp->list; 1301 } 1302 nrp->list = rp; 1303 if (trp == NULL) 1304 lp->list = nrp; 1305 else 1306 trp->list = nrp; 1307 while (nrp->ns->ns_status == RFSWRITE_INITVAL) 1308 cv_wait(&lp->cv, &rfs_async_write_lock); 1309 mutex_exit(&rfs_async_write_lock); 1310 TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, 1311 "rfs_write_end:(%S)", "cluster child"); 1312 return; 1313 } 1314 1315 /* 1316 * No cluster started yet, start one and add ourselves 1317 * to the list of clusters. 1318 */ 1319 nrp->list = NULL; 1320 1321 nlp = &nlpsp; 1322 nlp->fhp = &wa->wa_fhandle; 1323 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL); 1324 nlp->list = nrp; 1325 nlp->next = NULL; 1326 1327 if (rfs_async_write_head == NULL) { 1328 rfs_async_write_head = nlp; 1329 } else { 1330 lp = rfs_async_write_head; 1331 while (lp->next != NULL) 1332 lp = lp->next; 1333 lp->next = nlp; 1334 } 1335 mutex_exit(&rfs_async_write_lock); 1336 1337 /* 1338 * Convert the file handle common to all of the requests 1339 * in this cluster to a vnode. 1340 */ 1341 vp = nfs_fhtovp(&wa->wa_fhandle, exi); 1342 if (vp == NULL) { 1343 mutex_enter(&rfs_async_write_lock); 1344 if (rfs_async_write_head == nlp) 1345 rfs_async_write_head = nlp->next; 1346 else { 1347 lp = rfs_async_write_head; 1348 while (lp->next != nlp) 1349 lp = lp->next; 1350 lp->next = nlp->next; 1351 } 1352 t_flag = curthread->t_flag & T_WOULDBLOCK; 1353 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1354 rp->ns->ns_status = NFSERR_STALE; 1355 rp->thread->t_flag |= t_flag; 1356 } 1357 cv_broadcast(&nlp->cv); 1358 mutex_exit(&rfs_async_write_lock); 1359 TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, 1360 "rfs_write_end:(%S)", "stale"); 1361 return; 1362 } 1363 1364 /* 1365 * Can only write regular files. 
Attempts to write any 1366 * other file types fail with EISDIR. 1367 */ 1368 if (vp->v_type != VREG) { 1369 VN_RELE(vp); 1370 mutex_enter(&rfs_async_write_lock); 1371 if (rfs_async_write_head == nlp) 1372 rfs_async_write_head = nlp->next; 1373 else { 1374 lp = rfs_async_write_head; 1375 while (lp->next != nlp) 1376 lp = lp->next; 1377 lp->next = nlp->next; 1378 } 1379 t_flag = curthread->t_flag & T_WOULDBLOCK; 1380 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1381 rp->ns->ns_status = NFSERR_ISDIR; 1382 rp->thread->t_flag |= t_flag; 1383 } 1384 cv_broadcast(&nlp->cv); 1385 mutex_exit(&rfs_async_write_lock); 1386 TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, 1387 "rfs_write_end:(%S)", "isdir"); 1388 return; 1389 } 1390 1391 /* 1392 * Enter the critical region before calling VOP_RWLOCK, to avoid a 1393 * deadlock with ufs. 1394 */ 1395 if (nbl_need_check(vp)) { 1396 nbl_start_crit(vp, RW_READER); 1397 in_crit = 1; 1398 } 1399 1400 /* 1401 * Lock the file for writing. This operation provides 1402 * the delay which allows clusters to grow. 1403 */ 1404 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, 1405 "vop_wrlock_start:"); 1406 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1407 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, 1408 "vop_wrlock_end"); 1409 1410 /* 1411 * Disconnect this cluster from the list of clusters. 1412 * The cluster that is being dealt with must be fixed 1413 * in size after this point, so there is no reason 1414 * to leave it on the list so that new requests can 1415 * find it. 1416 * 1417 * The algorithm is that the first write request will 1418 * create a cluster, convert the file handle to a 1419 * vnode pointer, and then lock the file for writing. 1420 * This request is not likely to be clustered with 1421 * any others. However, the next request will create 1422 * a new cluster and be blocked in VOP_RWLOCK while 1423 * the first request is being processed. This delay 1424 * will allow more requests to be clustered in this 1425 * second cluster. 
1426 */ 1427 mutex_enter(&rfs_async_write_lock); 1428 if (rfs_async_write_head == nlp) 1429 rfs_async_write_head = nlp->next; 1430 else { 1431 lp = rfs_async_write_head; 1432 while (lp->next != nlp) 1433 lp = lp->next; 1434 lp->next = nlp->next; 1435 } 1436 mutex_exit(&rfs_async_write_lock); 1437 1438 /* 1439 * Step through the list of requests in this cluster. 1440 * We need to check permissions to make sure that all 1441 * of the requests have sufficient permission to write 1442 * the file. A cluster can be composed of requests 1443 * from different clients and different users on each 1444 * client. 1445 * 1446 * As a side effect, we also calculate the size of the 1447 * byte range that this cluster encompasses. 1448 */ 1449 rp = nlp->list; 1450 off = rp->wa->wa_offset; 1451 len = (uint_t)0; 1452 do { 1453 if (rdonly(exi, rp->req)) { 1454 rp->ns->ns_status = NFSERR_ROFS; 1455 t_flag = curthread->t_flag & T_WOULDBLOCK; 1456 rp->thread->t_flag |= t_flag; 1457 continue; 1458 } 1459 1460 va.va_mask = AT_UID|AT_MODE; 1461 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, 1462 "vop_getattr_start:"); 1463 error = VOP_GETATTR(vp, &va, 0, rp->cr); 1464 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, 1465 "vop_getattr_end:"); 1466 if (!error) { 1467 if (crgetuid(rp->cr) != va.va_uid) { 1468 /* 1469 * This is a kludge to allow writes of files 1470 * created with read only permission. The 1471 * owner of the file is always allowed to 1472 * write it. 1473 */ 1474 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, 1475 "vop_access_start:"); 1476 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr); 1477 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, 1478 "vop_access_end:"); 1479 } 1480 if (!error && MANDLOCK(vp, va.va_mode)) 1481 error = EACCES; 1482 } 1483 1484 /* 1485 * Check for a conflict with a nbmand-locked region. 
1486 */ 1487 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset, 1488 rp->wa->wa_count, 0)) { 1489 error = EACCES; 1490 } 1491 1492 if (error) { 1493 rp->ns->ns_status = puterrno(error); 1494 t_flag = curthread->t_flag & T_WOULDBLOCK; 1495 rp->thread->t_flag |= t_flag; 1496 continue; 1497 } 1498 if (len < rp->wa->wa_offset + rp->wa->wa_count - off) 1499 len = rp->wa->wa_offset + rp->wa->wa_count - off; 1500 } while ((rp = rp->list) != NULL); 1501 1502 /* 1503 * Step through the cluster attempting to gather as many 1504 * requests which are contiguous as possible. These 1505 * contiguous requests are handled via one call to VOP_WRITE 1506 * instead of different calls to VOP_WRITE. We also keep 1507 * track of the fact that any data was written. 1508 */ 1509 rp = nlp->list; 1510 data_written = 0; 1511 do { 1512 /* 1513 * Skip any requests which are already marked as having an 1514 * error. 1515 */ 1516 if (rp->ns->ns_status != RFSWRITE_INITVAL) { 1517 rp = rp->list; 1518 continue; 1519 } 1520 1521 /* 1522 * Count the number of iovec's which are required 1523 * to handle this set of requests. One iovec is 1524 * needed for each data buffer, whether addressed 1525 * by wa_data or by the b_rptr pointers in the 1526 * mblk chains. 
1527 */ 1528 iovcnt = 0; 1529 lrp = rp; 1530 for (;;) { 1531 if (lrp->wa->wa_data) 1532 iovcnt++; 1533 else { 1534 m = lrp->wa->wa_mblk; 1535 while (m != NULL) { 1536 iovcnt++; 1537 m = m->b_cont; 1538 } 1539 } 1540 if (lrp->list == NULL || 1541 lrp->list->ns->ns_status != RFSWRITE_INITVAL || 1542 lrp->wa->wa_offset + lrp->wa->wa_count != 1543 lrp->list->wa->wa_offset) { 1544 lrp = lrp->list; 1545 break; 1546 } 1547 lrp = lrp->list; 1548 } 1549 1550 if (iovcnt <= MAXCLIOVECS) { 1551 #ifdef DEBUG 1552 rfs_write_hits++; 1553 #endif 1554 niovp = iov; 1555 } else { 1556 #ifdef DEBUG 1557 rfs_write_misses++; 1558 #endif 1559 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP); 1560 } 1561 /* 1562 * Put together the scatter/gather iovecs. 1563 */ 1564 iovp = niovp; 1565 trp = rp; 1566 count = 0; 1567 do { 1568 if (trp->wa->wa_data) { 1569 iovp->iov_base = trp->wa->wa_data; 1570 iovp->iov_len = trp->wa->wa_count; 1571 iovp++; 1572 } else { 1573 m = trp->wa->wa_mblk; 1574 rcount = trp->wa->wa_count; 1575 while (m != NULL) { 1576 iovp->iov_base = (caddr_t)m->b_rptr; 1577 iovp->iov_len = (m->b_wptr - m->b_rptr); 1578 rcount -= iovp->iov_len; 1579 if (rcount < 0) 1580 iovp->iov_len += rcount; 1581 iovp++; 1582 if (rcount <= 0) 1583 break; 1584 m = m->b_cont; 1585 } 1586 } 1587 count += trp->wa->wa_count; 1588 trp = trp->list; 1589 } while (trp != lrp); 1590 1591 uio.uio_iov = niovp; 1592 uio.uio_iovcnt = iovcnt; 1593 uio.uio_segflg = UIO_SYSSPACE; 1594 uio.uio_extflg = UIO_COPY_DEFAULT; 1595 uio.uio_loffset = (offset_t)rp->wa->wa_offset; 1596 uio.uio_resid = count; 1597 /* 1598 * The limit is checked on the client. We 1599 * should allow any size writes here. 1600 */ 1601 uio.uio_llimit = curproc->p_fsz_ctl; 1602 rlimit = uio.uio_llimit - rp->wa->wa_offset; 1603 if (rlimit < (rlim64_t)uio.uio_resid) 1604 uio.uio_resid = (uint_t)rlimit; 1605 1606 /* 1607 * For now we assume no append mode. 
1608 */ 1609 TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START, 1610 "vop_write_start:(%S)", "async"); 1611 1612 /* 1613 * Check to see if the v4 side of the server has 1614 * delegated this file. If so, then we mark thread 1615 * as wouldblock so the response is dropped. 1616 */ 1617 if (rfs4_check_delegated(FWRITE, vp, FALSE)) { 1618 curthread->t_flag |= T_WOULDBLOCK; 1619 error = EACCES; /* just to have an error */ 1620 TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, 1621 "rfs_write_end:(%S)", "delegated"); 1622 } else { 1623 /* 1624 * We're changing creds because VM may fault 1625 * and we need the cred of the current 1626 * thread to be used if quota * checking is 1627 * enabled. 1628 */ 1629 savecred = curthread->t_cred; 1630 curthread->t_cred = cr; 1631 error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL); 1632 curthread->t_cred = savecred; 1633 TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, 1634 "vop_write_end:"); 1635 } 1636 1637 if (niovp != iov) 1638 kmem_free(niovp, sizeof (*niovp) * iovcnt); 1639 1640 if (!error) { 1641 data_written = 1; 1642 /* 1643 * Get attributes again so we send the latest mod 1644 * time to the client side for his cache. 1645 */ 1646 va.va_mask = AT_ALL; /* now we want everything */ 1647 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, 1648 "vop_getattr_start:"); 1649 error = VOP_GETATTR(vp, &va, 0, rp->cr); 1650 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, 1651 "vop_getattr_end:"); 1652 if (!error) 1653 acl_perm(vp, exi, &va, rp->cr); 1654 } 1655 1656 /* 1657 * Fill in the status responses for each request 1658 * which was just handled. Also, copy the latest 1659 * attributes in to the attribute responses if 1660 * appropriate. 
1661 */ 1662 t_flag = curthread->t_flag & T_WOULDBLOCK; 1663 do { 1664 rp->thread->t_flag |= t_flag; 1665 /* check for overflows */ 1666 if (!error) { 1667 error = vattr_to_nattr(&va, &rp->ns->ns_attr); 1668 } 1669 rp->ns->ns_status = puterrno(error); 1670 rp = rp->list; 1671 } while (rp != lrp); 1672 } while (rp != NULL); 1673 1674 /* 1675 * If any data was written at all, then we need to flush 1676 * the data and metadata to stable storage. 1677 */ 1678 if (data_written) { 1679 TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, 1680 "vop_putpage_start:"); 1681 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr); 1682 TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, 1683 "vop_putpage_end:"); 1684 if (!error) { 1685 TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START, 1686 "vop_fsync_start:"); 1687 error = VOP_FSYNC(vp, FNODSYNC, cr); 1688 TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, 1689 "vop_fsync_end:"); 1690 } 1691 } 1692 1693 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, 1694 "vop_rwunlock_start:"); 1695 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1696 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, 1697 "vop_rwunlock_end:"); 1698 1699 if (in_crit) 1700 nbl_end_crit(vp); 1701 VN_RELE(vp); 1702 1703 t_flag = curthread->t_flag & T_WOULDBLOCK; 1704 mutex_enter(&rfs_async_write_lock); 1705 for (rp = nlp->list; rp != NULL; rp = rp->list) { 1706 if (rp->ns->ns_status == RFSWRITE_INITVAL) { 1707 rp->ns->ns_status = puterrno(error); 1708 rp->thread->t_flag |= t_flag; 1709 } 1710 } 1711 cv_broadcast(&nlp->cv); 1712 mutex_exit(&rfs_async_write_lock); 1713 1714 TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, 1715 "rfs_write_end:(%S)", "async"); 1716 } 1717 1718 fhandle_t * 1719 rfs_write_getfh(struct nfswriteargs *wa) 1720 { 1721 return (&wa->wa_fhandle); 1722 } 1723 1724 /* 1725 * Create a file. 1726 * Creates a file with given attributes and returns those attributes 1727 * and an fhandle for the new file. 
1728 */ 1729 void 1730 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr, 1731 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 1732 { 1733 int error; 1734 int lookuperr; 1735 int in_crit = 0; 1736 struct vattr va; 1737 vnode_t *vp; 1738 vnode_t *dvp; 1739 char *name = args->ca_da.da_name; 1740 vnode_t *tvp = NULL; 1741 int mode; 1742 int lookup_ok; 1743 bool_t trunc; 1744 1745 TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, 1746 "rfs_create_start:"); 1747 1748 /* 1749 * Disallow NULL paths 1750 */ 1751 if (name == NULL || *name == '\0') { 1752 dr->dr_status = NFSERR_ACCES; 1753 TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, 1754 "rfs_create_end:(%S)", "access"); 1755 return; 1756 } 1757 1758 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi); 1759 if (dvp == NULL) { 1760 dr->dr_status = NFSERR_STALE; 1761 TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, 1762 "rfs_create_end:(%S)", "stale"); 1763 return; 1764 } 1765 1766 error = sattr_to_vattr(args->ca_sa, &va); 1767 if (error) { 1768 dr->dr_status = puterrno(error); 1769 TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, 1770 "rfs_create_end:(%S)", "sattr"); 1771 return; 1772 } 1773 1774 /* 1775 * Must specify the mode. 1776 */ 1777 if (!(va.va_mask & AT_MODE)) { 1778 VN_RELE(dvp); 1779 dr->dr_status = NFSERR_INVAL; 1780 TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, 1781 "rfs_create_end:(%S)", "no mode"); 1782 return; 1783 } 1784 1785 /* 1786 * This is a completely gross hack to make mknod 1787 * work over the wire until we can wack the protocol 1788 */ 1789 if ((va.va_mode & IFMT) == IFCHR) { 1790 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV) 1791 va.va_type = VFIFO; /* xtra kludge for named pipe */ 1792 else { 1793 va.va_type = VCHR; 1794 /* 1795 * uncompress the received dev_t 1796 * if the top half is zero indicating a request 1797 * from an `older style' OS. 
1798 */ 1799 if ((va.va_size & 0xffff0000) == 0) 1800 va.va_rdev = nfsv2_expdev(va.va_size); 1801 else 1802 va.va_rdev = (dev_t)va.va_size; 1803 } 1804 va.va_mask &= ~AT_SIZE; 1805 } else if ((va.va_mode & IFMT) == IFBLK) { 1806 va.va_type = VBLK; 1807 /* 1808 * uncompress the received dev_t 1809 * if the top half is zero indicating a request 1810 * from an `older style' OS. 1811 */ 1812 if ((va.va_size & 0xffff0000) == 0) 1813 va.va_rdev = nfsv2_expdev(va.va_size); 1814 else 1815 va.va_rdev = (dev_t)va.va_size; 1816 va.va_mask &= ~AT_SIZE; 1817 } else if ((va.va_mode & IFMT) == IFSOCK) { 1818 va.va_type = VSOCK; 1819 } else 1820 va.va_type = VREG; 1821 va.va_mode &= ~IFMT; 1822 va.va_mask |= AT_TYPE; 1823 1824 /* 1825 * Why was the choice made to use VWRITE as the mode to the 1826 * call to VOP_CREATE ? This results in a bug. When a client 1827 * opens a file that already exists and is RDONLY, the second 1828 * open fails with an EACESS because of the mode. 1829 * bug ID 1054648. 1830 */ 1831 lookup_ok = 0; 1832 mode = VWRITE; 1833 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) { 1834 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, 1835 "vop_lookup_start:"); 1836 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr); 1837 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, 1838 "vop_lookup_end:"); 1839 if (!error) { 1840 struct vattr at; 1841 1842 lookup_ok = 1; 1843 at.va_mask = AT_MODE; 1844 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, 1845 "vop_getattr_start:"); 1846 error = VOP_GETATTR(tvp, &at, 0, cr); 1847 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, 1848 "vop_getattr_end:"); 1849 if (!error) 1850 mode = (at.va_mode & S_IWUSR) ? 
VWRITE : VREAD; 1851 VN_RELE(tvp); 1852 tvp = NULL; 1853 } 1854 } 1855 1856 if (!lookup_ok) { 1857 if (rdonly(exi, req)) { 1858 error = EROFS; 1859 } else if (va.va_type != VREG && va.va_type != VFIFO && 1860 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) { 1861 error = EPERM; 1862 } else { 1863 error = 0; 1864 } 1865 } 1866 1867 /* 1868 * If file size is being modified on an already existing file 1869 * make sure that there are no conflicting non-blocking mandatory 1870 * locks in the region being manipulated. Return EACCES if there 1871 * are conflicting locks. 1872 */ 1873 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) { 1874 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr); 1875 1876 if (!lookuperr && 1877 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) { 1878 VN_RELE(tvp); 1879 curthread->t_flag |= T_WOULDBLOCK; 1880 goto out; 1881 } 1882 1883 if (!lookuperr && nbl_need_check(tvp)) { 1884 /* 1885 * The file exists. Now check if it has any 1886 * conflicting non-blocking mandatory locks 1887 * in the region being changed. 1888 */ 1889 struct vattr bva; 1890 u_offset_t offset; 1891 ssize_t length; 1892 1893 nbl_start_crit(tvp, RW_READER); 1894 in_crit = 1; 1895 1896 bva.va_mask = AT_SIZE; 1897 error = VOP_GETATTR(tvp, &bva, 0, cr); 1898 if (!error) { 1899 if (va.va_size < bva.va_size) { 1900 offset = va.va_size; 1901 length = bva.va_size - va.va_size; 1902 } else { 1903 offset = bva.va_size; 1904 length = va.va_size - bva.va_size; 1905 } 1906 if (length) { 1907 if (nbl_conflict(tvp, NBL_WRITE, 1908 offset, length, 0)) { 1909 error = EACCES; 1910 } 1911 } 1912 } 1913 if (error) { 1914 nbl_end_crit(tvp); 1915 VN_RELE(tvp); 1916 in_crit = 0; 1917 } 1918 } else if (tvp != NULL) { 1919 VN_RELE(tvp); 1920 } 1921 } 1922 1923 if (!error) { 1924 /* 1925 * If filesystem is shared with nosuid the remove any 1926 * setuid/setgid bits on create. 
1927 */ 1928 if (va.va_type == VREG && 1929 exi->exi_export.ex_flags & EX_NOSUID) 1930 va.va_mode &= ~(VSUID | VSGID); 1931 1932 TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, 1933 "vop_create_start:"); 1934 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0); 1935 TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, 1936 "vop_create_end:"); 1937 1938 if (!error) { 1939 1940 if ((va.va_mask & AT_SIZE) && (va.va_size == 0)) 1941 trunc = TRUE; 1942 else 1943 trunc = FALSE; 1944 1945 if (rfs4_check_delegated(FWRITE, tvp, trunc)) { 1946 VN_RELE(tvp); 1947 curthread->t_flag |= T_WOULDBLOCK; 1948 goto out; 1949 } 1950 va.va_mask = AT_ALL; 1951 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, 1952 "vop_getattr_start:"); 1953 error = VOP_GETATTR(vp, &va, 0, cr); 1954 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, 1955 "vop_getattr_end:"); 1956 /* check for overflows */ 1957 if (!error) { 1958 acl_perm(vp, exi, &va, cr); 1959 error = vattr_to_nattr(&va, &dr->dr_attr); 1960 if (!error) { 1961 error = makefh(&dr->dr_fhandle, vp, 1962 exi); 1963 } 1964 } 1965 /* 1966 * Force modified metadata out to stable storage. 1967 */ 1968 (void) VOP_FSYNC(vp, FNODSYNC, cr); 1969 VN_RELE(vp); 1970 } 1971 1972 if (in_crit) { 1973 nbl_end_crit(tvp); 1974 VN_RELE(tvp); 1975 } 1976 } 1977 1978 /* 1979 * Force modified data and metadata out to stable storage. 1980 */ 1981 (void) VOP_FSYNC(dvp, 0, cr); 1982 1983 out: 1984 1985 VN_RELE(dvp); 1986 1987 dr->dr_status = puterrno(error); 1988 1989 TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, 1990 "rfs_create_end:(%S)", "done"); 1991 } 1992 fhandle_t * 1993 rfs_create_getfh(struct nfscreatargs *args) 1994 { 1995 return (args->ca_da.da_fhandle); 1996 } 1997 1998 /* 1999 * Remove a file. 2000 * Remove named file from parent directory. 
2001 */ 2002 void 2003 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status, 2004 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2005 { 2006 int error = 0; 2007 vnode_t *vp; 2008 vnode_t *targvp; 2009 int in_crit = 0; 2010 2011 TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, 2012 "rfs_remove_start:"); 2013 2014 /* 2015 * Disallow NULL paths 2016 */ 2017 if (da->da_name == NULL || *da->da_name == '\0') { 2018 *status = NFSERR_ACCES; 2019 TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, 2020 "rfs_remove_end:(%S)", "access"); 2021 return; 2022 } 2023 2024 vp = nfs_fhtovp(da->da_fhandle, exi); 2025 if (vp == NULL) { 2026 *status = NFSERR_STALE; 2027 TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, 2028 "rfs_remove_end:(%S)", "stale"); 2029 return; 2030 } 2031 2032 if (rdonly(exi, req)) { 2033 VN_RELE(vp); 2034 *status = NFSERR_ROFS; 2035 TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, 2036 "rfs_remove_end:(%S)", "rofs"); 2037 return; 2038 } 2039 2040 /* 2041 * Check for a conflict with a non-blocking mandatory share reservation. 2042 */ 2043 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0, 2044 NULL, cr); 2045 if (error != 0) { 2046 VN_RELE(vp); 2047 *status = puterrno(error); 2048 return; 2049 } 2050 2051 /* 2052 * If the file is delegated to an v4 client, then initiate 2053 * recall and drop this request (by setting T_WOULDBLOCK). 2054 * The client will eventually re-transmit the request and 2055 * (hopefully), by then, the v4 client will have returned 2056 * the delegation. 
2057 */ 2058 2059 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { 2060 VN_RELE(vp); 2061 VN_RELE(targvp); 2062 curthread->t_flag |= T_WOULDBLOCK; 2063 return; 2064 } 2065 2066 if (nbl_need_check(targvp)) { 2067 nbl_start_crit(targvp, RW_READER); 2068 in_crit = 1; 2069 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) { 2070 error = EACCES; 2071 goto out; 2072 } 2073 } 2074 2075 TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, 2076 "vop_remove_start:"); 2077 error = VOP_REMOVE(vp, da->da_name, cr); 2078 TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, 2079 "vop_remove_end:"); 2080 2081 /* 2082 * Force modified data and metadata out to stable storage. 2083 */ 2084 (void) VOP_FSYNC(vp, 0, cr); 2085 2086 out: 2087 if (in_crit) 2088 nbl_end_crit(targvp); 2089 VN_RELE(targvp); 2090 VN_RELE(vp); 2091 2092 *status = puterrno(error); 2093 2094 TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, 2095 "rfs_remove_end:(%S)", "done"); 2096 } 2097 2098 fhandle_t * 2099 rfs_remove_getfh(struct nfsdiropargs *da) 2100 { 2101 return (da->da_fhandle); 2102 } 2103 2104 /* 2105 * rename a file 2106 * Give a file (from) a new name (to). 
2107 */ 2108 void 2109 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status, 2110 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2111 { 2112 int error = 0; 2113 vnode_t *fromvp; 2114 vnode_t *tovp; 2115 struct exportinfo *to_exi; 2116 fhandle_t *fh; 2117 vnode_t *srcvp; 2118 vnode_t *targvp; 2119 int in_crit = 0; 2120 2121 TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, 2122 "rfs_rename_start:"); 2123 2124 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi); 2125 if (fromvp == NULL) { 2126 *status = NFSERR_STALE; 2127 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2128 "rfs_rename_end:(%S)", "from stale"); 2129 return; 2130 } 2131 2132 fh = args->rna_to.da_fhandle; 2133 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen); 2134 if (to_exi == NULL) { 2135 VN_RELE(fromvp); 2136 *status = NFSERR_ACCES; 2137 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2138 "rfs_rename_end:(%S)", "cross device"); 2139 return; 2140 } 2141 exi_rele(to_exi); 2142 2143 if (to_exi != exi) { 2144 VN_RELE(fromvp); 2145 *status = NFSERR_XDEV; 2146 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2147 "rfs_rename_end:(%S)", "from stale"); 2148 return; 2149 } 2150 2151 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi); 2152 if (tovp == NULL) { 2153 VN_RELE(fromvp); 2154 *status = NFSERR_STALE; 2155 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2156 "rfs_rename_end:(%S)", "to stale"); 2157 return; 2158 } 2159 2160 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) { 2161 VN_RELE(tovp); 2162 VN_RELE(fromvp); 2163 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2164 "rfs_rename_end:(%S)", "not dir"); 2165 *status = NFSERR_NOTDIR; 2166 return; 2167 } 2168 2169 /* 2170 * Disallow NULL paths 2171 */ 2172 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' || 2173 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') { 2174 VN_RELE(tovp); 2175 VN_RELE(fromvp); 2176 *status = NFSERR_ACCES; 2177 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2178 "rfs_rename_end:(%S)", "access"); 2179 return; 2180 } 2181 2182 if 
(rdonly(exi, req)) { 2183 VN_RELE(tovp); 2184 VN_RELE(fromvp); 2185 *status = NFSERR_ROFS; 2186 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2187 "rfs_rename_end:(%S)", "rofs"); 2188 return; 2189 } 2190 2191 /* 2192 * Check for a conflict with a non-blocking mandatory share reservation. 2193 */ 2194 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0, 2195 NULL, cr); 2196 if (error != 0) { 2197 VN_RELE(tovp); 2198 VN_RELE(fromvp); 2199 *status = puterrno(error); 2200 return; 2201 } 2202 2203 /* Check for delegations on the source file */ 2204 2205 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) { 2206 VN_RELE(tovp); 2207 VN_RELE(fromvp); 2208 VN_RELE(srcvp); 2209 curthread->t_flag |= T_WOULDBLOCK; 2210 return; 2211 } 2212 2213 /* Check for delegation on the file being renamed over, if it exists */ 2214 2215 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE && 2216 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr) 2217 == 0) { 2218 2219 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { 2220 VN_RELE(tovp); 2221 VN_RELE(fromvp); 2222 VN_RELE(srcvp); 2223 VN_RELE(targvp); 2224 curthread->t_flag |= T_WOULDBLOCK; 2225 return; 2226 } 2227 VN_RELE(targvp); 2228 } 2229 2230 2231 if (nbl_need_check(srcvp)) { 2232 nbl_start_crit(srcvp, RW_READER); 2233 in_crit = 1; 2234 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) { 2235 error = EACCES; 2236 goto out; 2237 } 2238 } 2239 2240 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, 2241 "vop_rename_start:"); 2242 error = VOP_RENAME(fromvp, args->rna_from.da_name, 2243 tovp, args->rna_to.da_name, cr); 2244 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, 2245 "vop_rename_end:"); 2246 2247 if (error == 0) { 2248 char *tmp; 2249 2250 /* fix the path name for the renamed file */ 2251 mutex_enter(&srcvp->v_lock); 2252 tmp = srcvp->v_path; 2253 srcvp->v_path = NULL; 2254 mutex_exit(&srcvp->v_lock); 2255 vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name, 2256 strlen(args->rna_to.da_name)); 2257 if (tmp != NULL) 2258 kmem_free(tmp, 
strlen(tmp) + 1); 2259 } 2260 2261 /* 2262 * Force modified data and metadata out to stable storage. 2263 */ 2264 (void) VOP_FSYNC(tovp, 0, cr); 2265 (void) VOP_FSYNC(fromvp, 0, cr); 2266 2267 out: 2268 if (in_crit) 2269 nbl_end_crit(srcvp); 2270 VN_RELE(srcvp); 2271 VN_RELE(tovp); 2272 VN_RELE(fromvp); 2273 2274 *status = puterrno(error); 2275 2276 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2277 "rfs_rename_end:(%S)", "done"); 2278 } 2279 fhandle_t * 2280 rfs_rename_getfh(struct nfsrnmargs *args) 2281 { 2282 return (args->rna_from.da_fhandle); 2283 } 2284 2285 /* 2286 * Link to a file. 2287 * Create a file (to) which is a hard link to the given file (from). 2288 */ 2289 void 2290 rfs_link(struct nfslinkargs *args, enum nfsstat *status, 2291 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2292 { 2293 int error; 2294 vnode_t *fromvp; 2295 vnode_t *tovp; 2296 struct exportinfo *to_exi; 2297 fhandle_t *fh; 2298 2299 TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, 2300 "rfs_link_start:"); 2301 2302 fromvp = nfs_fhtovp(args->la_from, exi); 2303 if (fromvp == NULL) { 2304 *status = NFSERR_STALE; 2305 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2306 "rfs_link_end:(%S)", "from stale"); 2307 return; 2308 } 2309 2310 fh = args->la_to.da_fhandle; 2311 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen); 2312 if (to_exi == NULL) { 2313 VN_RELE(fromvp); 2314 *status = NFSERR_ACCES; 2315 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2316 "rfs_link_end:(%S)", "cross device"); 2317 return; 2318 } 2319 exi_rele(to_exi); 2320 2321 if (to_exi != exi) { 2322 VN_RELE(fromvp); 2323 *status = NFSERR_XDEV; 2324 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2325 "rfs_link_end:(%S)", "cross device"); 2326 return; 2327 } 2328 2329 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi); 2330 if (tovp == NULL) { 2331 VN_RELE(fromvp); 2332 *status = NFSERR_STALE; 2333 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2334 "rfs_link_end:(%S)", "to stale"); 2335 return; 2336 } 2337 2338 if (tovp->v_type != VDIR) { 2339 VN_RELE(tovp); 2340 
VN_RELE(fromvp); 2341 *status = NFSERR_NOTDIR; 2342 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2343 "rfs_link_end:(%S)", "not dir"); 2344 return; 2345 } 2346 /* 2347 * Disallow NULL paths 2348 */ 2349 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') { 2350 VN_RELE(tovp); 2351 VN_RELE(fromvp); 2352 *status = NFSERR_ACCES; 2353 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2354 "rfs_link_end:(%S)", "access"); 2355 return; 2356 } 2357 2358 if (rdonly(exi, req)) { 2359 VN_RELE(tovp); 2360 VN_RELE(fromvp); 2361 *status = NFSERR_ROFS; 2362 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2363 "rfs_link_end:(%S)", "rofs"); 2364 return; 2365 } 2366 2367 TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, 2368 "vop_link_start:"); 2369 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr); 2370 TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, 2371 "vop_link_end:"); 2372 2373 /* 2374 * Force modified data and metadata out to stable storage. 2375 */ 2376 (void) VOP_FSYNC(tovp, 0, cr); 2377 (void) VOP_FSYNC(fromvp, FNODSYNC, cr); 2378 2379 VN_RELE(tovp); 2380 VN_RELE(fromvp); 2381 2382 *status = puterrno(error); 2383 2384 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, 2385 "rfs_link_end:(%S)", "done"); 2386 } 2387 fhandle_t * 2388 rfs_link_getfh(struct nfslinkargs *args) 2389 { 2390 return (args->la_from); 2391 } 2392 2393 /* 2394 * Symbolicly link to a file. 2395 * Create a file (to) with the given attributes which is a symbolic link 2396 * to the given path name (to). 
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;	/* attributes for the new symlink */
	vnode_t *vp;		/* directory in which to create the link */
	vnode_t *svp;		/* the new link itself, found by lookup */
	int lerror;		/* lookup error, kept apart from error */

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
		"rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "access");
		return;
	}

	/* Translate the directory file handle into a held vnode. */
	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "stale");
		return;
	}

	/* No mutating operations on a read-only export. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "rofs");
		return;
	}

	/* Convert over-the-wire attributes; may fail on time overflow. */
	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "sattr");
		return;
	}

	/* A mode is mandatory for the new symlink. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
		"vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
		"vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 * VOP_SYMLINK does not return the new vnode, so look it up
	 * first; lerror is deliberately separate so a failed lookup
	 * does not mask the VOP_SYMLINK result in *status.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
		"vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
		"vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		"rfs_symlink_end:(%S)", "done");
}

/* Return the directory file handle of a symlink request (for dispatch). */
fhandle_t *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
		"rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "access");
		return;
	}

	/* Translate the parent file handle into a held vnode. */
	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "stale");
		return;
	}

	/* No mutating operations on a read-only export. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	/* Convert over-the-wire attributes; may fail on time overflow. */
	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	/* A mode is mandatory for the new directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
		"vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
		"vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is handed the parent
			 * vnode (vp) while va holds the attributes of the
			 * new directory (dvp) — confirm whether dvp was
			 * intended here.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		"rfs_mkdir_end:(%S)", "done");
}

/* Return the parent-directory file handle of a mkdir request. */
fhandle_t *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;	/* the parent directory */

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
		"rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "access");
		return;
	}

	/* Translate the parent file handle into a held vnode. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "stale");
		return;
	}

	/* No mutating operations on a read-only export. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove (rootdir).
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
		"vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
		"vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		"rfs_rmdir_end:(%S)", "done");
}

/* Return the parent-directory file handle of an rmdir request. */
fhandle_t *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read directory entries.
 * Fills rd with up to rda->rda_count bytes of directory entries from
 * the directory identified by rda->rda_fh, starting at rda->rda_offset.
 * The entry buffer is allocated here and freed later by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;		/* set by VOP_READDIR on success */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
		"rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
			"rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
			"rfs_readdir_end:(%S)", "notdir");
		return;
	}

	/* Hold the vnode read lock across the access check and the read. */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
		"vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
		"vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns no entries and does not report EOF. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer to the NFSv2 maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
		"vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
		"vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report end of directory. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		"rfs_readdir_end:(%S)", "done");
}

/* Return the directory file handle of a readdir request. */
fhandle_t *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}

/* Free the entry buffer allocated by rfs_readdir(). */
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}

/*
 * Get filesystem statistics.
 * Returns the server transfer size and the block counts of the
 * filesystem containing the file identified by fh.
 */
/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
		"rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
			"rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		"rfs_statfs_end:(%S)", "done");
}

/* Return the file handle of a statfs request (for dispatch). */
fhandle_t *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}

/*
 * Convert NFSv2 over-the-wire settable attributes (nfssattr) into a
 * vattr, setting a va_mask bit only for each field the client actually
 * supplied (an all-ones field means "not set").  Returns 0, or
 * EOVERFLOW on a 32-bit kernel when a time value will not fit.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

/* Map vnode vtype values to NFSv2 over-the-wire file types. */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/*
	 * An all-ones short mode means "unknown"; pass it through as the
	 * all-ones 32-bit value, otherwise fold the file-type bits into
	 * the mode.
	 */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}

/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3058 */ 3059 static void 3060 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr) 3061 { 3062 vsecattr_t vsa; 3063 int aclcnt; 3064 aclent_t *aclentp; 3065 mode_t mask_perm; 3066 mode_t grp_perm; 3067 mode_t other_perm; 3068 mode_t other_orig; 3069 int error; 3070 3071 /* dont care default acl */ 3072 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT); 3073 error = VOP_GETSECATTR(vp, &vsa, 0, cr); 3074 3075 if (!error) { 3076 aclcnt = vsa.vsa_aclcnt; 3077 if (aclcnt > MIN_ACL_ENTRIES) { 3078 /* non-trivial ACL */ 3079 aclentp = vsa.vsa_aclentp; 3080 if (exi->exi_export.ex_flags & EX_ACLOK) { 3081 /* maximal permissions */ 3082 grp_perm = 0; 3083 other_perm = 0; 3084 for (; aclcnt > 0; aclcnt--, aclentp++) { 3085 switch (aclentp->a_type) { 3086 case USER_OBJ: 3087 break; 3088 case USER: 3089 grp_perm |= 3090 aclentp->a_perm << 3; 3091 other_perm |= aclentp->a_perm; 3092 break; 3093 case GROUP_OBJ: 3094 grp_perm |= 3095 aclentp->a_perm << 3; 3096 break; 3097 case GROUP: 3098 other_perm |= aclentp->a_perm; 3099 break; 3100 case OTHER_OBJ: 3101 other_orig = aclentp->a_perm; 3102 break; 3103 case CLASS_OBJ: 3104 mask_perm = aclentp->a_perm; 3105 break; 3106 default: 3107 break; 3108 } 3109 } 3110 grp_perm &= mask_perm << 3; 3111 other_perm &= mask_perm; 3112 other_perm |= other_orig; 3113 3114 } else { 3115 /* minimal permissions */ 3116 grp_perm = 070; 3117 other_perm = 07; 3118 for (; aclcnt > 0; aclcnt--, aclentp++) { 3119 switch (aclentp->a_type) { 3120 case USER_OBJ: 3121 break; 3122 case USER: 3123 case CLASS_OBJ: 3124 grp_perm &= 3125 aclentp->a_perm << 3; 3126 other_perm &= 3127 aclentp->a_perm; 3128 break; 3129 case GROUP_OBJ: 3130 grp_perm &= 3131 aclentp->a_perm << 3; 3132 break; 3133 case GROUP: 3134 other_perm &= 3135 aclentp->a_perm; 3136 break; 3137 case OTHER_OBJ: 3138 other_perm &= 3139 aclentp->a_perm; 3140 break; 3141 default: 3142 break; 3143 } 3144 } 3145 } 3146 /* copy to va */ 3147 va->va_mode &= ~077; 3148 va->va_mode |= grp_perm 
| other_perm; 3149 } 3150 if (vsa.vsa_aclcnt) 3151 kmem_free(vsa.vsa_aclentp, 3152 vsa.vsa_aclcnt * sizeof (aclent_t)); 3153 } 3154 } 3155 3156 void 3157 rfs_srvrinit(void) 3158 { 3159 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL); 3160 } 3161 3162 void 3163 rfs_srvrfini(void) 3164 { 3165 mutex_destroy(&rfs_async_write_lock); 3166 } 3167