/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
		"rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
			"rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		"rfs_getattr_end:(%S)", "done");
}
fhandle_t *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
		"rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking that VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, so let VOP_SETATTR handle the
	 * other extremely rare cases.
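	 *
	 * For example, a client may create a file with mode 0444
	 * while still holding it open for writing; a later SETATTR
	 * that truncates the file must succeed even though a fresh
	 * access check against the read-only mode bits would deny
	 * the write.
	 *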
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also, check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
				"vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
				(offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
				"vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
			"vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
			"vop_setattr_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		"rfs_setattr_end:(%S)", "done");
}
fhandle_t *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
		"rfs_lookup_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
				"rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookup beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
			&sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
	}

	if (!error) {
		VN_SETPATH(rootdir, dvp, vp, da->da_name,
			strlen(da->da_name));
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
						sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
						exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's a public fh, no 0x81, and the client's flavor is
	 * invalid, set the WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set the RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		"rfs_lookup_end:(%S)", "done");
}
fhandle_t *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
		"rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
		"vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
		"vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		"rfs_readlink_end:(%S)", "done");
}
fhandle_t *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
		"rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark the thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
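	 * (NFSv2 is stateless and access is rechecked on every
	 * request, so without this owner bypass a client that
	 * creates a file with mode 0200 could never read back
	 * what it just wrote.)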
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				"vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				"vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
				"vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
				"vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Round the size up to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
		"vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
		"vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		"rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

fhandle_t *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling, might also
 * have to be made in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark the thread as wouldblock so
	 * the response is dropped.
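	 * Dropping the reply rather than returning an error makes
	 * the client retransmit later, by which time the delegation
	 * should have been recalled from the v4 client.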
	 */
	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "delegated");
		return;
	}

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for its cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
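 *
 * Unlike rfs_write_sync, concurrent requests against the same file
 * handle are gathered into a cluster while an earlier request holds
 * the file's VOP_RWLOCK; contiguous requests in a cluster are then
 * issued with a single VOP_WRITE.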
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_wrlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_wrlock_end");

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
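	 * off below is the offset of the lowest-offset request
	 * (the list is kept sorted by offset), and len is grown
	 * to reach the furthest byte any request in the cluster
	 * will write, so [off, off + len) covers the whole cluster.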
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
					"vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
					"vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of separate calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
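		 * One iovec is filled in per wa_data buffer, or per
		 * mblk in a request's chain; rcount trims the final
		 * mblk so that exactly wa_count bytes of each request
		 * are covered.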
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "async");

		/*
		 * Check to see if the v4 side of the server has
		 * delegated this file.  If so, then we mark the thread
		 * as wouldblock so the response is dropped.
		 */
		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
			curthread->t_flag |= T_WOULDBLOCK;
			error = EACCES; /* just to have an error */
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "delegated");
		} else {
			/*
			 * We're changing creds because VM may fault
			 * and we need the cred of the current
			 * thread to be used if quota checking is
			 * enabled.
			 */
			savecred = curthread->t_cred;
			curthread->t_cred = cr;
			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
			curthread->t_cred = savecred;
			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
				"vop_write_end:");
		}

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
			"vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
			"vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
				"vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
				"vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "async");
}

fhandle_t *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
		"rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can whack the protocol.
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* extra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * Uncompress the received dev_t
			 * if the top half is zero, indicating a request
			 * from an `older style' OS.
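			 * (i.e. the dev_t fits in 16 bits, with the
			 * major and minor numbers packed into the low
			 * half; nfsv2_expdev() expands it to a native
			 * dev_t.)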
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * Uncompress the received dev_t
		 * if the top half is zero, indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
		if (!error) {
			struct vattr at;

			VN_SETPATH(rootdir, dvp, tvp, name, strlen(name));
			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If the file size is being modified on an already existing file,
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
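			 * The range checked runs from the smaller of
			 * the old and new sizes to the larger, i.e.
			 * exactly the bytes the truncation or the
			 * extension would touch.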
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove
		 * any setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
			"vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
			"vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
				VN_RELE(tvp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
						exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		"rfs_create_end:(%S)", "done");
}
fhandle_t *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
		"rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
			NULL, cr);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
		"vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
		"vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		"rfs_remove_end:(%S)", "done");
}

fhandle_t *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Rename a file.
 * Give a file (from) a new name (to).
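 *
 * Both names must lie within the same exported filesystem; a rename
 * whose target directory resolves to a different export fails with
 * NFSERR_XDEV.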

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
	    "rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
	    == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
	    "vop_rename_start:");
	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
	    "vop_rename_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
	    "rfs_rename_end:(%S)", "done");
}
fhandle_t *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
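
/*
 * Both rfs_rename() above and rfs_link() below answer the "same
 * filesystem?" question from the file handle alone: the fsid/fid
 * embedded in the "to" handle is resolved with checkexport(), and a
 * handle that resolves to a different export than the one the request
 * arrived on is rejected with NFSERR_XDEV.  Reduced to its essentials
 * (illustration only, not compiled):
 */
#if 0	/* illustration only */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		*status = NFSERR_ACCES;	/* "to" is not exported at all */
		return;
	}
	exi_rele(to_exi);
	if (to_exi != exi) {
		*status = NFSERR_XDEV;	/* exported, but somewhere else */
		return;
	}
#endif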

/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
	    "rfs_link_start:");

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "from stale");
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "to stale");
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "not dir");
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
	    "vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
	    "vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
	    "rfs_link_end:(%S)", "done");
}
fhandle_t *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}

/*
 * Symbolically link to a file.
 * Create a file (from) with the given attributes which is a symbolic link
 * to the given path name (tnm).
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;
	vnode_t *svp;
	int lerror;

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
	    "rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
	    "vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
	    "vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
	    "vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
	    "vop_lookup_end:");
	if (!lerror) {
		VN_SETPATH(rootdir, vp, svp, args->sla_from.da_name,
		    strlen(args->sla_from.da_name));
		(void) VOP_FSYNC(svp, 0, cr);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
	    "rfs_symlink_end:(%S)", "done");
}
fhandle_t *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
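
/*
 * In command terms, the symlink arguments decode as
 *
 *	ln -s <sla_tnm> <sla_from.da_name>
 *
 * run in the directory named by sla_from.da_fhandle: sla_from supplies
 * where the new link lives and what it is called, while sla_tnm is the
 * uninterpreted path stored as the link's contents.  (Analogy only.)
 */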

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;
	vnode_t *vp;
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
	    "rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
	    "vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
	    "vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		    "vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		    "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
	    "rfs_mkdir_end:(%S)", "done");
}
fhandle_t *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
	    "rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
	    "vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
	    "vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
	    "rfs_rmdir_end:(%S)", "done");
}
fhandle_t *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
	    "rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "notdir");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
	    "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
	    "vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
	    "vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
	    "vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
	    "vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
	    "vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
	    "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
	    "vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
	    "rfs_readdir_end:(%S)", "done");
}
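
/*
 * The rd_size arithmetic in rfs_readdir() above is simply "bytes
 * actually produced": the requested count minus whatever VOP_READDIR()
 * left unconsumed in uio_resid.  A worked example (illustration only,
 * not compiled):
 */
#if 0	/* illustration only */
	/*
	 * With rda_count = 1024 and uio_resid = 400 on return, 624
	 * bytes of dirent data go back to the client; a resid equal
	 * to the full count means nothing was read, which is reported
	 * as size 0 with eof forced to TRUE.
	 */
	rd->rd_size = (uint32_t)(rda->rda_count - uio.uio_resid);
#endif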
fhandle_t *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}

/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
	    "rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		    "rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
	    "rfs_statfs_end:(%S)", "done");
}
fhandle_t *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}

static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
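
/*
 * The double test on sa_mode in sattr_to_vattr() above exists because
 * the two "no mode supplied" encodings differ only in their upper 16
 * bits: a broken client that stored the mode in a short and widened it
 * without sign extension sends 0x0000ffff, while a correct client
 * sends 0xffffffff.  A stand-alone sketch of the same comparison
 * (illustration only, not compiled):
 */
#if 0	/* illustration only */
static int
mode_was_supplied(uint32_t sa_mode)
{
	return (sa_mode != (uint32_t)((ushort_t)-1) &&	/* 0x0000ffff */
	    sa_mode != (uint32_t)-1);			/* 0xffffffff */
}
#endif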

static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
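
/*
 * The nodeid test in vattr_to_nattr() above is a round-trip check:
 * after the 64-bit value is narrowed into the 32-bit wire field,
 * widening it back and comparing reveals whether any high bits were
 * lost.  The same idiom in isolation (illustration only, not
 * compiled):
 */
#if 0	/* illustration only */
static int
nodeid_fits32(u_longlong_t nodeid)
{
	uint32_t wire = (uint32_t)nodeid;	/* what NFSv2 can carry */

	return (nodeid == (u_longlong_t)wire);	/* 0 if bits were lost */
}
#endif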

/*
 * ACL v2 support: returns approximate permissions.
 *	default: returns minimal permissions (more restrictive)
 *	aclok: returns maximal permissions (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;
	mode_t grp_perm;
	mode_t other_perm;
	mode_t other_orig;
	int error;

	/* don't care about the default ACL here */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}

void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
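
/*
 * A worked example of the two acl_perm() policies above (illustrative
 * numbers only).  Consider a non-trivial ACL with
 *
 *	USER (some uid)		rwx (7)
 *	GROUP_OBJ		r-- (4)
 *	CLASS_OBJ (mask)	rwx (7)
 *	OTHER_OBJ		--- (0)
 *
 * With aclok (maximal):  grp_perm = ((7 | 4) << 3) & (7 << 3) = 070 (rwx),
 * other_perm = (7 & 7) | 0 = 07 (rwx).
 * Without (minimal):     grp_perm = 070 & (7 << 3) & (7 << 3) & (4 << 3)
 * = 040 (r--), other_perm = 07 & 7 & 7 & 0 = 0 (---).
 * The maximal form can report more access than some principals actually
 * have and the minimal form less; the v2 fattr cannot carry the ACL
 * itself, so either answer is an approximation.
 */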