/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */
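
/*
 * For illustration only (hypothetical values, following this file's
 * own #if 0 convention): the type bits above are OR-ed together with
 * the permission bits, so a character special file with mode 0644
 * travels over the wire as 0020644, and the receiver separates the
 * two halves with IFMT.
 */
#if 0	/* illustration, not compiled */
	uint_t wire_mode = IFCHR | 0644;	/* 0020644 on the wire */
	ASSERT((wire_mode & IFMT) == IFCHR);	/* recover the file type */
	ASSERT((wire_mode & ~IFMT) == 0644);	/* recover the permissions */
#endif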

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
		"rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
			"rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		"rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
		"rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;
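
	/*
	 * A minimal sketch of the sentinel above (illustrative, not
	 * compiled; field names assumed per the NFSv2 wire structures):
	 * the client asks for "server time" by sending an out-of-range
	 * microsecond count, which the usec-to-nsec scaling in
	 * sattr_to_vattr() turns into the magic 1-billion value.
	 */
#if 0	/* illustration only, not compiled */
	/* client side: an impossible microsecond count means "server time" */
	args->saa_sa.sa_mtime.tv_usec = 1000000;
	/* server side, after sattr_to_vattr()'s usec-to-nsec scaling: */
	ASSERT(va.va_mtime.tv_nsec == 1000000000);
#endif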

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files; let VOP_SETATTR handle the other,
	 * extremely rare, cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
			va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;	/* l_len == 0 frees through end of file */
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
				"vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
				(offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
				"vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
			"vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
			"vop_setattr_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		"rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
		"rfs_lookup_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
				"rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
					&sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
							sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
							exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);
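
	/*
	 * Illustration of the multi-component case above (hypothetical
	 * request): a WebNFS client holding only the public filehandle
	 * can send LOOKUP(public-fh, "dir1/dir2/file");
	 * rfs_publicfh_mclookup() then walks the whole path, following
	 * symbolic links, and may hand back a vnode and exportinfo for
	 * a different exported filesystem than the one the request
	 * named, which is why exi can change here.
	 */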

	/*
	 * If it is the public filehandle, not an overloaded (0x81)
	 * filehandle, and the client's flavor is invalid, set the WebNFS
	 * status to WNFSERR_CLNT_FLAVOR now.  The RPC status is then set
	 * to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		"rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
		"rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
		"vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
		"vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		"rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
		"rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark the thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				"vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				"vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
				"vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
				"vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's round up the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;
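
	/*
	 * Worked example of the roundup above (illustrative count): XDR
	 * encodes opaque data in four-byte units (BYTES_PER_XDR_UNIT),
	 * so a 10-byte read allocates RNDUP(10) == 12 bytes of mblk
	 * space; the reply still reports the true byte count via
	 * rr_count below.
	 */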

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
		"vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
		"vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "getattr error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		"rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling, might also
 * have to be made in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark the thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "delegated");
		return;
	}

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
		"vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
		"vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			"vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			"vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			"vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for its cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
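
/*
 * Shape of the structures above (illustration, hypothetical handles
 * and offsets): rfs_async_write_head points to one
 * rfs_async_write_list node per file handle currently being written,
 * and each node carries a list of pending requests kept sorted by
 * starting offset, e.g.:
 *
 *	head -> [fh A] -> [fh B]
 *		   |	     |
 *		 (off 0)   (off 8192)
 *		 (off 8192)
 *
 * The thread that created a cluster processes every request on it and
 * then wakes the waiting threads through the cluster's condition
 * variable.
 */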

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_wrlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_wrlock_end");

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);
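
	/*
	 * Illustration of the clustering payoff (hypothetical cluster):
	 * while this thread waited in VOP_RWLOCK above, three 8K
	 * requests at offsets 0, 8192 and 16384 may have queued on the
	 * cluster; the gathering loop below recognizes them as one
	 * contiguous run and issues a single 24K VOP_WRITE, while a
	 * request at offset 65536 would start a second run and a
	 * second VOP_WRITE.
	 */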

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
					"vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
					"vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of separate calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovecs which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "async");

		/*
		 * Check to see if the v4 side of the server has
		 * delegated this file.  If so, then we mark the
		 * thread as wouldblock so the response is dropped.
		 */
		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
			curthread->t_flag |= T_WOULDBLOCK;
			error = EACCES; /* just to have an error */
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "delegated");
		} else {
			/*
			 * We're changing creds because VM may fault
			 * and we need the cred of the current
			 * thread to be used if quota checking is
			 * enabled.
			 */
			savecred = curthread->t_cred;
			curthread->t_cred = cr;
			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
			curthread->t_cred = savecred;
			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
				"vop_write_end:");
		}

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
			"vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
			"vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
				"vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
				"vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		"rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
		"rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(dvp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can whack the protocol.
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;
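
	/*
	 * Sketch of the device-number overload above (illustrative
	 * values): the client smuggles the device number in the size
	 * field, since the v2 protocol has no mknod call.  An "older
	 * style" client packs the device number into the low 16 bits,
	 * so the top half of the received value is zero and
	 * nfsv2_expdev() expands it into a native dev_t; a newer client
	 * sends a full 32-bit dev_t, which is used as-is.  Either way
	 * AT_SIZE is cleared so the fake "size" never reaches
	 * VOP_CREATE.
	 */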

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
			"vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
			"vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If the file size is being modified on an already existing file,
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove any
		 * setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
			"vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
			"vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			/* check delegations on the newly created vnode */
			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
				VN_RELE(vp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
							exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		"rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
		"rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
			NULL, cr);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
		"vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
		"vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		"rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
		"rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
		"rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
			NULL, cr);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
	    == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
		"vop_rename_start:");
	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
		"vop_rename_end:");

	if (error == 0) {
		char *tmp;

		/* fix the path name for the renamed file */
		mutex_enter(&srcvp->v_lock);
		tmp = srcvp->v_path;
		srcvp->v_path = NULL;
		mutex_exit(&srcvp->v_lock);
		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));
		if (tmp != NULL)
			kmem_free(tmp, strlen(tmp) + 1);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		"rfs_rename_end:(%S)", "done");
}

void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
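/*
 * Both rfs_rename() above and rfs_link() below must refuse to operate
 * across exports: the "to" file handle is looked up with checkexport()
 * and, if it resolves to a different export than the one this request
 * arrived on, the server answers NFSERR_XDEV.  A handle that matches
 * no export at all gets NFSERR_ACCES instead, presumably because the
 * filesystem may simply have been unexported rather than gone stale.
 */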
/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
		"rfs_link_start:");

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "from stale");
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "to stale");
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "not dir");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
			"rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
		"vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
		"vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		"rfs_link_end:(%S)", "done");
}

void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
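/*
 * Note the two flavors of VOP_FSYNC() in rfs_link() above: the
 * directory that gained the new name is synced with flag 0 (data and
 * metadata), while the file whose link count changed is synced with
 * FNODSYNC, which, as this file uses it elsewhere, pushes only the
 * modified metadata -- the file's data was not touched by the link.
 */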
/*
 * Symbolically link to a file.
 * Create a file (from) with the given attributes which is a symbolic
 * link to the given path name (tnm).
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;
	vnode_t *svp;
	int lerror;

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
		"rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
			"rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
		"vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
		"vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
		"vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
		"vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		"rfs_symlink_end:(%S)", "done");
}

void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
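/*
 * rfs_symlink() above has to go back and VOP_LOOKUP() the name it
 * just created: this revision of VOP_SYMLINK() does not hand back the
 * new vnode, and without a vnode there is nothing to VOP_FSYNC().  If
 * the lookup fails, the fsync of the new link is simply skipped; the
 * directory itself is still synced before the reply goes out.
 */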
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;
	vnode_t *vp;
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
		"rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
			"rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
		"vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
		"vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			"vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			"vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		"rfs_mkdir_end:(%S)", "done");
}

void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
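/*
 * Unlike VOP_SYMLINK(), VOP_MKDIR() does return the newly created
 * vnode (dvp above), so rfs_mkdir() can fetch its attributes, build
 * the reply's file handle, and fsync it directly with no second
 * lookup.  Note also that both mkdir and symlink insist on AT_MODE
 * being present in the client's sattr and answer NFSERR_INVAL
 * otherwise.
 */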
/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
		"rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
			"rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
		"vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
		"vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		"rfs_rmdir_end:(%S)", "done");
}

void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
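/*
 * The EEXIST special case in rfs_rmdir() above exists because
 * puterrno(EEXIST) would yield NFSERR_EXIST, which a client would
 * read as "file exists" -- not what happened.  Translating to
 * NFSERR_NOTEMPTY first keeps the System V "directory not empty"
 * convention from leaking over the wire.
 */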
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
		"rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
			"rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
			"rfs_readdir_end:(%S)", "notdir");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
		"vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
		"vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
		"vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
		"vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
		"vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
		"vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		"vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		"vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		"rfs_readdir_end:(%S)", "done");
}

void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
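/*
 * The rd_entries buffer allocated in rfs_readdir() deliberately
 * outlives the handler, presumably so the reply can be encoded from
 * it; only afterwards is rfs_rddirfree() (below) called to release
 * rd_bufsize bytes.  The size/eof bookkeeping is worth a worked
 * example: if the client asked for rda_count = 1024 bytes and
 * VOP_READDIR() left uio_resid at 1024, nothing was read, so rd_size
 * is 0 and eof is assumed (TRUE); if uio_resid came back as 24,
 * rd_size is 1000 and rd_eof simply reflects what VOP_READDIR()
 * reported in iseof.
 */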
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}

/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
		"rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
			"rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		"rfs_statfs_end:(%S)", "done");
}

void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
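/*
 * A concrete illustration of the sign-extension check at the top of
 * sattr_to_vattr() below: NFS version 2 uses all-ones as the
 * "attribute not set" sentinel.  A correct client sends 0xffffffff
 * ((uint32_t)-1); a client with the old short-mode bug sends
 * 0x0000ffff ((uint32_t)(ushort_t)-1), the value left behind when -1
 * was stored in a 16-bit mode and widened without sign extension.
 * Treating either value as "don't change the mode" keeps both kinds
 * of client working.
 */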
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short)-1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are values bigger than 32 bits supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
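/*
 * The overflow checks in vattr_to_nattr() above all guard the same
 * boundary: the v2 wire format carries 32-bit fields.  For example, a
 * file whose 64-bit va_nodeid is 0x100000001 would come back from the
 * round trip through the 32-bit na_nodeid as 0x1, so the comparison
 * fails and the caller sees EFBIG.  Likewise a regular file or
 * directory larger than MAXOFF32_T cannot be described at all, while
 * an oversized block or character special file is clamped to
 * MAXOFF32_T instead, apparently because its "size" is not meaningful
 * data.
 */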
/*
 * ACL v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as the GROUP_OBJ entry.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;
	mode_t grp_perm;
	mode_t other_perm;
	mode_t other_orig;
	int error;

	/* don't care about the default ACL */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}

void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
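/*
 * A worked example of acl_perm() for an export with EX_ACLOK set
 * (maximal permissions), using made-up entries: given a non-trivial
 * ACL of USER_OBJ rwx, USER joe:r--, GROUP_OBJ rw-, CLASS_OBJ rw-,
 * OTHER_OBJ r--, the loop accumulates grp_perm = (04|06) << 3 = 060
 * and other_perm = 04, the CLASS_OBJ mask of 06 leaves both
 * unchanged, and the OTHER_OBJ bits are OR-ed back in, so the low
 * bits of va_mode become 064 (group rw-, other r--).  In effect a v2
 * client, which can only see mode bits, is shown the most permissive
 * mode any principal could enjoy; the minimal (default) branch
 * computes the least permissive instead.
 */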