/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int sattr_to_vattr(struct nfssattr *, struct vattr *);
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
        cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define IFMT    0170000         /* type of file */
#define IFCHR   0020000         /* character special */
#define IFBLK   0060000         /* block special */
#define IFSOCK  0140000         /* socket */

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
        struct svc_req *req, cred_t *cr)
{
        int error;
        vnode_t *vp;
        struct vattr va;

        TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
            "rfs_getattr_start:");

        vp = nfs_fhtovp(fhp, exi);
        if (vp == NULL) {
                ns->ns_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
                    "rfs_getattr_end:(%S)", "stale");
                return;
        }

        /*
         * Do the getattr.
         */
        va.va_mask = AT_ALL;    /* we want all the attributes */
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = rfs4_delegated_getattr(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");

        /* check for overflows */
        if (!error) {
                acl_perm(vp, exi, &va, cr);
                error = vattr_to_nattr(&va, &ns->ns_attr);
        }

        VN_RELE(vp);

        ns->ns_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
            "rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
        return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error;
        int flag;
        int in_crit = 0;
        vnode_t *vp;
        struct vattr va;
        struct vattr bva;
        struct flock64 bf;

        TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
            "rfs_setattr_start:");

        vp = nfs_fhtovp(&args->saa_fh, exi);
        if (vp == NULL) {
                ns->ns_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                    "rfs_setattr_end:(%S)", "stale");
                return;
        }

        if (rdonly(exi, req) || vn_is_readonly(vp)) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ROFS;
                TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                    "rfs_setattr_end:(%S)", "rofs");
                return;
        }

        error = sattr_to_vattr(&args->saa_sa, &va);
        if (error) {
                VN_RELE(vp);
                ns->ns_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                    "rfs_setattr_end:(%S)", "sattr");
                return;
        }

        /*
         * If the client is requesting a change to the mtime,
         * but the nanosecond field is set to 1 billion, then
         * this is a flag to the server that it should set the
         * atime and mtime fields to the server's current time.
         * The 1 billion number actually came from the client
         * as 1 million, but the units in the over the wire
         * request are microseconds instead of nanoseconds.
         *
         * This is an overload of the protocol and should be
         * documented in the NFS Version 2 protocol specification.
         */
        if (va.va_mask & AT_MTIME) {
                if (va.va_mtime.tv_nsec == 1000000000) {
                        gethrestime(&va.va_mtime);
                        va.va_atime = va.va_mtime;
                        va.va_mask |= AT_ATIME;
                        flag = 0;
                } else
                        flag = ATTR_UTIME;
        } else
                flag = 0;

        /*
         * If the filesystem is exported with nosuid, then mask off
         * the setuid and setgid bits.
         */
        if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
            (exi->exi_export.ex_flags & EX_NOSUID))
                va.va_mode &= ~(VSUID | VSGID);

        /*
         * We need to specially handle size changes because it is
         * possible for the client to create a file with modes
         * which indicate read-only, but with the file opened for
         * writing.  If the client then tries to set the size of
         * the file, then the normal access checking done in
         * VOP_SETATTR would prevent the client from doing so,
         * although it should be legal for it to do so.  To get
         * around this, we do the access checking for ourselves
         * and then use VOP_SPACE which doesn't do the access
         * checking which VOP_SETATTR does.  VOP_SPACE can only
         * operate on VREG files, let VOP_SETATTR handle the other
         * extremely rare cases.
         * Also, the client should not be allowed to change the
         * size of the file if there is a conflicting non-blocking
         * mandatory lock in the region of change.
         *
         * Second, check to see if the v4 side of the server has
         * delegated this file.  If so, then we set T_WOULDBLOCK
         * so that the dispatch function doesn't send a reply, forcing
         * the client to retransmit its request.
         */
        if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
                /* If delegated, mark as wouldblock so response is dropped */
                if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
                        VN_RELE(vp);
                        curthread->t_flag |= T_WOULDBLOCK;
                        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                            "rfs_setattr_end:(%S)", "delegated");
                        return;
                }
                if (nbl_need_check(vp)) {
                        nbl_start_crit(vp, RW_READER);
                        in_crit = 1;
                }

                bva.va_mask = AT_UID | AT_SIZE;
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                    "vop_getattr_start:");
                error = VOP_GETATTR(vp, &bva, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                    "vop_getattr_end:");
                if (error) {
                        if (in_crit)
                                nbl_end_crit(vp);
                        VN_RELE(vp);
                        ns->ns_status = puterrno(error);
                        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
                            "rfs_setattr_end:(%S)", "getattr");
                        return;
                }

                if (in_crit) {
                        u_offset_t offset;
                        ssize_t length;

                        if (va.va_size < bva.va_size) {
                                offset = va.va_size;
                                length = bva.va_size - va.va_size;
                        } else {
                                offset = bva.va_size;
                                length = va.va_size - bva.va_size;
                        }
                        if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
                                error = EACCES;
                        }
                }

                if (crgetuid(cr) == bva.va_uid && !error &&
                    va.va_size != bva.va_size) {
                        va.va_mask &= ~AT_SIZE;
                        bf.l_type = F_WRLCK;
                        bf.l_whence = 0;
                        bf.l_start = (off64_t)va.va_size;
                        bf.l_len = 0;
                        bf.l_sysid = 0;
                        bf.l_pid = 0;
                        TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
                            "vop_space_start:");
                        error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
                            (offset_t)va.va_size, cr, NULL);
                        TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
                            "vop_space_end:");
                }
                if (in_crit)
                        nbl_end_crit(vp);
        } else
                error = 0;

        /*
         * Do the setattr.
         */
        if (!error && va.va_mask) {
                TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
                    "vop_setattr_start:");
                error = VOP_SETATTR(vp, &va, flag, cr, NULL);
                TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
                    "vop_setattr_end:");
        }

        if (!error) {
                va.va_mask = AT_ALL;    /* get everything */
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                    "vop_getattr_start:");
                error = rfs4_delegated_getattr(vp, &va, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                    "vop_getattr_end:");

                /* check for overflows */
                if (!error) {
                        acl_perm(vp, exi, &va, cr);
                        error = vattr_to_nattr(&va, &ns->ns_attr);
                }
        }

        /*
         * Force modified metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, FNODSYNC, cr);

        VN_RELE(vp);

        ns->ns_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
            "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
        return (&args->saa_fh);
}
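
/*
 * Each of the rfs_*_getfh routines in this file simply returns a
 * pointer to the file handle embedded in its request arguments; the
 * common dispatch code uses that handle to locate the export
 * information for the request.
 */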

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error;
        vnode_t *dvp;
        vnode_t *vp;
        struct vattr va;
        fhandle_t *fhp = da->da_fhandle;
        struct sec_ol sec = {0, 0};
        bool_t publicfh_flag = FALSE, auth_weak = FALSE;

        TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
            "rfs_lookup_start:");

        /*
         * Trusted Extensions doesn't support NFSv2.  MOUNT
         * will reject v2 clients.  Need to prevent v2 client
         * access via WebNFS here.
         */
        if (is_system_labeled() && req->rq_vers == 2) {
                dr->dr_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
                    "rfs_lookup_end:(%S)", "access");
                return;
        }

        /*
         * Disallow NULL paths
         */
        if (da->da_name == NULL || *da->da_name == '\0') {
                dr->dr_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
                    "rfs_lookup_end:(%S)", "access");
                return;
        }

        /*
         * Allow lookups from the root - the default
         * location of the public filehandle.
         */
        if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
                dvp = rootdir;
                VN_HOLD(dvp);
        } else {
                dvp = nfs_fhtovp(fhp, exi);
                if (dvp == NULL) {
                        dr->dr_status = NFSERR_STALE;
                        TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
                            "rfs_lookup_end:(%S)", "stale");
                        return;
                }
        }

        /*
         * Do not allow lookups beyond the root.
         * If the filehandle matches a filehandle of the exi,
         * then the ".." refers beyond the root of an exported filesystem.
         */
        if (strcmp(da->da_name, "..") == 0 &&
            EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
                VN_RELE(dvp);
                dr->dr_status = NFSERR_NOENT;
                TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
                    "rfs_lookup_end:(%S)", "noent");
                return;
        }

        /*
         * If the public filehandle is used then allow
         * a multi-component lookup, i.e. evaluate
         * a pathname and follow symbolic links if
         * necessary.
         *
         * This may result in a vnode in another filesystem
         * which is OK as long as the filesystem is exported.
         */
        if (PUBLIC_FH2(fhp)) {
                publicfh_flag = TRUE;
                error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
                    &sec);
        } else {
                /*
                 * Do a normal single component lookup.
                 */
                TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
                    "vop_lookup_start:");
                error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
                    "vop_lookup_end:");
        }

        if (!error) {
                va.va_mask = AT_ALL;    /* we want everything */
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                    "vop_getattr_start:");
                error = rfs4_delegated_getattr(vp, &va, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                    "vop_getattr_end:");
                /* check for overflows */
                if (!error) {
                        acl_perm(vp, exi, &va, cr);
                        error = vattr_to_nattr(&va, &dr->dr_attr);
                        if (!error) {
                                if (sec.sec_flags & SEC_QUERY)
                                        error = makefh_ol(&dr->dr_fhandle, exi,
                                            sec.sec_index);
                                else {
                                        error = makefh(&dr->dr_fhandle, vp,
                                            exi);
                                        if (!error && publicfh_flag &&
                                            !chk_clnt_sec(exi, req))
                                                auth_weak = TRUE;
                                }
                        }
                }
                VN_RELE(vp);
        }

        VN_RELE(dvp);

        /*
         * If publicfh_flag is true then we have called rfs_publicfh_mclookup
         * and have obtained a new exportinfo in exi which needs to be
         * released.  Note that the original exportinfo pointed to by exi
         * will be released by the caller, common_dispatch.
         */
        if (publicfh_flag && exi != NULL)
                exi_rele(exi);

        /*
         * If it's public fh, no 0x81, and client's flavor is
         * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
         * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
         */
        if (auth_weak)
                dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
        else
                dr->dr_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
            "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
        return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
        struct svc_req *req, cred_t *cr)
{
        int error;
        struct iovec iov;
        struct uio uio;
        vnode_t *vp;
        struct vattr va;

        TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
            "rfs_readlink_start:");

        vp = nfs_fhtovp(fhp, exi);
        if (vp == NULL) {
                rl->rl_data = NULL;
                rl->rl_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
                    "rfs_readlink_end:(%S)", "stale");
                return;
        }

        va.va_mask = AT_MODE;
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");

        if (error) {
                VN_RELE(vp);
                rl->rl_data = NULL;
                rl->rl_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
                    "rfs_readlink_end:(%S)", "getattr error");
                return;
        }

        if (MANDLOCK(vp, va.va_mode)) {
                VN_RELE(vp);
                rl->rl_data = NULL;
                rl->rl_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
                    "rfs_readlink_end:(%S)", "access");
                return;
        }

        /*
         * XNFS and RFC1094 require us to return ENXIO if argument
         * is not a link.  BUGID 1138002.
         */
        if (vp->v_type != VLNK) {
                VN_RELE(vp);
                rl->rl_data = NULL;
                rl->rl_status = NFSERR_NXIO;
                TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
                    "rfs_readlink_end:(%S)", "nxio");
                return;
        }

        /*
         * Allocate data for pathname.  This will be freed by rfs_rlfree.
         */
        rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

        /*
         * Set up io vector to read sym link data
         */
        iov.iov_base = rl->rl_data;
        iov.iov_len = NFS_MAXPATHLEN;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = (offset_t)0;
        uio.uio_resid = NFS_MAXPATHLEN;

        /*
         * Do the readlink.
         */
        TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
            "vop_readlink_start:");
        error = VOP_READLINK(vp, &uio, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
            "vop_readlink_end:");

#if 0 /* notyet */
        /*
         * Don't do this.  It causes local disk writes when just
         * reading the file and the overhead is deemed larger
         * than the benefit.
         */
        /*
         * Force modified metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

        VN_RELE(vp);

        rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

        /*
         * XNFS and RFC1094 require us to return ENXIO if argument
         * is not a link.  UFS returns EINVAL if this is the case,
         * so we do the mapping here.  BUGID 1138002.
         */
        if (error == EINVAL)
                rl->rl_status = NFSERR_NXIO;
        else
                rl->rl_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
            "rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
        return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
        if (rl->rl_data != NULL)
                kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        vnode_t *vp;
        int error;
        struct vattr va;
        struct iovec iov;
        struct uio uio;
        mblk_t *mp;
        int alloc_err = 0;
        int in_crit = 0;

        TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
            "rfs_read_start:");

        vp = nfs_fhtovp(&ra->ra_fhandle, exi);
        if (vp == NULL) {
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "stale");
                return;
        }

        if (vp->v_type != VREG) {
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_ISDIR;
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "isdir");
                return;
        }

        /*
         * Check to see if the v4 side of the server has delegated
         * this file.  If so, then we mark thread as wouldblock so
         * the response is dropped.
         */
        if (rfs4_check_delegated(FREAD, vp, FALSE)) {
                VN_RELE(vp);
                curthread->t_flag |= T_WOULDBLOCK;
                rr->rr_data = NULL;
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "delegated");
                return;
        }

        /*
         * Enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with write requests.
         */
        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
                    0)) {
                        nbl_end_crit(vp);
                        VN_RELE(vp);
                        rr->rr_data = NULL;
                        rr->rr_status = NFSERR_ACCES;
                        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                            "rfs_read_end:(%S)", " csf access error");
                        return;
                }
                in_crit = 1;
        }

        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
            "vop_rwlock_start:");
        (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
            "vop_rwlock_end:");

        va.va_mask = AT_ALL;
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");

        if (error) {
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                    "vop_rwunlock_start:");
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
                if (in_crit)
                        nbl_end_crit(vp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                    "vop_rwunlock_end:");
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "getattr error");
                return;
        }

        /*
         * This is a kludge to allow reading of files created
         * with no read permission.  The owner of the file
         * is always allowed to read it.
         */
        if (crgetuid(cr) != va.va_uid) {
                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
                    "vop_access_start:");
                error = VOP_ACCESS(vp, VREAD, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
                    "vop_access_end:");
                if (error) {
                        /*
                         * Exec is the same as read over the net because
                         * of demand loading.
                         */
                        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
                            "vop_access_start:");
                        error = VOP_ACCESS(vp, VEXEC, 0, cr);
                        TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
                            "vop_access_end:");
                }
                if (error) {
                        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                            "vop_rwunlock_start:");
                        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
                        if (in_crit)
                                nbl_end_crit(vp);
                        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                            "vop_rwunlock_end:");
                        VN_RELE(vp);
                        rr->rr_data = NULL;
                        rr->rr_status = puterrno(error);
                        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                            "rfs_read_end:(%S)", "access error");
                        return;
                }
        }

        if (MANDLOCK(vp, va.va_mode)) {
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                    "vop_rwunlock_start:");
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
                if (in_crit)
                        nbl_end_crit(vp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                    "vop_rwunlock_end:");
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "mand lock");
                return;
        }

        if ((u_offset_t)ra->ra_offset >= va.va_size) {
                rr->rr_count = 0;
                rr->rr_data = NULL;
                /*
                 * In this case, status is NFS_OK, but there is no data
                 * to encode.  So set rr_mp to NULL.
                 */
                rr->rr_mp = NULL;
                goto done;
        }

        /*
         * mp will contain the data to be sent out in the read reply.
         * This will be freed after the reply has been sent out (by the
         * driver).
         * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
         * that the call to xdrmblk_putmblk() never fails.
         */
        mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
            &alloc_err);
        ASSERT(mp != NULL);
        ASSERT(alloc_err == 0);

        rr->rr_mp = mp;

        /*
         * Set up io vector
         */
        iov.iov_base = (caddr_t)mp->b_datap->db_base;
        iov.iov_len = ra->ra_count;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = (offset_t)ra->ra_offset;
        uio.uio_resid = ra->ra_count;

        TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
            "vop_read_start:");
        error = VOP_READ(vp, &uio, 0, cr, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
            "vop_read_end:");

        if (error) {
                freeb(mp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                    "vop_rwunlock_start:");
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
                if (in_crit)
                        nbl_end_crit(vp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                    "vop_rwunlock_end:");
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "read error");
                return;
        }

        /*
         * Get attributes again so we can send the latest access
         * time to the client side for his cache.
         */
        va.va_mask = AT_ALL;
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");
        if (error) {
                freeb(mp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
                    "vop_rwunlock_start:");
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
                if (in_crit)
                        nbl_end_crit(vp);
                TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
                    "vop_rwunlock_end:");
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
                    "rfs_read_end:(%S)", "read error");
                return;
        }

        rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

        rr->rr_data = (char *)mp->b_datap->db_base;

done:
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
        if (in_crit)
                nbl_end_crit(vp);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");

        acl_perm(vp, exi, &va, cr);

        /* check for overflows */
        error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
        /*
         * Don't do this.  It causes local disk writes when just
         * reading the file and the overhead is deemed larger
         * than the benefit.
         */
        /*
         * Force modified metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

        VN_RELE(vp);

        rr->rr_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
            "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
        mblk_t *mp;

        if (rr->rr_status == NFS_OK) {
                mp = rr->rr_mp;
                if (mp != NULL)
                        freeb(mp);
        }
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
        return (&ra->ra_fhandle);
}

#define MAX_IOVECS      12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error;
        vnode_t *vp;
        rlim64_t rlimit;
        struct vattr va;
        struct uio uio;
        struct iovec iov[MAX_IOVECS];
        mblk_t *m;
        struct iovec *iovp;
        int iovcnt;
        cred_t *savecred;
        int in_crit = 0;

        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
            "rfs_write_start:(%S)", "sync");

        vp = nfs_fhtovp(&wa->wa_fhandle, exi);
        if (vp == NULL) {
                ns->ns_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "stale");
                return;
        }

        if (rdonly(exi, req)) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ROFS;
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "rofs");
                return;
        }

        if (vp->v_type != VREG) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ISDIR;
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "isdir");
                return;
        }

        /*
         * Check to see if the v4 side of the server has delegated
         * this file.  If so, then we mark thread as wouldblock so
         * the response is dropped.
         */
        if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
                VN_RELE(vp);
                curthread->t_flag |= T_WOULDBLOCK;
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "delegated");
                return;
        }

        va.va_mask = AT_UID|AT_MODE;
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
            "vop_getattr_start:");
        error = VOP_GETATTR(vp, &va, 0, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
            "vop_getattr_end:");

        if (error) {
                VN_RELE(vp);
                ns->ns_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "getattr error");
                return;
        }

        if (crgetuid(cr) != va.va_uid) {
                /*
                 * This is a kludge to allow writes of files created
                 * with read only permission.  The owner of the file
                 * is always allowed to write it.
                 */
                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
                    "vop_access_start:");
                error = VOP_ACCESS(vp, VWRITE, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
                    "vop_access_end:");
                if (error) {
                        VN_RELE(vp);
                        ns->ns_status = puterrno(error);
                        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                            "rfs_write_end:(%S)", "access error");
                        return;
                }
        }

        /*
         * Can't access a mandatory lock file.  This might cause
         * the NFS service thread to block forever waiting for a
         * lock to be released that will never be released.
         */
        if (MANDLOCK(vp, va.va_mode)) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "mand lock");
                return;
        }

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
                    wa->wa_count, 0)) {
                        error = EACCES;
                        goto out;
                }
        }

        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
            "vop_rwlock_start:");
        (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
            "vop_rwlock_end:");

        if (wa->wa_data) {
                iov[0].iov_base = wa->wa_data;
                iov[0].iov_len = wa->wa_count;
                uio.uio_iov = iov;
                uio.uio_iovcnt = 1;
                uio.uio_segflg = UIO_SYSSPACE;
                uio.uio_extflg = UIO_COPY_DEFAULT;
                uio.uio_loffset = (offset_t)wa->wa_offset;
                uio.uio_resid = wa->wa_count;
                /*
                 * The limit is checked on the client.  We
                 * should allow any size writes here.
                 */
                uio.uio_llimit = curproc->p_fsz_ctl;
                rlimit = uio.uio_llimit - wa->wa_offset;
                if (rlimit < (rlim64_t)uio.uio_resid)
                        uio.uio_resid = (uint_t)rlimit;

                /*
                 * for now we assume no append mode
                 */
                TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
                    "vop_write_start:(%S)", "sync");
                /*
                 * We're changing creds because VM may fault and we need
                 * the cred of the current thread to be used if quota
                 * checking is enabled.
                 */
                savecred = curthread->t_cred;
                curthread->t_cred = cr;
                error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
                curthread->t_cred = savecred;
                TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
                    "vop_write_end:");
        } else {
                iovcnt = 0;
                for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
                        iovcnt++;
                if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
                        rfs_write_sync_hits++;
#endif
                        iovp = iov;
                } else {
#ifdef DEBUG
                        rfs_write_sync_misses++;
#endif
                        iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
                }
                mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
                uio.uio_iov = iovp;
                uio.uio_iovcnt = iovcnt;
                uio.uio_segflg = UIO_SYSSPACE;
                uio.uio_extflg = UIO_COPY_DEFAULT;
                uio.uio_loffset = (offset_t)wa->wa_offset;
                uio.uio_resid = wa->wa_count;
                /*
                 * The limit is checked on the client.  We
                 * should allow any size writes here.
                 */
                uio.uio_llimit = curproc->p_fsz_ctl;
                rlimit = uio.uio_llimit - wa->wa_offset;
                if (rlimit < (rlim64_t)uio.uio_resid)
                        uio.uio_resid = (uint_t)rlimit;

                /*
                 * For now we assume no append mode.
                 */
                TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
                    "vop_write_start:(%S)", "iov sync");
                /*
                 * We're changing creds because VM may fault and we need
                 * the cred of the current thread to be used if quota
                 * checking is enabled.
                 */
                savecred = curthread->t_cred;
                curthread->t_cred = cr;
                error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
                curthread->t_cred = savecred;
                TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
                    "vop_write_end:");

                if (iovp != iov)
                        kmem_free(iovp, sizeof (*iovp) * iovcnt);
        }

        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");

        if (!error) {
                /*
                 * Get attributes again so we send the latest mod
                 * time to the client side for his cache.
                 */
                va.va_mask = AT_ALL;    /* now we want everything */
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                    "vop_getattr_start:");
                error = VOP_GETATTR(vp, &va, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                    "vop_getattr_end:");
                /* check for overflows */
                if (!error) {
                        acl_perm(vp, exi, &va, cr);
                        error = vattr_to_nattr(&va, &ns->ns_attr);
                }
        }

out:
        if (in_crit)
                nbl_end_crit(vp);
        VN_RELE(vp);

        ns->ns_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
        struct nfswriteargs *wa;
        struct nfsattrstat *ns;
        struct svc_req *req;
        cred_t *cr;
        kthread_t *thread;
        struct rfs_async_write *list;
};

struct rfs_async_write_list {
        fhandle_t *fhp;
        kcondvar_t cv;
        struct rfs_async_write *list;
        struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

#define MAXCLIOVECS     42
#define RFSWRITE_INITVAL        (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
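
/*
 * Overview of the write clustering mechanism implemented by rfs_write:
 * the first request for a given file handle creates a cluster (an
 * rfs_async_write_list entry) and then blocks in VOP_RWLOCK while the
 * cluster sits on rfs_async_write_head.  Requests arriving for the
 * same file handle in the meantime join the cluster's list, sorted by
 * starting offset, and wait on its condition variable.  The thread
 * that owns the cluster gathers contiguous requests into a single
 * VOP_WRITE, fills in each waiter's status, and wakes the waiters
 * with cv_broadcast.
 */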

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error;
        vnode_t *vp;
        rlim64_t rlimit;
        struct vattr va;
        struct uio uio;
        struct rfs_async_write_list *lp;
        struct rfs_async_write_list *nlp;
        struct rfs_async_write *rp;
        struct rfs_async_write *nrp;
        struct rfs_async_write *trp;
        struct rfs_async_write *lrp;
        int data_written;
        int iovcnt;
        mblk_t *m;
        struct iovec *iovp;
        struct iovec *niovp;
        struct iovec iov[MAXCLIOVECS];
        int count;
        int rcount;
        uint_t off;
        uint_t len;
        struct rfs_async_write nrpsp;
        struct rfs_async_write_list nlpsp;
        ushort_t t_flag;
        cred_t *savecred;
        int in_crit = 0;

        if (!rfs_write_async) {
                rfs_write_sync(wa, ns, exi, req, cr);
                return;
        }

        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
            "rfs_write_start:(%S)", "async");

        /*
         * Initialize status to RFSWRITE_INITVAL instead of 0, since a
         * value of 0 is considered OK.
         */
        ns->ns_status = RFSWRITE_INITVAL;

        nrp = &nrpsp;
        nrp->wa = wa;
        nrp->ns = ns;
        nrp->req = req;
        nrp->cr = cr;
        nrp->thread = curthread;

        ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

        /*
         * Look to see if there is already a cluster started
         * for this file.
         */
        mutex_enter(&rfs_async_write_lock);
        for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
                if (bcmp(&wa->wa_fhandle, lp->fhp,
                    sizeof (fhandle_t)) == 0)
                        break;
        }

        /*
         * If lp is non-NULL, then there is already a cluster
         * started.  We need to place ourselves in the cluster
         * list in the right place as determined by starting
         * offset.  Conflicts with non-blocking mandatory locked
         * regions will be checked when the cluster is processed.
         */
        if (lp != NULL) {
                rp = lp->list;
                trp = NULL;
                while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
                        trp = rp;
                        rp = rp->list;
                }
                nrp->list = rp;
                if (trp == NULL)
                        lp->list = nrp;
                else
                        trp->list = nrp;
                while (nrp->ns->ns_status == RFSWRITE_INITVAL)
                        cv_wait(&lp->cv, &rfs_async_write_lock);
                mutex_exit(&rfs_async_write_lock);
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "cluster child");
                return;
        }

        /*
         * No cluster started yet, start one and add ourselves
         * to the list of clusters.
         */
        nrp->list = NULL;

        nlp = &nlpsp;
        nlp->fhp = &wa->wa_fhandle;
        cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
        nlp->list = nrp;
        nlp->next = NULL;

        if (rfs_async_write_head == NULL) {
                rfs_async_write_head = nlp;
        } else {
                lp = rfs_async_write_head;
                while (lp->next != NULL)
                        lp = lp->next;
                lp->next = nlp;
        }
        mutex_exit(&rfs_async_write_lock);

        /*
         * Convert the file handle common to all of the requests
         * in this cluster to a vnode.
         */
        vp = nfs_fhtovp(&wa->wa_fhandle, exi);
        if (vp == NULL) {
                mutex_enter(&rfs_async_write_lock);
                if (rfs_async_write_head == nlp)
                        rfs_async_write_head = nlp->next;
                else {
                        lp = rfs_async_write_head;
                        while (lp->next != nlp)
                                lp = lp->next;
                        lp->next = nlp->next;
                }
                t_flag = curthread->t_flag & T_WOULDBLOCK;
                for (rp = nlp->list; rp != NULL; rp = rp->list) {
                        rp->ns->ns_status = NFSERR_STALE;
                        rp->thread->t_flag |= t_flag;
                }
                cv_broadcast(&nlp->cv);
                mutex_exit(&rfs_async_write_lock);
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "stale");
                return;
        }

        /*
         * Can only write regular files.  Attempts to write any
         * other file types fail with EISDIR.
         */
        if (vp->v_type != VREG) {
                VN_RELE(vp);
                mutex_enter(&rfs_async_write_lock);
                if (rfs_async_write_head == nlp)
                        rfs_async_write_head = nlp->next;
                else {
                        lp = rfs_async_write_head;
                        while (lp->next != nlp)
                                lp = lp->next;
                        lp->next = nlp->next;
                }
                t_flag = curthread->t_flag & T_WOULDBLOCK;
                for (rp = nlp->list; rp != NULL; rp = rp->list) {
                        rp->ns->ns_status = NFSERR_ISDIR;
                        rp->thread->t_flag |= t_flag;
                }
                cv_broadcast(&nlp->cv);
                mutex_exit(&rfs_async_write_lock);
                TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                    "rfs_write_end:(%S)", "isdir");
                return;
        }

        /*
         * Enter the critical region before calling VOP_RWLOCK, to avoid a
         * deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
        }

        /*
         * Lock the file for writing.  This operation provides
         * the delay which allows clusters to grow.
         */
        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
            "vop_wrlock_start:");
        (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
            "vop_wrlock_end");

        /*
         * Disconnect this cluster from the list of clusters.
         * The cluster that is being dealt with must be fixed
         * in size after this point, so there is no reason
         * to leave it on the list so that new requests can
         * find it.
         *
         * The algorithm is that the first write request will
         * create a cluster, convert the file handle to a
         * vnode pointer, and then lock the file for writing.
         * This request is not likely to be clustered with
         * any others.  However, the next request will create
         * a new cluster and be blocked in VOP_RWLOCK while
         * the first request is being processed.  This delay
         * will allow more requests to be clustered in this
         * second cluster.
         */
        mutex_enter(&rfs_async_write_lock);
        if (rfs_async_write_head == nlp)
                rfs_async_write_head = nlp->next;
        else {
                lp = rfs_async_write_head;
                while (lp->next != nlp)
                        lp = lp->next;
                lp->next = nlp->next;
        }
        mutex_exit(&rfs_async_write_lock);

        /*
         * Step through the list of requests in this cluster.
         * We need to check permissions to make sure that all
         * of the requests have sufficient permission to write
         * the file.  A cluster can be composed of requests
         * from different clients and different users on each
         * client.
         *
         * As a side effect, we also calculate the size of the
         * byte range that this cluster encompasses.
         */
        rp = nlp->list;
        off = rp->wa->wa_offset;
        len = (uint_t)0;
        do {
                if (rdonly(exi, rp->req)) {
                        rp->ns->ns_status = NFSERR_ROFS;
                        t_flag = curthread->t_flag & T_WOULDBLOCK;
                        rp->thread->t_flag |= t_flag;
                        continue;
                }

                va.va_mask = AT_UID|AT_MODE;
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                    "vop_getattr_start:");
                error = VOP_GETATTR(vp, &va, 0, rp->cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                    "vop_getattr_end:");
                if (!error) {
                        if (crgetuid(rp->cr) != va.va_uid) {
                                /*
                                 * This is a kludge to allow writes of files
                                 * created with read only permission.  The
                                 * owner of the file is always allowed to
                                 * write it.
                                 */
                                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
                                    "vop_access_start:");
                                error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
                                TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
                                    "vop_access_end:");
                        }
                        if (!error && MANDLOCK(vp, va.va_mode))
                                error = EACCES;
                }

                /*
                 * Check for a conflict with a nbmand-locked region.
                 */
                if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
                    rp->wa->wa_count, 0)) {
                        error = EACCES;
                }

                if (error) {
                        rp->ns->ns_status = puterrno(error);
                        t_flag = curthread->t_flag & T_WOULDBLOCK;
                        rp->thread->t_flag |= t_flag;
                        continue;
                }
                if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
                        len = rp->wa->wa_offset + rp->wa->wa_count - off;
        } while ((rp = rp->list) != NULL);

        /*
         * Step through the cluster attempting to gather as many
         * requests which are contiguous as possible.  These
         * contiguous requests are handled via one call to VOP_WRITE
         * instead of separate calls to VOP_WRITE.  We also keep
         * track of the fact that any data was written.
         */
        rp = nlp->list;
        data_written = 0;
        do {
                /*
                 * Skip any requests which are already marked as having an
                 * error.
                 */
                if (rp->ns->ns_status != RFSWRITE_INITVAL) {
                        rp = rp->list;
                        continue;
                }

                /*
                 * Count the number of iovec's which are required
                 * to handle this set of requests.  One iovec is
                 * needed for each data buffer, whether addressed
                 * by wa_data or by the b_rptr pointers in the
                 * mblk chains.
                 */
                iovcnt = 0;
                lrp = rp;
                for (;;) {
                        if (lrp->wa->wa_data)
                                iovcnt++;
                        else {
                                m = lrp->wa->wa_mblk;
                                while (m != NULL) {
                                        iovcnt++;
                                        m = m->b_cont;
                                }
                        }
                        if (lrp->list == NULL ||
                            lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
                            lrp->wa->wa_offset + lrp->wa->wa_count !=
                            lrp->list->wa->wa_offset) {
                                lrp = lrp->list;
                                break;
                        }
                        lrp = lrp->list;
                }

                if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
                        rfs_write_hits++;
#endif
                        niovp = iov;
                } else {
#ifdef DEBUG
                        rfs_write_misses++;
#endif
                        niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
                }
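                /*
                 * An mblk chain may carry more data than wa_count
                 * indicates; the rcount arithmetic below trims the
                 * final iovec so that the gathered length matches
                 * the request exactly.
                 */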
                /*
                 * Put together the scatter/gather iovecs.
                 */
                iovp = niovp;
                trp = rp;
                count = 0;
                do {
                        if (trp->wa->wa_data) {
                                iovp->iov_base = trp->wa->wa_data;
                                iovp->iov_len = trp->wa->wa_count;
                                iovp++;
                        } else {
                                m = trp->wa->wa_mblk;
                                rcount = trp->wa->wa_count;
                                while (m != NULL) {
                                        iovp->iov_base = (caddr_t)m->b_rptr;
                                        iovp->iov_len = (m->b_wptr - m->b_rptr);
                                        rcount -= iovp->iov_len;
                                        if (rcount < 0)
                                                iovp->iov_len += rcount;
                                        iovp++;
                                        if (rcount <= 0)
                                                break;
                                        m = m->b_cont;
                                }
                        }
                        count += trp->wa->wa_count;
                        trp = trp->list;
                } while (trp != lrp);

                uio.uio_iov = niovp;
                uio.uio_iovcnt = iovcnt;
                uio.uio_segflg = UIO_SYSSPACE;
                uio.uio_extflg = UIO_COPY_DEFAULT;
                uio.uio_loffset = (offset_t)rp->wa->wa_offset;
                uio.uio_resid = count;
                /*
                 * The limit is checked on the client.  We
                 * should allow any size writes here.
                 */
                uio.uio_llimit = curproc->p_fsz_ctl;
                rlimit = uio.uio_llimit - rp->wa->wa_offset;
                if (rlimit < (rlim64_t)uio.uio_resid)
                        uio.uio_resid = (uint_t)rlimit;

                /*
                 * For now we assume no append mode.
                 */
                TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
                    "vop_write_start:(%S)", "async");

                /*
                 * Check to see if the v4 side of the server has
                 * delegated this file.  If so, then we mark thread
                 * as wouldblock so the response is dropped.
                 */
                if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
                        curthread->t_flag |= T_WOULDBLOCK;
                        error = EACCES; /* just to have an error */
                        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
                            "rfs_write_end:(%S)", "delegated");
                } else {
                        /*
                         * We're changing creds because VM may fault
                         * and we need the cred of the current
                         * thread to be used if quota checking is
                         * enabled.
                         */
                        savecred = curthread->t_cred;
                        curthread->t_cred = cr;
                        error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
                        curthread->t_cred = savecred;
                        TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
                            "vop_write_end:");
                }

                if (niovp != iov)
                        kmem_free(niovp, sizeof (*niovp) * iovcnt);

                if (!error) {
                        data_written = 1;
                        /*
                         * Get attributes again so we send the latest mod
                         * time to the client side for his cache.
                         */
                        va.va_mask = AT_ALL;    /* now we want everything */
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                            "vop_getattr_start:");
                        error = VOP_GETATTR(vp, &va, 0, rp->cr);
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                            "vop_getattr_end:");
                        if (!error)
                                acl_perm(vp, exi, &va, rp->cr);
                }

                /*
                 * Fill in the status responses for each request
                 * which was just handled.  Also, copy the latest
                 * attributes in to the attribute responses if
                 * appropriate.
                 */
                t_flag = curthread->t_flag & T_WOULDBLOCK;
                do {
                        rp->thread->t_flag |= t_flag;
                        /* check for overflows */
                        if (!error) {
                                error = vattr_to_nattr(&va, &rp->ns->ns_attr);
                        }
                        rp->ns->ns_status = puterrno(error);
                        rp = rp->list;
                } while (rp != lrp);
        } while (rp != NULL);

        /*
         * If any data was written at all, then we need to flush
         * the data and metadata to stable storage.
         */
        if (data_written) {
                TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
                    "vop_putpage_start:");
                error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
                    "vop_putpage_end:");
                if (!error) {
                        TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
                            "vop_fsync_start:");
                        error = VOP_FSYNC(vp, FNODSYNC, cr);
                        TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
                            "vop_fsync_end:");
                }
        }

        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
            "vop_rwunlock_start:");
        VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
        TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
            "vop_rwunlock_end:");

        if (in_crit)
                nbl_end_crit(vp);
        VN_RELE(vp);

        t_flag = curthread->t_flag & T_WOULDBLOCK;
        mutex_enter(&rfs_async_write_lock);
        for (rp = nlp->list; rp != NULL; rp = rp->list) {
                if (rp->ns->ns_status == RFSWRITE_INITVAL) {
                        rp->ns->ns_status = puterrno(error);
                        rp->thread->t_flag |= t_flag;
                }
        }
        cv_broadcast(&nlp->cv);
        mutex_exit(&rfs_async_write_lock);

        TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
            "rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
        return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error;
        int lookuperr;
        int in_crit = 0;
        struct vattr va;
        vnode_t *vp;
        vnode_t *dvp;
        char *name = args->ca_da.da_name;
        vnode_t *tvp = NULL;
        int mode;
        int lookup_ok;
        bool_t trunc;

        TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
            "rfs_create_start:");

        /*
         * Disallow NULL paths
         */
        if (name == NULL || *name == '\0') {
                dr->dr_status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
                    "rfs_create_end:(%S)", "access");
                return;
        }

        dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
        if (dvp == NULL) {
                dr->dr_status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
                    "rfs_create_end:(%S)", "stale");
                return;
        }

        error = sattr_to_vattr(args->ca_sa, &va);
        if (error) {
                dr->dr_status = puterrno(error);
                TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
                    "rfs_create_end:(%S)", "sattr");
                return;
        }

        /*
         * Must specify the mode.
         */
        if (!(va.va_mask & AT_MODE)) {
                VN_RELE(dvp);
                dr->dr_status = NFSERR_INVAL;
                TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
                    "rfs_create_end:(%S)", "no mode");
                return;
        }

        /*
         * This is a completely gross hack to make mknod
         * work over the wire until we can whack the protocol
         */
        if ((va.va_mode & IFMT) == IFCHR) {
                if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
                        va.va_type = VFIFO;     /* xtra kludge for named pipe */
                else {
                        va.va_type = VCHR;
                        /*
                         * uncompress the received dev_t
                         * if the top half is zero indicating a request
                         * from an `older style' OS.
                         */
                        if ((va.va_size & 0xffff0000) == 0)
                                va.va_rdev = nfsv2_expdev(va.va_size);
                        else
                                va.va_rdev = (dev_t)va.va_size;
                }
                va.va_mask &= ~AT_SIZE;
        } else if ((va.va_mode & IFMT) == IFBLK) {
                va.va_type = VBLK;
                /*
                 * uncompress the received dev_t
                 * if the top half is zero indicating a request
                 * from an `older style' OS.
                 */
                if ((va.va_size & 0xffff0000) == 0)
                        va.va_rdev = nfsv2_expdev(va.va_size);
                else
                        va.va_rdev = (dev_t)va.va_size;
                va.va_mask &= ~AT_SIZE;
        } else if ((va.va_mode & IFMT) == IFSOCK) {
                va.va_type = VSOCK;
        } else
                va.va_type = VREG;
        va.va_mode &= ~IFMT;
        va.va_mask |= AT_TYPE;

        /*
         * Why was the choice made to use VWRITE as the mode to the
         * call to VOP_CREATE?  This results in a bug.  When a client
         * opens a file that already exists and is RDONLY, the second
         * open fails with an EACCES because of the mode.
         * bug ID 1054648.
         */
        lookup_ok = 0;
        mode = VWRITE;
        if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
                TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
                    "vop_lookup_start:");
                error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
                TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
                    "vop_lookup_end:");
                if (!error) {
                        struct vattr at;

                        lookup_ok = 1;
                        at.va_mask = AT_MODE;
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                            "vop_getattr_start:");
                        error = VOP_GETATTR(tvp, &at, 0, cr);
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                            "vop_getattr_end:");
                        if (!error)
                                mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
                        VN_RELE(tvp);
                        tvp = NULL;
                }
        }

        if (!lookup_ok) {
                if (rdonly(exi, req)) {
                        error = EROFS;
                } else if (va.va_type != VREG && va.va_type != VFIFO &&
                    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
                        error = EPERM;
                } else {
                        error = 0;
                }
        }

        /*
         * If file size is being modified on an already existing file
         * make sure that there are no conflicting non-blocking mandatory
         * locks in the region being manipulated.  Return EACCES if there
         * are conflicting locks.
         */
        if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
                lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);

                if (!lookuperr &&
                    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
                        VN_RELE(tvp);
                        curthread->t_flag |= T_WOULDBLOCK;
                        goto out;
                }

                if (!lookuperr && nbl_need_check(tvp)) {
                        /*
                         * The file exists.  Now check if it has any
                         * conflicting non-blocking mandatory locks
                         * in the region being changed.
                         */
                        struct vattr bva;
                        u_offset_t offset;
                        ssize_t length;

                        nbl_start_crit(tvp, RW_READER);
                        in_crit = 1;

                        bva.va_mask = AT_SIZE;
                        error = VOP_GETATTR(tvp, &bva, 0, cr);
                        if (!error) {
                                if (va.va_size < bva.va_size) {
                                        offset = va.va_size;
                                        length = bva.va_size - va.va_size;
                                } else {
                                        offset = bva.va_size;
                                        length = va.va_size - bva.va_size;
                                }
                                if (length) {
                                        if (nbl_conflict(tvp, NBL_WRITE,
                                            offset, length, 0)) {
                                                error = EACCES;
                                        }
                                }
                        }
                        if (error) {
                                nbl_end_crit(tvp);
                                VN_RELE(tvp);
                                in_crit = 0;
                        }
                } else if (tvp != NULL) {
                        VN_RELE(tvp);
                }
        }

        if (!error) {
                /*
                 * If the filesystem is shared with nosuid then remove any
                 * setuid/setgid bits on create.
                 */
                if (va.va_type == VREG &&
                    exi->exi_export.ex_flags & EX_NOSUID)
                        va.va_mode &= ~(VSUID | VSGID);

                TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
                    "vop_create_start:");
                error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
                TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
                    "vop_create_end:");

                if (!error) {

                        if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
                                trunc = TRUE;
                        else
                                trunc = FALSE;

                        if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
                                VN_RELE(tvp);
                                curthread->t_flag |= T_WOULDBLOCK;
                                goto out;
                        }
                        va.va_mask = AT_ALL;
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
                            "vop_getattr_start:");
                        error = VOP_GETATTR(vp, &va, 0, cr);
                        TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
                            "vop_getattr_end:");
                        /* check for overflows */
                        if (!error) {
                                acl_perm(vp, exi, &va, cr);
                                error = vattr_to_nattr(&va, &dr->dr_attr);
                                if (!error) {
                                        error = makefh(&dr->dr_fhandle, vp,
                                            exi);
                                }
                        }
                        /*
                         * Force modified metadata out to stable storage.
                         */
                        (void) VOP_FSYNC(vp, FNODSYNC, cr);
                        VN_RELE(vp);
                }

                if (in_crit) {
                        nbl_end_crit(tvp);
                        VN_RELE(tvp);
                }
        }

        /*
         * Force modified data and metadata out to stable storage.
         */
        (void) VOP_FSYNC(dvp, 0, cr);

out:

        VN_RELE(dvp);

        dr->dr_status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
            "rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
        return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error = 0;
        vnode_t *vp;
        vnode_t *targvp;
        int in_crit = 0;

        TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
            "rfs_remove_start:");

        /*
         * Disallow NULL paths
         */
        if (da->da_name == NULL || *da->da_name == '\0') {
                *status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
                    "rfs_remove_end:(%S)", "access");
                return;
        }

        vp = nfs_fhtovp(da->da_fhandle, exi);
        if (vp == NULL) {
                *status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
                    "rfs_remove_end:(%S)", "stale");
                return;
        }

        if (rdonly(exi, req)) {
                VN_RELE(vp);
                *status = NFSERR_ROFS;
                TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
                    "rfs_remove_end:(%S)", "rofs");
                return;
        }

        /*
         * Check for a conflict with a non-blocking mandatory share reservation.
         */
        error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
            NULL, cr);
        if (error != 0) {
                VN_RELE(vp);
                *status = puterrno(error);
                return;
        }

        /*
         * If the file is delegated to a v4 client, then initiate
         * recall and drop this request (by setting T_WOULDBLOCK).
         * The client will eventually re-transmit the request and
         * (hopefully), by then, the v4 client will have returned
         * the delegation.
         */

        if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
                VN_RELE(vp);
                VN_RELE(targvp);
                curthread->t_flag |= T_WOULDBLOCK;
                return;
        }

        if (nbl_need_check(targvp)) {
                nbl_start_crit(targvp, RW_READER);
                in_crit = 1;
                if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
                        error = EACCES;
                        goto out;
                }
        }

        TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
            "vop_remove_start:");
        error = VOP_REMOVE(vp, da->da_name, cr);
        TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
            "vop_remove_end:");

        /*
         * Force modified data and metadata out to stable storage.
         */
        (void) VOP_FSYNC(vp, 0, cr);

out:
        if (in_crit)
                nbl_end_crit(targvp);
        VN_RELE(targvp);
        VN_RELE(vp);

        *status = puterrno(error);

        TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
            "rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
        return (da->da_fhandle);
}

/*
 * rename a file
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
        struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
        int error = 0;
        vnode_t *fromvp;
        vnode_t *tovp;
        struct exportinfo *to_exi;
        fhandle_t *fh;
        vnode_t *srcvp;
        vnode_t *targvp;
        int in_crit = 0;

        TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
            "rfs_rename_start:");

        fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
        if (fromvp == NULL) {
                *status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "from stale");
                return;
        }

        fh = args->rna_to.da_fhandle;
        to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
        if (to_exi == NULL) {
                VN_RELE(fromvp);
                *status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "cross device");
                return;
        }
        exi_rele(to_exi);

        if (to_exi != exi) {
                VN_RELE(fromvp);
                *status = NFSERR_XDEV;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "cross device");
                return;
        }

        tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
        if (tovp == NULL) {
                VN_RELE(fromvp);
                *status = NFSERR_STALE;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "to stale");
                return;
        }

        if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
                VN_RELE(tovp);
                VN_RELE(fromvp);
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "not dir");
                *status = NFSERR_NOTDIR;
                return;
        }

        /*
         * Disallow NULL paths
         */
        if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
            args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
                VN_RELE(tovp);
                VN_RELE(fromvp);
                *status = NFSERR_ACCES;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "access");
                return;
        }

        if (rdonly(exi, req)) {
                VN_RELE(tovp);
                VN_RELE(fromvp);
                *status = NFSERR_ROFS;
                TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
                    "rfs_rename_end:(%S)", "rofs");
                return;
        }

        /*
         * Check for a conflict with a non-blocking mandatory share reservation.
2204 */ 2205 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0, 2206 NULL, cr); 2207 if (error != 0) { 2208 VN_RELE(tovp); 2209 VN_RELE(fromvp); 2210 *status = puterrno(error); 2211 return; 2212 } 2213 2214 /* Check for delegations on the source file */ 2215 2216 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) { 2217 VN_RELE(tovp); 2218 VN_RELE(fromvp); 2219 VN_RELE(srcvp); 2220 curthread->t_flag |= T_WOULDBLOCK; 2221 return; 2222 } 2223 2224 /* Check for delegation on the file being renamed over, if it exists */ 2225 2226 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE && 2227 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr) 2228 == 0) { 2229 2230 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { 2231 VN_RELE(tovp); 2232 VN_RELE(fromvp); 2233 VN_RELE(srcvp); 2234 VN_RELE(targvp); 2235 curthread->t_flag |= T_WOULDBLOCK; 2236 return; 2237 } 2238 VN_RELE(targvp); 2239 } 2240 2241 2242 if (nbl_need_check(srcvp)) { 2243 nbl_start_crit(srcvp, RW_READER); 2244 in_crit = 1; 2245 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) { 2246 error = EACCES; 2247 goto out; 2248 } 2249 } 2250 2251 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, 2252 "vop_rename_start:"); 2253 error = VOP_RENAME(fromvp, args->rna_from.da_name, 2254 tovp, args->rna_to.da_name, cr); 2255 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, 2256 "vop_rename_end:"); 2257 2258 if (error == 0) { 2259 char *tmp; 2260 2261 /* fix the path name for the renamed file */ 2262 mutex_enter(&srcvp->v_lock); 2263 tmp = srcvp->v_path; 2264 srcvp->v_path = NULL; 2265 mutex_exit(&srcvp->v_lock); 2266 vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name, 2267 strlen(args->rna_to.da_name)); 2268 if (tmp != NULL) 2269 kmem_free(tmp, strlen(tmp) + 1); 2270 } 2271 2272 /* 2273 * Force modified data and metadata out to stable storage. 2274 */ 2275 (void) VOP_FSYNC(tovp, 0, cr); 2276 (void) VOP_FSYNC(fromvp, 0, cr); 2277 2278 out: 2279 if (in_crit) 2280 nbl_end_crit(srcvp); 2281 VN_RELE(srcvp); 2282 VN_RELE(tovp); 2283 VN_RELE(fromvp); 2284 2285 *status = puterrno(error); 2286 2287 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, 2288 "rfs_rename_end:(%S)", "done"); 2289 } 2290 void * 2291 rfs_rename_getfh(struct nfsrnmargs *args) 2292 { 2293 return (args->rna_from.da_fhandle); 2294 } 2295 2296 /* 2297 * Link to a file. 2298 * Create a file (to) which is a hard link to the given file (from). 
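 * Both file handles must resolve within the same exported file system;
 * an attempt to link across exports fails with NFSERR_XDEV below.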
2299  */
2300 void
2301 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2302 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2303 {
2304 	int error;
2305 	vnode_t *fromvp;
2306 	vnode_t *tovp;
2307 	struct exportinfo *to_exi;
2308 	fhandle_t *fh;
2309 
2310 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2311 		"rfs_link_start:");
2312 
2313 	fromvp = nfs_fhtovp(args->la_from, exi);
2314 	if (fromvp == NULL) {
2315 		*status = NFSERR_STALE;
2316 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2317 			"rfs_link_end:(%S)", "from stale");
2318 		return;
2319 	}
2320 
2321 	fh = args->la_to.da_fhandle;
2322 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2323 	if (to_exi == NULL) {
2324 		VN_RELE(fromvp);
2325 		*status = NFSERR_ACCES;
2326 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2327 			"rfs_link_end:(%S)", "cross device");
2328 		return;
2329 	}
2330 	exi_rele(to_exi);
2331 
2332 	if (to_exi != exi) {
2333 		VN_RELE(fromvp);
2334 		*status = NFSERR_XDEV;
2335 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2336 			"rfs_link_end:(%S)", "cross device");
2337 		return;
2338 	}
2339 
2340 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341 	if (tovp == NULL) {
2342 		VN_RELE(fromvp);
2343 		*status = NFSERR_STALE;
2344 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2345 			"rfs_link_end:(%S)", "to stale");
2346 		return;
2347 	}
2348 
2349 	if (tovp->v_type != VDIR) {
2350 		VN_RELE(tovp);
2351 		VN_RELE(fromvp);
2352 		*status = NFSERR_NOTDIR;
2353 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2354 			"rfs_link_end:(%S)", "not dir");
2355 		return;
2356 	}
2357 	/*
2358 	 * Disallow NULL paths
2359 	 */
2360 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2361 		VN_RELE(tovp);
2362 		VN_RELE(fromvp);
2363 		*status = NFSERR_ACCES;
2364 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2365 			"rfs_link_end:(%S)", "access");
2366 		return;
2367 	}
2368 
2369 	if (rdonly(exi, req)) {
2370 		VN_RELE(tovp);
2371 		VN_RELE(fromvp);
2372 		*status = NFSERR_ROFS;
2373 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2374 			"rfs_link_end:(%S)", "rofs");
2375 		return;
2376 	}
2377 
2378 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2379 		"vop_link_start:");
2380 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2381 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2382 		"vop_link_end:");
2383 
2384 	/*
2385 	 * Force modified data and metadata out to stable storage.
2386 	 */
2387 	(void) VOP_FSYNC(tovp, 0, cr);
2388 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2389 
2390 	VN_RELE(tovp);
2391 	VN_RELE(fromvp);
2392 
2393 	*status = puterrno(error);
2394 
2395 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2396 		"rfs_link_end:(%S)", "done");
2397 }
2398 void *
2399 rfs_link_getfh(struct nfslinkargs *args)
2400 {
2401 	return (args->la_from);
2402 }
2403 
2404 /*
2405  * Symbolically link to a file.
2406  * Create a file (from) with the given attributes which is a symbolic link
2407  * to the given path name (tnm).
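 * Note that VOP_SYMLINK() does not hand back the newly created vnode,
 * which is why the code below must do an explicit VOP_LOOKUP() of the
 * new link before it can VOP_FSYNC() it to stable storage.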
2408 */ 2409 void 2410 rfs_symlink(struct nfsslargs *args, enum nfsstat *status, 2411 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2412 { 2413 int error; 2414 struct vattr va; 2415 vnode_t *vp; 2416 vnode_t *svp; 2417 int lerror; 2418 2419 TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, 2420 "rfs_symlink_start:"); 2421 2422 /* 2423 * Disallow NULL paths 2424 */ 2425 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') { 2426 *status = NFSERR_ACCES; 2427 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2428 "rfs_symlink_end:(%S)", "access"); 2429 return; 2430 } 2431 2432 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi); 2433 if (vp == NULL) { 2434 *status = NFSERR_STALE; 2435 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2436 "rfs_symlink_end:(%S)", "stale"); 2437 return; 2438 } 2439 2440 if (rdonly(exi, req)) { 2441 VN_RELE(vp); 2442 *status = NFSERR_ROFS; 2443 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2444 "rfs_symlink_end:(%S)", "rofs"); 2445 return; 2446 } 2447 2448 error = sattr_to_vattr(args->sla_sa, &va); 2449 if (error) { 2450 VN_RELE(vp); 2451 *status = puterrno(error); 2452 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2453 "rfs_symlink_end:(%S)", "sattr"); 2454 return; 2455 } 2456 2457 if (!(va.va_mask & AT_MODE)) { 2458 VN_RELE(vp); 2459 *status = NFSERR_INVAL; 2460 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2461 "rfs_symlink_end:(%S)", "no mode"); 2462 return; 2463 } 2464 2465 va.va_type = VLNK; 2466 va.va_mask |= AT_TYPE; 2467 2468 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, 2469 "vop_symlink_start:"); 2470 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr); 2471 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, 2472 "vop_symlink_end:"); 2473 2474 /* 2475 * Force new data and metadata out to stable storage. 2476 */ 2477 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, 2478 "vop_lookup_start:"); 2479 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 2480 0, NULL, cr); 2481 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, 2482 "vop_lookup_end:"); 2483 if (!lerror) { 2484 (void) VOP_FSYNC(svp, 0, cr); 2485 VN_RELE(svp); 2486 } 2487 2488 /* 2489 * Force modified data and metadata out to stable storage. 2490 */ 2491 (void) VOP_FSYNC(vp, 0, cr); 2492 2493 VN_RELE(vp); 2494 2495 *status = puterrno(error); 2496 2497 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2498 "rfs_symlink_end:(%S)", "done"); 2499 } 2500 void * 2501 rfs_symlink_getfh(struct nfsslargs *args) 2502 { 2503 return (args->sla_from.da_fhandle); 2504 } 2505 2506 /* 2507 * Make a directory. 2508 * Create a directory with the given name, parent directory, and attributes. 2509 * Returns a file handle and attributes for the new directory. 
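 * A note on naming: in this routine "vp" is the parent directory taken
 * from the file handle and "dvp" is the directory being created;
 * VOP_MKDIR() returns "dvp" held, so it must be VN_RELE()d once the
 * reply has been built.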
2510  */
2511 void
2512 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2513 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2514 {
2515 	int error;
2516 	struct vattr va;
2517 	vnode_t *dvp = NULL;
2518 	vnode_t *vp;
2519 	char *name = args->ca_da.da_name;
2520 
2521 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2522 		"rfs_mkdir_start:");
2523 
2524 	/*
2525 	 * Disallow NULL paths
2526 	 */
2527 	if (name == NULL || *name == '\0') {
2528 		dr->dr_status = NFSERR_ACCES;
2529 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2530 			"rfs_mkdir_end:(%S)", "access");
2531 		return;
2532 	}
2533 
2534 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2535 	if (vp == NULL) {
2536 		dr->dr_status = NFSERR_STALE;
2537 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2538 			"rfs_mkdir_end:(%S)", "stale");
2539 		return;
2540 	}
2541 
2542 	if (rdonly(exi, req)) {
2543 		VN_RELE(vp);
2544 		dr->dr_status = NFSERR_ROFS;
2545 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2546 			"rfs_mkdir_end:(%S)", "rofs");
2547 		return;
2548 	}
2549 
2550 	error = sattr_to_vattr(args->ca_sa, &va);
2551 	if (error) {
2552 		VN_RELE(vp);
2553 		dr->dr_status = puterrno(error);
2554 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2555 			"rfs_mkdir_end:(%S)", "sattr");
2556 		return;
2557 	}
2558 
2559 	if (!(va.va_mask & AT_MODE)) {
2560 		VN_RELE(vp);
2561 		dr->dr_status = NFSERR_INVAL;
2562 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2563 			"rfs_mkdir_end:(%S)", "no mode");
2564 		return;
2565 	}
2566 
2567 	va.va_type = VDIR;
2568 	va.va_mask |= AT_TYPE;
2569 
2570 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2571 		"vop_mkdir_start:");
2572 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2573 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2574 		"vop_mkdir_end:");
2575 
2576 	if (!error) {
2577 		/*
2578 		 * Attributes of the newly created directory should
2579 		 * be returned to the client.
2580 		 */
2581 		va.va_mask = AT_ALL;	/* We want everything */
2582 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2583 			"vop_getattr_start:");
2584 		error = VOP_GETATTR(dvp, &va, 0, cr);
2585 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2586 			"vop_getattr_end:");
2587 		/* check for overflows */
2588 		if (!error) {
2589 			acl_perm(dvp, exi, &va, cr);
2590 			error = vattr_to_nattr(&va, &dr->dr_attr);
2591 			if (!error) {
2592 				error = makefh(&dr->dr_fhandle, dvp, exi);
2593 			}
2594 		}
2595 		/*
2596 		 * Force new data and metadata out to stable storage.
2597 		 */
2598 		(void) VOP_FSYNC(dvp, 0, cr);
2599 		VN_RELE(dvp);
2600 	}
2601 
2602 	/*
2603 	 * Force modified data and metadata out to stable storage.
2604 	 */
2605 	(void) VOP_FSYNC(vp, 0, cr);
2606 
2607 	VN_RELE(vp);
2608 
2609 	dr->dr_status = puterrno(error);
2610 
2611 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2612 		"rfs_mkdir_end:(%S)", "done");
2613 }
2614 void *
2615 rfs_mkdir_getfh(struct nfscreatargs *args)
2616 {
2617 	return (args->ca_da.da_fhandle);
2618 }
2619 
2620 /*
2621  * Remove a directory.
2622  * Remove the given directory name from the given parent directory.
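 * Unlike rfs_remove(), no delegation check is needed here: v4
 * delegations are granted only on regular files, so a directory can
 * never be delegated.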
2623 */ 2624 void 2625 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status, 2626 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2627 { 2628 int error; 2629 vnode_t *vp; 2630 2631 TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, 2632 "rfs_rmdir_start:"); 2633 2634 /* 2635 * Disallow NULL paths 2636 */ 2637 if (da->da_name == NULL || *da->da_name == '\0') { 2638 *status = NFSERR_ACCES; 2639 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2640 "rfs_rmdir_end:(%S)", "access"); 2641 return; 2642 } 2643 2644 vp = nfs_fhtovp(da->da_fhandle, exi); 2645 if (vp == NULL) { 2646 *status = NFSERR_STALE; 2647 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2648 "rfs_rmdir_end:(%S)", "stale"); 2649 return; 2650 } 2651 2652 if (rdonly(exi, req)) { 2653 VN_RELE(vp); 2654 *status = NFSERR_ROFS; 2655 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2656 "rfs_rmdir_end:(%S)", "rofs"); 2657 return; 2658 } 2659 2660 /* 2661 * VOP_RMDIR now takes a new third argument (the current 2662 * directory of the process). That's because someone 2663 * wants to return EINVAL if one tries to remove ".". 2664 * Of course, NFS servers have no idea what their 2665 * clients' current directories are. We fake it by 2666 * supplying a vnode known to exist and illegal to 2667 * remove. 2668 */ 2669 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, 2670 "vop_rmdir_start:"); 2671 error = VOP_RMDIR(vp, da->da_name, rootdir, cr); 2672 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, 2673 "vop_rmdir_end:"); 2674 2675 /* 2676 * Force modified data and metadata out to stable storage. 2677 */ 2678 (void) VOP_FSYNC(vp, 0, cr); 2679 2680 VN_RELE(vp); 2681 2682 /* 2683 * System V defines rmdir to return EEXIST, not ENOTEMPTY, 2684 * if the directory is not empty. A System V NFS server 2685 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit 2686 * over the wire. 
2687 */ 2688 if (error == EEXIST) 2689 *status = NFSERR_NOTEMPTY; 2690 else 2691 *status = puterrno(error); 2692 2693 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2694 "rfs_rmdir_end:(%S)", "done"); 2695 } 2696 void * 2697 rfs_rmdir_getfh(struct nfsdiropargs *da) 2698 { 2699 return (da->da_fhandle); 2700 } 2701 2702 /* ARGSUSED */ 2703 void 2704 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd, 2705 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2706 { 2707 int error; 2708 int iseof; 2709 struct iovec iov; 2710 struct uio uio; 2711 vnode_t *vp; 2712 2713 TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, 2714 "rfs_readdir_start:"); 2715 2716 vp = nfs_fhtovp(&rda->rda_fh, exi); 2717 if (vp == NULL) { 2718 rd->rd_entries = NULL; 2719 rd->rd_status = NFSERR_STALE; 2720 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2721 "rfs_readdir_end:(%S)", "stale"); 2722 return; 2723 } 2724 2725 if (vp->v_type != VDIR) { 2726 VN_RELE(vp); 2727 rd->rd_entries = NULL; 2728 rd->rd_status = NFSERR_NOTDIR; 2729 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2730 "rfs_readdir_end:(%S)", "notdir"); 2731 return; 2732 } 2733 2734 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, 2735 "vop_rwlock_start:"); 2736 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 2737 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, 2738 "vop_rwlock_end:"); 2739 2740 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, 2741 "vop_access_start:"); 2742 error = VOP_ACCESS(vp, VREAD, 0, cr); 2743 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, 2744 "vop_access_end:"); 2745 if (error) { 2746 rd->rd_entries = NULL; 2747 goto bad; 2748 } 2749 2750 if (rda->rda_count == 0) { 2751 rd->rd_entries = NULL; 2752 rd->rd_size = 0; 2753 rd->rd_eof = FALSE; 2754 goto bad; 2755 } 2756 2757 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA); 2758 2759 /* 2760 * Allocate data for entries. This will be freed by rfs_rddirfree. 2761 */ 2762 rd->rd_bufsize = (uint_t)rda->rda_count; 2763 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP); 2764 2765 /* 2766 * Set up io vector to read directory data 2767 */ 2768 iov.iov_base = (caddr_t)rd->rd_entries; 2769 iov.iov_len = rda->rda_count; 2770 uio.uio_iov = &iov; 2771 uio.uio_iovcnt = 1; 2772 uio.uio_segflg = UIO_SYSSPACE; 2773 uio.uio_extflg = UIO_COPY_CACHED; 2774 uio.uio_loffset = (offset_t)rda->rda_offset; 2775 uio.uio_resid = rda->rda_count; 2776 2777 /* 2778 * read directory 2779 */ 2780 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, 2781 "vop_readdir_start:"); 2782 error = VOP_READDIR(vp, &uio, cr, &iseof); 2783 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, 2784 "vop_readdir_end:"); 2785 2786 /* 2787 * Clean up 2788 */ 2789 if (!error) { 2790 /* 2791 * set size and eof 2792 */ 2793 if (uio.uio_resid == rda->rda_count) { 2794 rd->rd_size = 0; 2795 rd->rd_eof = TRUE; 2796 } else { 2797 rd->rd_size = (uint32_t)(rda->rda_count - 2798 uio.uio_resid); 2799 rd->rd_eof = iseof ? TRUE : FALSE; 2800 } 2801 } 2802 2803 bad: 2804 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, 2805 "vop_rwunlock_start:"); 2806 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 2807 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, 2808 "vop_rwunlock_end:"); 2809 2810 #if 0 /* notyet */ 2811 /* 2812 * Don't do this. It causes local disk writes when just 2813 * reading the file and the overhead is deemed larger 2814 * than the benefit. 2815 */ 2816 /* 2817 * Force modified metadata out to stable storage. 
2818 */ 2819 (void) VOP_FSYNC(vp, FNODSYNC, cr); 2820 #endif 2821 2822 VN_RELE(vp); 2823 2824 rd->rd_status = puterrno(error); 2825 2826 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2827 "rfs_readdir_end:(%S)", "done"); 2828 } 2829 void * 2830 rfs_readdir_getfh(struct nfsrddirargs *rda) 2831 { 2832 return (&rda->rda_fh); 2833 } 2834 void 2835 rfs_rddirfree(struct nfsrddirres *rd) 2836 { 2837 if (rd->rd_entries != NULL) 2838 kmem_free(rd->rd_entries, rd->rd_bufsize); 2839 } 2840 2841 /* ARGSUSED */ 2842 void 2843 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi, 2844 struct svc_req *req, cred_t *cr) 2845 { 2846 int error; 2847 struct statvfs64 sb; 2848 vnode_t *vp; 2849 2850 TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, 2851 "rfs_statfs_start:"); 2852 2853 vp = nfs_fhtovp(fh, exi); 2854 if (vp == NULL) { 2855 fs->fs_status = NFSERR_STALE; 2856 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, 2857 "rfs_statfs_end:(%S)", "stale"); 2858 return; 2859 } 2860 2861 error = VFS_STATVFS(vp->v_vfsp, &sb); 2862 2863 if (!error) { 2864 fs->fs_tsize = nfstsize(); 2865 fs->fs_bsize = sb.f_frsize; 2866 fs->fs_blocks = sb.f_blocks; 2867 fs->fs_bfree = sb.f_bfree; 2868 fs->fs_bavail = sb.f_bavail; 2869 } 2870 2871 VN_RELE(vp); 2872 2873 fs->fs_status = puterrno(error); 2874 2875 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, 2876 "rfs_statfs_end:(%S)", "done"); 2877 } 2878 void * 2879 rfs_statfs_getfh(fhandle_t *fh) 2880 { 2881 return (fh); 2882 } 2883 2884 static int 2885 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap) 2886 { 2887 vap->va_mask = 0; 2888 2889 /* 2890 * There was a sign extension bug in some VFS based systems 2891 * which stored the mode as a short. When it would get 2892 * assigned to a u_long, no sign extension would occur. 2893 * It needed to, but this wasn't noticed because sa_mode 2894 * would then get assigned back to the short, thus ignoring 2895 * the upper 16 bits of sa_mode. 2896 * 2897 * To make this implementation work for both broken 2898 * clients and good clients, we check for both versions 2899 * of the mode. 2900 */ 2901 if (sa->sa_mode != (uint32_t)((ushort_t)-1) && 2902 sa->sa_mode != (uint32_t)-1) { 2903 vap->va_mask |= AT_MODE; 2904 vap->va_mode = sa->sa_mode; 2905 } 2906 if (sa->sa_uid != (uint32_t)-1) { 2907 vap->va_mask |= AT_UID; 2908 vap->va_uid = sa->sa_uid; 2909 } 2910 if (sa->sa_gid != (uint32_t)-1) { 2911 vap->va_mask |= AT_GID; 2912 vap->va_gid = sa->sa_gid; 2913 } 2914 if (sa->sa_size != (uint32_t)-1) { 2915 vap->va_mask |= AT_SIZE; 2916 vap->va_size = sa->sa_size; 2917 } 2918 if (sa->sa_atime.tv_sec != (int32_t)-1 && 2919 sa->sa_atime.tv_usec != (int32_t)-1) { 2920 #ifndef _LP64 2921 /* return error if time overflow */ 2922 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec)) 2923 return (EOVERFLOW); 2924 #endif 2925 vap->va_mask |= AT_ATIME; 2926 /* 2927 * nfs protocol defines times as unsigned so don't extend sign, 2928 * unless sysadmin set nfs_allow_preepoch_time. 2929 */ 2930 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec); 2931 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000); 2932 } 2933 if (sa->sa_mtime.tv_sec != (int32_t)-1 && 2934 sa->sa_mtime.tv_usec != (int32_t)-1) { 2935 #ifndef _LP64 2936 /* return error if time overflow */ 2937 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec)) 2938 return (EOVERFLOW); 2939 #endif 2940 vap->va_mask |= AT_MTIME; 2941 /* 2942 * nfs protocol defines times as unsigned so don't extend sign, 2943 * unless sysadmin set nfs_allow_preepoch_time. 
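		 * The over-the-wire time is in microseconds, so the
		 * conversion below multiplies tv_usec by 1000 to get
		 * nanoseconds, e.g. 500000 (half a second) becomes
		 * 500000000.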
2944 		 */
2945 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2946 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2947 	}
2948 	return (0);
2949 }
2950 
2951 static enum nfsftype vt_to_nf[] = {
2952 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2953 };
2954 
2955 /*
2956  * check the following fields for overflow: nodeid, size, and time.
2957  * There could be a problem when converting 64-bit LP64 fields
2958  * into 32-bit ones.  Return an error if there is an overflow.
2959  */
2960 int
2961 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2962 {
2963 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2964 	na->na_type = vt_to_nf[vap->va_type];
2965 
2966 	if (vap->va_mode == (unsigned short) -1)
2967 		na->na_mode = (uint32_t)-1;
2968 	else
2969 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2970 
2971 	if (vap->va_uid == (unsigned short)(-1))
2972 		na->na_uid = (uint32_t)(-1);
2973 	else if (vap->va_uid == UID_NOBODY)
2974 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2975 	else
2976 		na->na_uid = vap->va_uid;
2977 
2978 	if (vap->va_gid == (unsigned short)(-1))
2979 		na->na_gid = (uint32_t)-1;
2980 	else if (vap->va_gid == GID_NOBODY)
2981 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2982 	else
2983 		na->na_gid = vap->va_gid;
2984 
2985 	/*
2986 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2987 	 * vattr, but are values wider than 32 bits supported?
2988 	 */
2989 	na->na_fsid = vap->va_fsid;
2990 
2991 	na->na_nodeid = vap->va_nodeid;
2992 
2993 	/*
2994 	 * Check to make sure that the nodeid is representable over the
2995 	 * wire without losing bits.
2996 	 */
2997 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2998 		return (EFBIG);
2999 	na->na_nlink = vap->va_nlink;
3000 
3001 	/*
3002 	 * Check for big files here, instead of at the caller.  See
3003 	 * comments in cstat for large special file explanation.
3004 	 */
3005 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
3006 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
3007 			return (EFBIG);
3008 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
3009 			/* UNKNOWN_SIZE | OVERFLOW */
3010 			na->na_size = MAXOFF32_T;
3011 		} else
3012 			na->na_size = vap->va_size;
3013 	} else
3014 		na->na_size = vap->va_size;
3015 
3016 	/*
3017 	 * If the vnode times overflow the 32-bit times that NFS2
3018 	 * uses on the wire then return an error.
3019 	 */
3020 	if (!NFS_VAP_TIME_OK(vap)) {
3021 		return (EOVERFLOW);
3022 	}
3023 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
3024 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3025 
3026 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3027 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3028 
3029 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3030 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3031 
3032 	/*
3033 	 * If the dev_t will fit into 16 bits then compress
3034 	 * it, otherwise leave it alone.  See comments in
3035 	 * nfs_client.c.
3036 	 */
3037 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3038 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3039 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3040 	else
3041 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3042 
3043 	na->na_blocks = vap->va_nblocks;
3044 	na->na_blocksize = vap->va_blksize;
3045 
3046 	/*
3047 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3048 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3049 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3050 	 *
3051 	 * BUYER BEWARE:
3052 	 *  If you are porting NFS to a non-Sun server, you probably
3053 	 *  don't want to include the following block of code.  The
3054 	 *  over-the-wire special file types will be changing with the
3055 	 *  NFS Protocol Revision.
3056 	 */
3057 	if (vap->va_type == VFIFO)
3058 		NA_SETFIFO(na);
3059 	return (0);
3060 }
3061 
3062 /*
3063  * acl v2 support: returns approximate permission.
3064  *	default: returns minimal permission (more restrictive)
3065  *	aclok: returns maximal permission (less restrictive)
3066  * This routine changes the permissions that are already in *va.
3067  * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3068  * CLASS_OBJ is always the same as GROUP_OBJ entry.
3069  */
3070 static void
3071 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3072 {
3073 	vsecattr_t vsa;
3074 	int aclcnt;
3075 	aclent_t *aclentp;
3076 	mode_t mask_perm;
3077 	mode_t grp_perm;
3078 	mode_t other_perm;
3079 	mode_t other_orig;
3080 	int error;
3081 
3082 	/* don't care about the default ACL */
3083 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3084 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3085 
3086 	if (!error) {
3087 		aclcnt = vsa.vsa_aclcnt;
3088 		if (aclcnt > MIN_ACL_ENTRIES) {
3089 			/* non-trivial ACL */
3090 			aclentp = vsa.vsa_aclentp;
3091 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3092 				/* maximal permissions */
3093 				grp_perm = 0;
3094 				other_perm = 0;
3095 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3096 					switch (aclentp->a_type) {
3097 					case USER_OBJ:
3098 						break;
3099 					case USER:
3100 						grp_perm |=
3101 						    aclentp->a_perm << 3;
3102 						other_perm |= aclentp->a_perm;
3103 						break;
3104 					case GROUP_OBJ:
3105 						grp_perm |=
3106 						    aclentp->a_perm << 3;
3107 						break;
3108 					case GROUP:
3109 						other_perm |= aclentp->a_perm;
3110 						break;
3111 					case OTHER_OBJ:
3112 						other_orig = aclentp->a_perm;
3113 						break;
3114 					case CLASS_OBJ:
3115 						mask_perm = aclentp->a_perm;
3116 						break;
3117 					default:
3118 						break;
3119 					}
3120 				}
3121 				grp_perm &= mask_perm << 3;
3122 				other_perm &= mask_perm;
3123 				other_perm |= other_orig;
3124 
3125 			} else {
3126 				/* minimal permissions */
3127 				grp_perm = 070;
3128 				other_perm = 07;
3129 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3130 					switch (aclentp->a_type) {
3131 					case USER_OBJ:
3132 						break;
3133 					case USER:
3134 					case CLASS_OBJ:
3135 						grp_perm &=
3136 						    aclentp->a_perm << 3;
3137 						other_perm &=
3138 						    aclentp->a_perm;
3139 						break;
3140 					case GROUP_OBJ:
3141 						grp_perm &=
3142 						    aclentp->a_perm << 3;
3143 						break;
3144 					case GROUP:
3145 						other_perm &=
3146 						    aclentp->a_perm;
3147 						break;
3148 					case OTHER_OBJ:
3149 						other_perm &=
3150 						    aclentp->a_perm;
3151 						break;
3152 					default:
3153 						break;
3154 					}
3155 				}
3156 			}
3157 			/* copy to va */
3158 			va->va_mode &= ~077;
3159 			va->va_mode |= grp_perm | other_perm;
3160 		}
3161 		if (vsa.vsa_aclcnt)
3162 			kmem_free(vsa.vsa_aclentp,
3163 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3164 	}
3165 }
3166 
3167 void
3168 rfs_srvrinit(void)
3169 {
3170 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3171 }
3172 
3173 void
3174 rfs_srvrfini(void)
3175 {
3176 	mutex_destroy(&rfs_async_write_lock);
3177 }
3178 
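/*
 * Illustrative sketch only -- not part of the server and not compiled
 * (note the #if 0, following the "notyet" convention used earlier in
 * this file).  It mirrors the EX_ACLOK ("maximal permission") loop of
 * acl_perm() above in user space, with a hypothetical non-trivial ACL,
 * to show how -aclok can report a more permissive mode than the one
 * stored in the file's permission bits.
 */
#if 0 /* example only, not compiled */
#include <stdio.h>
#include <sys/types.h>
#include <sys/acl.h>

int
main(void)
{
	/*
	 * Hypothetical ACL: user::rwx, user:101:rw-, group::r--,
	 * mask:rw-, other:---.  Fields are { a_type, a_id, a_perm }.
	 */
	aclent_t acl[] = {
		{ USER_OBJ,	100,	7 },
		{ USER,		101,	6 },
		{ GROUP_OBJ,	10,	4 },
		{ CLASS_OBJ,	0,	6 },
		{ OTHER_OBJ,	0,	0 }
	};
	mode_t grp_perm = 0, other_perm = 0;
	mode_t mask_perm = 0, other_orig = 0;
	int i;

	for (i = 0; i < (int)(sizeof (acl) / sizeof (acl[0])); i++) {
		switch (acl[i].a_type) {
		case USER:		/* named users widen group/other */
			grp_perm |= acl[i].a_perm << 3;
			other_perm |= acl[i].a_perm;
			break;
		case GROUP_OBJ:		/* owning group widens group */
			grp_perm |= acl[i].a_perm << 3;
			break;
		case GROUP:		/* named groups widen other */
			other_perm |= acl[i].a_perm;
			break;
		case OTHER_OBJ:
			other_orig = acl[i].a_perm;
			break;
		case CLASS_OBJ:
			mask_perm = acl[i].a_perm;
			break;
		default:
			break;
		}
	}
	grp_perm &= mask_perm << 3;	/* mask applies to group ... */
	other_perm &= mask_perm;	/* ... and to the widened other */
	other_perm |= other_orig;	/* but not to other's own bits */

	/* prints 066: the named user could be the "other" requester */
	(void) printf("%03o\n", (int)(grp_perm | other_perm));
	return (0);
}
#endif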