/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
		    cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		    "rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
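	/*
	 * The getattr goes through rfs4_delegated_getattr() rather than
	 * VOP_GETATTR() directly: if the v4 side of the server has
	 * delegated this file, that wrapper accounts for the delegated
	 * state, so the attributes handed back to the v2 client are
	 * not stale.
	 */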
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;
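
	/*
	 * Worked example of the overload above: a client that wants
	 * "set times to server time" sends sa_mtime.tv_usec == 1000000
	 * (an otherwise impossible microsecond value); converting
	 * microseconds to nanoseconds scales it by 1000, so it arrives
	 * here as tv_nsec == 1000000000 and the server clock from
	 * gethrestime() is substituted instead.
	 */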

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also(2), check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
			    "vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
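
/*
 * A note on the truncation path in rfs_setattr() above (a sketch of the
 * F_FREESP convention, not new behavior): with bf.l_start set to the
 * requested size and bf.l_len == 0, VOP_SPACE(F_FREESP) frees every byte
 * from l_start through end-of-file, changing the file size to exactly
 * va_size -- ftruncate() is built on the same primitive -- without
 * re-running the permission checks that VOP_SETATTR(AT_SIZE) would apply.
 */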

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");

	/*
	 * Trusted Extensions doesn't support NFSv2.  MOUNT
	 * will reject v2 clients.  Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			    "rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookup beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	}
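
	/*
	 * Illustration of the multi-component case above: a WebNFS
	 * client may send a LOOKUP against the public filehandle with
	 * da_name holding a slash-separated path such as "a/b/c";
	 * rfs_publicfh_mclookup() then walks every component (and any
	 * symbolic links) instead of treating the name as a single
	 * directory entry, and may hand back a new exportinfo if the
	 * walk crosses into another exported filesystem.
	 */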

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
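
	/*
	 * The count arithmetic above relies on standard uio semantics:
	 * VOP_READLINK() decrements uio_resid by each byte it copies,
	 * so NFS_MAXPATHLEN - uio_resid is the length of the link text.
	 * The data is not NUL-terminated; rl_count is what bounds the
	 * string when the reply is XDR-encoded.
	 */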

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
	    "rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "getattr error");
		return;
	}
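
	/*
	 * Background for the check below: NFSv2 is stateless, so the
	 * server has no record that the client legitimately opened
	 * this file before its mode was changed; all it can do
	 * per-request is compare credentials against the current
	 * attributes.  The owner bypass that follows papers over
	 * that gap.
	 */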

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			    "vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			    "vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			    "vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			    "vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}
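
	/*
	 * Why the read goes into an mblk rather than a kmem buffer:
	 * the reply path can XDR-encode the data in place (linking the
	 * mblk straight into the outgoing message) and the transport
	 * frees it after transmission, so the file data is never
	 * copied again.  RNDUP() above pads ra_count to a
	 * BYTES_PER_XDR_UNIT multiple for the same reason.
	 */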

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		    "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}
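
	/*
	 * How the "drop the reply" convention works here and below:
	 * setting T_WOULDBLOCK on the service thread tells the common
	 * dispatch code not to send any response at all.  The client's
	 * RPC times out and it retransmits; by then the NFSv4
	 * delegation recall initiated by rfs4_check_delegated() has
	 * (hopefully) completed, and the retry proceeds normally.
	 */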

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "delegated");
		return;
	}

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			    "rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "sync");
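		/*
		 * Example of the uio_resid clamp a few lines up: with a
		 * process file-size limit (p_fsz_ctl) of 1 GB, a write
		 * of 8192 bytes at offset 1 GB - 4096 yields
		 * rlimit == 4096, so uio_resid is trimmed to 4096 and
		 * the write is silently shortened at the limit; with
		 * RLIM64_INFINITY the clamp never fires.
		 */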
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for his cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
	    "rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is NFS_OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				    "vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, NULL);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				    "vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "async");

		/*
		 * Check to see if the v4 side of the server has
		 * delegated this file.  If so, then we mark thread
		 * as wouldblock so the response is dropped.
		 */
		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
			curthread->t_flag |= T_WOULDBLOCK;
			error = EACCES; /* just to have an error */
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			    "rfs_write_end:(%S)", "delegated");
		} else {
			/*
			 * We're changing creds because VM may fault
			 * and we need the cred of the current
			 * thread to be used if quota checking is
			 * enabled.
			 */
			savecred = curthread->t_cred;
			curthread->t_cred = cr;
			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
			curthread->t_cred = savecred;
			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
			    "vop_write_end:");
		}

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
			    "vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(dvp);	/* don't leak the hold taken above */
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can wack the protocol
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE ?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If file size is being modified on an already existing file
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}
		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0, NULL)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove any
		 * setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
		    NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
				VN_RELE(vp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr, NULL);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
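
/*
 * Why rfs_remove() above looks the entry up before calling
 * VOP_REMOVE(): the NFSv2 REMOVE arguments only name the directory
 * entry, but the share-reservation (nbl_*) and delegation checks both
 * need the target file's vnode, so it is resolved first and held
 * (targvp) across the remove.
 */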
2048	 */
2049	void
2050	rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2051	    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2052	{
2053		int error = 0;
2054		vnode_t *fromvp;
2055		vnode_t *tovp;
2056		struct exportinfo *to_exi;
2057		fhandle_t *fh;
2058		vnode_t *srcvp;
2059		vnode_t *targvp;
2060		int in_crit = 0;
2061
2062		TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2063
2064		fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2065		if (fromvp == NULL) {
2066			*status = NFSERR_STALE;
2067			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2068			    "rfs_rename_end:(%S)", "from stale");
2069			return;
2070		}
2071
2072		fh = args->rna_to.da_fhandle;
2073		to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2074		if (to_exi == NULL) {
2075			VN_RELE(fromvp);
2076			*status = NFSERR_ACCES;
2077			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2078			    "rfs_rename_end:(%S)", "cross device");
2079			return;
2080		}
2081		exi_rele(to_exi);
2082
2083		if (to_exi != exi) {
2084			VN_RELE(fromvp);
2085			*status = NFSERR_XDEV;
2086			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2087			    "rfs_rename_end:(%S)", "cross device");
2088			return;
2089		}
2090
2091		tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2092		if (tovp == NULL) {
2093			VN_RELE(fromvp);
2094			*status = NFSERR_STALE;
2095			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2096			    "rfs_rename_end:(%S)", "to stale");
2097			return;
2098		}
2099
2100		if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2101			VN_RELE(tovp);
2102			VN_RELE(fromvp);
2103			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2104			    "rfs_rename_end:(%S)", "not dir");
2105			*status = NFSERR_NOTDIR;
2106			return;
2107		}
2108
2109		/*
2110		 * Disallow NULL paths
2111		 */
2112		if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2113		    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2114			VN_RELE(tovp);
2115			VN_RELE(fromvp);
2116			*status = NFSERR_ACCES;
2117			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2118			    "rfs_rename_end:(%S)", "access");
2119			return;
2120		}
2121
2122		if (rdonly(exi, req)) {
2123			VN_RELE(tovp);
2124			VN_RELE(fromvp);
2125			*status = NFSERR_ROFS;
2126			TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2127			    "rfs_rename_end:(%S)", "rofs");
2128			return;
2129		}
2130
2131		/*
2132		 * Check for a conflict with a non-blocking mandatory share reservation.
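 *
 * The reservation and delegation checks operate on the file itself,
 * not on the directory handles the client sent, so the source must be
 * looked up first. A minimal sketch of the sequence that follows
 * (from_name stands in for args->rna_from.da_name):
 *
 *	VOP_LOOKUP(fromvp, from_name, &srcvp, ...);	-- find source vnode
 *	rfs4_check_delegated(FWRITE, srcvp, FALSE);	-- recall delegations
 *	nbl_need_check(srcvp) / nbl_conflict(srcvp, NBL_RENAME, ...);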
2133 */ 2134 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0, 2135 NULL, cr, NULL, NULL, NULL); 2136 if (error != 0) { 2137 VN_RELE(tovp); 2138 VN_RELE(fromvp); 2139 *status = puterrno(error); 2140 return; 2141 } 2142 2143 /* Check for delegations on the source file */ 2144 2145 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) { 2146 VN_RELE(tovp); 2147 VN_RELE(fromvp); 2148 VN_RELE(srcvp); 2149 curthread->t_flag |= T_WOULDBLOCK; 2150 return; 2151 } 2152 2153 /* Check for delegation on the file being renamed over, if it exists */ 2154 2155 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE && 2156 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr, 2157 NULL, NULL, NULL) == 0) { 2158 2159 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { 2160 VN_RELE(tovp); 2161 VN_RELE(fromvp); 2162 VN_RELE(srcvp); 2163 VN_RELE(targvp); 2164 curthread->t_flag |= T_WOULDBLOCK; 2165 return; 2166 } 2167 VN_RELE(targvp); 2168 } 2169 2170 2171 if (nbl_need_check(srcvp)) { 2172 nbl_start_crit(srcvp, RW_READER); 2173 in_crit = 1; 2174 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) { 2175 error = EACCES; 2176 goto out; 2177 } 2178 } 2179 2180 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:"); 2181 error = VOP_RENAME(fromvp, args->rna_from.da_name, 2182 tovp, args->rna_to.da_name, cr, NULL, 0); 2183 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:"); 2184 2185 if (error == 0) { 2186 char *tmp; 2187 2188 /* fix the path name for the renamed file */ 2189 mutex_enter(&srcvp->v_lock); 2190 tmp = srcvp->v_path; 2191 srcvp->v_path = NULL; 2192 mutex_exit(&srcvp->v_lock); 2193 vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name, 2194 strlen(args->rna_to.da_name)); 2195 if (tmp != NULL) 2196 kmem_free(tmp, strlen(tmp) + 1); 2197 } 2198 2199 /* 2200 * Force modified data and metadata out to stable storage. 2201 */ 2202 (void) VOP_FSYNC(tovp, 0, cr, NULL); 2203 (void) VOP_FSYNC(fromvp, 0, cr, NULL); 2204 2205 out: 2206 if (in_crit) 2207 nbl_end_crit(srcvp); 2208 VN_RELE(srcvp); 2209 VN_RELE(tovp); 2210 VN_RELE(fromvp); 2211 2212 *status = puterrno(error); 2213 2214 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done"); 2215 } 2216 void * 2217 rfs_rename_getfh(struct nfsrnmargs *args) 2218 { 2219 return (args->rna_from.da_fhandle); 2220 } 2221 2222 /* 2223 * Link to a file. 2224 * Create a file (to) which is a hard link to the given file (from). 
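 *
 * A condensed sketch of the argument shape (field names are taken
 * from how struct nfslinkargs is used below, not quoted from the
 * protocol headers):
 *
 *	struct nfslinkargs {
 *		fhandle_t *la_from;		-- existing file
 *		struct nfsdiropargs la_to;	-- target directory + new name
 *	};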
2225	 */
2226	void
2227	rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2228	    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2229	{
2230		int error;
2231		vnode_t *fromvp;
2232		vnode_t *tovp;
2233		struct exportinfo *to_exi;
2234		fhandle_t *fh;
2235
2236		TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
2237
2238		fromvp = nfs_fhtovp(args->la_from, exi);
2239		if (fromvp == NULL) {
2240			*status = NFSERR_STALE;
2241			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2242			    "rfs_link_end:(%S)", "from stale");
2243			return;
2244		}
2245
2246		fh = args->la_to.da_fhandle;
2247		to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2248		if (to_exi == NULL) {
2249			VN_RELE(fromvp);
2250			*status = NFSERR_ACCES;
2251			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2252			    "rfs_link_end:(%S)", "cross device");
2253			return;
2254		}
2255		exi_rele(to_exi);
2256
2257		if (to_exi != exi) {
2258			VN_RELE(fromvp);
2259			*status = NFSERR_XDEV;
2260			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2261			    "rfs_link_end:(%S)", "cross device");
2262			return;
2263		}
2264
2265		tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2266		if (tovp == NULL) {
2267			VN_RELE(fromvp);
2268			*status = NFSERR_STALE;
2269			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2270			    "rfs_link_end:(%S)", "to stale");
2271			return;
2272		}
2273
2274		if (tovp->v_type != VDIR) {
2275			VN_RELE(tovp);
2276			VN_RELE(fromvp);
2277			*status = NFSERR_NOTDIR;
2278			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2279			    "rfs_link_end:(%S)", "not dir");
2280			return;
2281		}
2282		/*
2283		 * Disallow NULL paths
2284		 */
2285		if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2286			VN_RELE(tovp);
2287			VN_RELE(fromvp);
2288			*status = NFSERR_ACCES;
2289			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2290			    "rfs_link_end:(%S)", "access");
2291			return;
2292		}
2293
2294		if (rdonly(exi, req)) {
2295			VN_RELE(tovp);
2296			VN_RELE(fromvp);
2297			*status = NFSERR_ROFS;
2298			TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2299			    "rfs_link_end:(%S)", "rofs");
2300			return;
2301		}
2302
2303		TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
2304		error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2305		TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
2306
2307		/*
2308		 * Force modified data and metadata out to stable storage.
2309		 */
2310		(void) VOP_FSYNC(tovp, 0, cr, NULL);
2311		(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2312
2313		VN_RELE(tovp);
2314		VN_RELE(fromvp);
2315
2316		*status = puterrno(error);
2317
2318		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
2319	}
2320	void *
2321	rfs_link_getfh(struct nfslinkargs *args)
2322	{
2323		return (args->la_from);
2324	}
2325
2326	/*
2327	 * Symbolically link to a file.
2328	 * Create a file (from) with the given attributes which is a symbolic link
2329	 * to the given path name (tnm).
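 *
 * The client must supply at least a mode in the sattr; requests
 * without AT_MODE are rejected with NFSERR_INVAL below. Once the
 * checks pass, the core of the operation reduces to (vp being the
 * parent directory vnode):
 *
 *	va.va_type = VLNK;
 *	va.va_mask |= AT_TYPE;
 *	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va,
 *	    args->sla_tnm, cr, NULL, 0);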
2330 */ 2331 void 2332 rfs_symlink(struct nfsslargs *args, enum nfsstat *status, 2333 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2334 { 2335 int error; 2336 struct vattr va; 2337 vnode_t *vp; 2338 vnode_t *svp; 2339 int lerror; 2340 2341 TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:"); 2342 2343 /* 2344 * Disallow NULL paths 2345 */ 2346 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') { 2347 *status = NFSERR_ACCES; 2348 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2349 "rfs_symlink_end:(%S)", "access"); 2350 return; 2351 } 2352 2353 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi); 2354 if (vp == NULL) { 2355 *status = NFSERR_STALE; 2356 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2357 "rfs_symlink_end:(%S)", "stale"); 2358 return; 2359 } 2360 2361 if (rdonly(exi, req)) { 2362 VN_RELE(vp); 2363 *status = NFSERR_ROFS; 2364 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2365 "rfs_symlink_end:(%S)", "rofs"); 2366 return; 2367 } 2368 2369 error = sattr_to_vattr(args->sla_sa, &va); 2370 if (error) { 2371 VN_RELE(vp); 2372 *status = puterrno(error); 2373 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2374 "rfs_symlink_end:(%S)", "sattr"); 2375 return; 2376 } 2377 2378 if (!(va.va_mask & AT_MODE)) { 2379 VN_RELE(vp); 2380 *status = NFSERR_INVAL; 2381 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2382 "rfs_symlink_end:(%S)", "no mode"); 2383 return; 2384 } 2385 2386 va.va_type = VLNK; 2387 va.va_mask |= AT_TYPE; 2388 2389 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:"); 2390 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr, 2391 NULL, 0); 2392 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:"); 2393 2394 /* 2395 * Force new data and metadata out to stable storage. 2396 */ 2397 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:"); 2398 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 2399 0, NULL, cr, NULL, NULL, NULL); 2400 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:"); 2401 if (!lerror) { 2402 (void) VOP_FSYNC(svp, 0, cr, NULL); 2403 VN_RELE(svp); 2404 } 2405 2406 /* 2407 * Force modified data and metadata out to stable storage. 2408 */ 2409 (void) VOP_FSYNC(vp, 0, cr, NULL); 2410 2411 VN_RELE(vp); 2412 2413 *status = puterrno(error); 2414 2415 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done"); 2416 } 2417 void * 2418 rfs_symlink_getfh(struct nfsslargs *args) 2419 { 2420 return (args->sla_from.da_fhandle); 2421 } 2422 2423 /* 2424 * Make a directory. 2425 * Create a directory with the given name, parent directory, and attributes. 2426 * Returns a file handle and attributes for the new directory. 
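 *
 * On success the reply carries both the new directory's attributes
 * and its file handle; a condensed sketch of how the reply is built
 * below (vp is the parent directory; dvp receives the newly created
 * directory's vnode):
 *
 *	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
 *	va.va_mask = AT_ALL;
 *	error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
 *	error = vattr_to_nattr(&va, &dr->dr_attr);	-- attributes
 *	error = makefh(&dr->dr_fhandle, dvp, exi);	-- file handle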
2427	 */
2428	void
2429	rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2430	    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2431	{
2432		int error;
2433		struct vattr va;
2434		vnode_t *dvp = NULL;
2435		vnode_t *vp;
2436		char *name = args->ca_da.da_name;
2437
2438		TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
2439
2440		/*
2441		 * Disallow NULL paths
2442		 */
2443		if (name == NULL || *name == '\0') {
2444			dr->dr_status = NFSERR_ACCES;
2445			TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2446			    "rfs_mkdir_end:(%S)", "access");
2447			return;
2448		}
2449
2450		vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2451		if (vp == NULL) {
2452			dr->dr_status = NFSERR_STALE;
2453			TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2454			    "rfs_mkdir_end:(%S)", "stale");
2455			return;
2456		}
2457
2458		if (rdonly(exi, req)) {
2459			VN_RELE(vp);
2460			dr->dr_status = NFSERR_ROFS;
2461			TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2462			    "rfs_mkdir_end:(%S)", "rofs");
2463			return;
2464		}
2465
2466		error = sattr_to_vattr(args->ca_sa, &va);
2467		if (error) {
2468			VN_RELE(vp);
2469			dr->dr_status = puterrno(error);
2470			TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2471			    "rfs_mkdir_end:(%S)", "sattr");
2472			return;
2473		}
2474
2475		if (!(va.va_mask & AT_MODE)) {
2476			VN_RELE(vp);
2477			dr->dr_status = NFSERR_INVAL;
2478			TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2479			    "rfs_mkdir_end:(%S)", "no mode");
2480			return;
2481		}
2482
2483		va.va_type = VDIR;
2484		va.va_mask |= AT_TYPE;
2485
2486		TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
2487		error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2488		TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
2489
2490		if (!error) {
2491			/*
2492			 * Attributes of the newly created directory should
2493			 * be returned to the client.
2494			 */
2495			va.va_mask = AT_ALL;	/* We want everything */
2496			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
2497			error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2498			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
2499			/* check for overflows */
2500			if (!error) {
2501				acl_perm(vp, exi, &va, cr);
2502				error = vattr_to_nattr(&va, &dr->dr_attr);
2503				if (!error) {
2504					error = makefh(&dr->dr_fhandle, dvp, exi);
2505				}
2506			}
2507			/*
2508			 * Force new data and metadata out to stable storage.
2509			 */
2510			(void) VOP_FSYNC(dvp, 0, cr, NULL);
2511			VN_RELE(dvp);
2512		}
2513
2514		/*
2515		 * Force modified data and metadata out to stable storage.
2516		 */
2517		(void) VOP_FSYNC(vp, 0, cr, NULL);
2518
2519		VN_RELE(vp);
2520
2521		dr->dr_status = puterrno(error);
2522
2523		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
2524	}
2525	void *
2526	rfs_mkdir_getfh(struct nfscreatargs *args)
2527	{
2528		return (args->ca_da.da_fhandle);
2529	}
2530
2531	/*
2532	 * Remove a directory.
2533	 * Remove the given directory name from the given parent directory.
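 *
 * A condensed sketch of the operation (the rootdir argument and the
 * EEXIST mapping are explained in the comments below):
 *
 *	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
 *	if (error == EEXIST)
 *		*status = NFSERR_NOTEMPTY;	-- System V convention
 *	else
 *		*status = puterrno(error);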
2534 */ 2535 void 2536 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status, 2537 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2538 { 2539 int error; 2540 vnode_t *vp; 2541 2542 TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:"); 2543 2544 /* 2545 * Disallow NULL paths 2546 */ 2547 if (da->da_name == NULL || *da->da_name == '\0') { 2548 *status = NFSERR_ACCES; 2549 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2550 "rfs_rmdir_end:(%S)", "access"); 2551 return; 2552 } 2553 2554 vp = nfs_fhtovp(da->da_fhandle, exi); 2555 if (vp == NULL) { 2556 *status = NFSERR_STALE; 2557 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2558 "rfs_rmdir_end:(%S)", "stale"); 2559 return; 2560 } 2561 2562 if (rdonly(exi, req)) { 2563 VN_RELE(vp); 2564 *status = NFSERR_ROFS; 2565 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2566 "rfs_rmdir_end:(%S)", "rofs"); 2567 return; 2568 } 2569 2570 /* 2571 * VOP_RMDIR now takes a new third argument (the current 2572 * directory of the process). That's because someone 2573 * wants to return EINVAL if one tries to remove ".". 2574 * Of course, NFS servers have no idea what their 2575 * clients' current directories are. We fake it by 2576 * supplying a vnode known to exist and illegal to 2577 * remove. 2578 */ 2579 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:"); 2580 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0); 2581 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:"); 2582 2583 /* 2584 * Force modified data and metadata out to stable storage. 2585 */ 2586 (void) VOP_FSYNC(vp, 0, cr, NULL); 2587 2588 VN_RELE(vp); 2589 2590 /* 2591 * System V defines rmdir to return EEXIST, not ENOTEMPTY, 2592 * if the directory is not empty. A System V NFS server 2593 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit 2594 * over the wire. 
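 *
 * (POSIX permits rmdir(2) to fail with either EEXIST or ENOTEMPTY for
 * a non-empty directory; mapping EEXIST here keeps the wire status
 * deterministic regardless of the underlying file system.)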
2595 */ 2596 if (error == EEXIST) 2597 *status = NFSERR_NOTEMPTY; 2598 else 2599 *status = puterrno(error); 2600 2601 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done"); 2602 } 2603 void * 2604 rfs_rmdir_getfh(struct nfsdiropargs *da) 2605 { 2606 return (da->da_fhandle); 2607 } 2608 2609 /* ARGSUSED */ 2610 void 2611 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd, 2612 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2613 { 2614 int error; 2615 int iseof; 2616 struct iovec iov; 2617 struct uio uio; 2618 vnode_t *vp; 2619 2620 TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:"); 2621 2622 vp = nfs_fhtovp(&rda->rda_fh, exi); 2623 if (vp == NULL) { 2624 rd->rd_entries = NULL; 2625 rd->rd_status = NFSERR_STALE; 2626 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2627 "rfs_readdir_end:(%S)", "stale"); 2628 return; 2629 } 2630 2631 if (vp->v_type != VDIR) { 2632 VN_RELE(vp); 2633 rd->rd_entries = NULL; 2634 rd->rd_status = NFSERR_NOTDIR; 2635 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2636 "rfs_readdir_end:(%S)", "notdir"); 2637 return; 2638 } 2639 2640 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:"); 2641 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 2642 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:"); 2643 2644 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:"); 2645 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL); 2646 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:"); 2647 if (error) { 2648 rd->rd_entries = NULL; 2649 goto bad; 2650 } 2651 2652 if (rda->rda_count == 0) { 2653 rd->rd_entries = NULL; 2654 rd->rd_size = 0; 2655 rd->rd_eof = FALSE; 2656 goto bad; 2657 } 2658 2659 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA); 2660 2661 /* 2662 * Allocate data for entries. This will be freed by rfs_rddirfree. 2663 */ 2664 rd->rd_bufsize = (uint_t)rda->rda_count; 2665 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP); 2666 2667 /* 2668 * Set up io vector to read directory data 2669 */ 2670 iov.iov_base = (caddr_t)rd->rd_entries; 2671 iov.iov_len = rda->rda_count; 2672 uio.uio_iov = &iov; 2673 uio.uio_iovcnt = 1; 2674 uio.uio_segflg = UIO_SYSSPACE; 2675 uio.uio_extflg = UIO_COPY_CACHED; 2676 uio.uio_loffset = (offset_t)rda->rda_offset; 2677 uio.uio_resid = rda->rda_count; 2678 2679 /* 2680 * read directory 2681 */ 2682 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:"); 2683 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0); 2684 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:"); 2685 2686 /* 2687 * Clean up 2688 */ 2689 if (!error) { 2690 /* 2691 * set size and eof 2692 */ 2693 if (uio.uio_resid == rda->rda_count) { 2694 rd->rd_size = 0; 2695 rd->rd_eof = TRUE; 2696 } else { 2697 rd->rd_size = (uint32_t)(rda->rda_count - 2698 uio.uio_resid); 2699 rd->rd_eof = iseof ? TRUE : FALSE; 2700 } 2701 } 2702 2703 bad: 2704 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:"); 2705 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 2706 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:"); 2707 2708 #if 0 /* notyet */ 2709 /* 2710 * Don't do this. It causes local disk writes when just 2711 * reading the file and the overhead is deemed larger 2712 * than the benefit. 2713 */ 2714 /* 2715 * Force modified metadata out to stable storage. 
2716 */ 2717 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL); 2718 #endif 2719 2720 VN_RELE(vp); 2721 2722 rd->rd_status = puterrno(error); 2723 2724 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done"); 2725 } 2726 void * 2727 rfs_readdir_getfh(struct nfsrddirargs *rda) 2728 { 2729 return (&rda->rda_fh); 2730 } 2731 void 2732 rfs_rddirfree(struct nfsrddirres *rd) 2733 { 2734 if (rd->rd_entries != NULL) 2735 kmem_free(rd->rd_entries, rd->rd_bufsize); 2736 } 2737 2738 /* ARGSUSED */ 2739 void 2740 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi, 2741 struct svc_req *req, cred_t *cr) 2742 { 2743 int error; 2744 struct statvfs64 sb; 2745 vnode_t *vp; 2746 2747 TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:"); 2748 2749 vp = nfs_fhtovp(fh, exi); 2750 if (vp == NULL) { 2751 fs->fs_status = NFSERR_STALE; 2752 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, 2753 "rfs_statfs_end:(%S)", "stale"); 2754 return; 2755 } 2756 2757 error = VFS_STATVFS(vp->v_vfsp, &sb); 2758 2759 if (!error) { 2760 fs->fs_tsize = nfstsize(); 2761 fs->fs_bsize = sb.f_frsize; 2762 fs->fs_blocks = sb.f_blocks; 2763 fs->fs_bfree = sb.f_bfree; 2764 fs->fs_bavail = sb.f_bavail; 2765 } 2766 2767 VN_RELE(vp); 2768 2769 fs->fs_status = puterrno(error); 2770 2771 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done"); 2772 } 2773 void * 2774 rfs_statfs_getfh(fhandle_t *fh) 2775 { 2776 return (fh); 2777 } 2778 2779 static int 2780 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap) 2781 { 2782 vap->va_mask = 0; 2783 2784 /* 2785 * There was a sign extension bug in some VFS based systems 2786 * which stored the mode as a short. When it would get 2787 * assigned to a u_long, no sign extension would occur. 2788 * It needed to, but this wasn't noticed because sa_mode 2789 * would then get assigned back to the short, thus ignoring 2790 * the upper 16 bits of sa_mode. 2791 * 2792 * To make this implementation work for both broken 2793 * clients and good clients, we check for both versions 2794 * of the mode. 2795 */ 2796 if (sa->sa_mode != (uint32_t)((ushort_t)-1) && 2797 sa->sa_mode != (uint32_t)-1) { 2798 vap->va_mask |= AT_MODE; 2799 vap->va_mode = sa->sa_mode; 2800 } 2801 if (sa->sa_uid != (uint32_t)-1) { 2802 vap->va_mask |= AT_UID; 2803 vap->va_uid = sa->sa_uid; 2804 } 2805 if (sa->sa_gid != (uint32_t)-1) { 2806 vap->va_mask |= AT_GID; 2807 vap->va_gid = sa->sa_gid; 2808 } 2809 if (sa->sa_size != (uint32_t)-1) { 2810 vap->va_mask |= AT_SIZE; 2811 vap->va_size = sa->sa_size; 2812 } 2813 if (sa->sa_atime.tv_sec != (int32_t)-1 && 2814 sa->sa_atime.tv_usec != (int32_t)-1) { 2815 #ifndef _LP64 2816 /* return error if time overflow */ 2817 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec)) 2818 return (EOVERFLOW); 2819 #endif 2820 vap->va_mask |= AT_ATIME; 2821 /* 2822 * nfs protocol defines times as unsigned so don't extend sign, 2823 * unless sysadmin set nfs_allow_preepoch_time. 2824 */ 2825 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec); 2826 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000); 2827 } 2828 if (sa->sa_mtime.tv_sec != (int32_t)-1 && 2829 sa->sa_mtime.tv_usec != (int32_t)-1) { 2830 #ifndef _LP64 2831 /* return error if time overflow */ 2832 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec)) 2833 return (EOVERFLOW); 2834 #endif 2835 vap->va_mask |= AT_MTIME; 2836 /* 2837 * nfs protocol defines times as unsigned so don't extend sign, 2838 * unless sysadmin set nfs_allow_preepoch_time. 
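 *
 * As with atime above, the wire value is carried in microseconds and
 * both halves must differ from the -1 "unset" sentinel before the
 * field is honored; the stored value is scaled up to nanoseconds:
 *
 *	vap->va_mtime.tv_nsec = sa->sa_mtime.tv_usec * 1000;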
2839		 */
2840			NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2841			vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2842		}
2843		return (0);
2844	}
2845
2846	static enum nfsftype vt_to_nf[] = {
2847		0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2848	};
2849
2850	/*
2851	 * check the following fields for overflow: nodeid, size, and time.
2852	 * There could be a problem when converting 64-bit LP64 fields
2853	 * into 32-bit ones. Return an error if there is an overflow.
2854	 */
2855	int
2856	vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2857	{
2858		ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2859		na->na_type = vt_to_nf[vap->va_type];
2860
2861		if (vap->va_mode == (unsigned short) -1)
2862			na->na_mode = (uint32_t)-1;
2863		else
2864			na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2865
2866		if (vap->va_uid == (unsigned short)(-1))
2867			na->na_uid = (uint32_t)(-1);
2868		else if (vap->va_uid == UID_NOBODY)
2869			na->na_uid = (uint32_t)NFS_UID_NOBODY;
2870		else
2871			na->na_uid = vap->va_uid;
2872
2873		if (vap->va_gid == (unsigned short)(-1))
2874			na->na_gid = (uint32_t)-1;
2875		else if (vap->va_gid == GID_NOBODY)
2876			na->na_gid = (uint32_t)NFS_GID_NOBODY;
2877		else
2878			na->na_gid = vap->va_gid;
2879
2880		/*
2881		 * Do we need to check fsid for overflow? It is 64-bit in the
2882		 * vattr, but are values bigger than 32 bits supported?
2883		 */
2884		na->na_fsid = vap->va_fsid;
2885
2886		na->na_nodeid = vap->va_nodeid;
2887
2888		/*
2889		 * Check to make sure that the nodeid is representable over the
2890		 * wire without losing bits.
2891		 */
2892		if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2893			return (EFBIG);
2894		na->na_nlink = vap->va_nlink;
2895
2896		/*
2897		 * Check for big files here, instead of at the caller. See
2898		 * comments in cstat for large special file explanation.
2899		 */
2900		if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2901			if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2902				return (EFBIG);
2903			if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2904				/* UNKNOWN_SIZE | OVERFLOW */
2905				na->na_size = MAXOFF32_T;
2906			} else
2907				na->na_size = vap->va_size;
2908		} else
2909			na->na_size = vap->va_size;
2910
2911		/*
2912		 * If the vnode times overflow the 32-bit times that NFS2
2913		 * uses on the wire then return an error.
2914		 */
2915		if (!NFS_VAP_TIME_OK(vap)) {
2916			return (EOVERFLOW);
2917		}
2918		na->na_atime.tv_sec = vap->va_atime.tv_sec;
2919		na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2920
2921		na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2922		na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2923
2924		na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2925		na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2926
2927		/*
2928		 * If the dev_t will fit into 16 bits then compress
2929		 * it, otherwise leave it alone. See comments in
2930		 * nfs_client.c.
2931		 */
2932		if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2933		    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2934			na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2935		else
2936			(void) cmpldev(&na->na_rdev, vap->va_rdev);
2937
2938		na->na_blocks = vap->va_nblocks;
2939		na->na_blocksize = vap->va_blksize;
2940
2941		/*
2942		 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2943		 * over-the-wire protocols for named-pipe vnodes. It remaps the
2944		 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2945		 *
2946		 * BUYER BEWARE:
2947		 * If you are porting the NFS to a non-Sun server, you probably
2948		 * don't want to include the following block of code.
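 *
 * (Historically the remap has sent a FIFO as a character-special file
 * with a reserved device number that peer clients convert back to
 * VFIFO; NA_SETFIFO() in nfs.h implements the encoding used below.)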
2949	 * The over-the-wire special file types will be changing with the
2950	 * NFS Protocol Revision.
2951	 */
2952		if (vap->va_type == VFIFO)
2953			NA_SETFIFO(na);
2954		return (0);
2955	}
2956
2957	/*
2958	 * ACL v2 support: returns approximate permissions.
2959	 *	default: returns minimal permissions (more restrictive)
2960	 *	aclok: returns maximal permissions (less restrictive)
2961	 * This routine changes the permissions that are already in *va.
2962	 * If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2963	 * CLASS_OBJ is always the same as the GROUP_OBJ entry.
2964	 */
2965	static void
2966	acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2967	{
2968		vsecattr_t vsa;
2969		int aclcnt;
2970		aclent_t *aclentp;
2971		mode_t mask_perm;
2972		mode_t grp_perm;
2973		mode_t other_perm;
2974		mode_t other_orig;
2975		int error;
2976
2977		/* don't care about the default ACL */
2978		vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2979		error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2980
2981		if (!error) {
2982			aclcnt = vsa.vsa_aclcnt;
2983			if (aclcnt > MIN_ACL_ENTRIES) {
2984				/* non-trivial ACL */
2985				aclentp = vsa.vsa_aclentp;
2986				if (exi->exi_export.ex_flags & EX_ACLOK) {
2987					/* maximal permissions */
2988					grp_perm = 0;
2989					other_perm = 0;
2990					for (; aclcnt > 0; aclcnt--, aclentp++) {
2991						switch (aclentp->a_type) {
2992						case USER_OBJ:
2993							break;
2994						case USER:
2995							grp_perm |=
2996							    aclentp->a_perm << 3;
2997							other_perm |= aclentp->a_perm;
2998							break;
2999						case GROUP_OBJ:
3000							grp_perm |=
3001							    aclentp->a_perm << 3;
3002							break;
3003						case GROUP:
3004							other_perm |= aclentp->a_perm;
3005							break;
3006						case OTHER_OBJ:
3007							other_orig = aclentp->a_perm;
3008							break;
3009						case CLASS_OBJ:
3010							mask_perm = aclentp->a_perm;
3011							break;
3012						default:
3013							break;
3014						}
3015					}
3016					grp_perm &= mask_perm << 3;
3017					other_perm &= mask_perm;
3018					other_perm |= other_orig;
3019
3020				} else {
3021					/* minimal permissions */
3022					grp_perm = 070;
3023					other_perm = 07;
3024					for (; aclcnt > 0; aclcnt--, aclentp++) {
3025						switch (aclentp->a_type) {
3026						case USER_OBJ:
3027							break;
3028						case USER:
3029						case CLASS_OBJ:
3030							grp_perm &=
3031							    aclentp->a_perm << 3;
3032							other_perm &=
3033							    aclentp->a_perm;
3034							break;
3035						case GROUP_OBJ:
3036							grp_perm &=
3037							    aclentp->a_perm << 3;
3038							break;
3039						case GROUP:
3040							other_perm &=
3041							    aclentp->a_perm;
3042							break;
3043						case OTHER_OBJ:
3044							other_perm &=
3045							    aclentp->a_perm;
3046							break;
3047						default:
3048							break;
3049						}
3050					}
3051				}
3052				/* copy to va */
3053				va->va_mode &= ~077;
3054				va->va_mode |= grp_perm | other_perm;
3055			}
3056			if (vsa.vsa_aclcnt)
3057				kmem_free(vsa.vsa_aclentp,
3058				    vsa.vsa_aclcnt * sizeof (aclent_t));
3059		}
3060	}
3061
3062	void
3063	rfs_srvrinit(void)
3064	{
3065		mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3066	}
3067
3068	void
3069	rfs_srvrfini(void)
3070	{
3071		mutex_destroy(&rfs_async_write_lock);
3072	}
3073