/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

u_longlong_t nfs2_srv_caller_id;

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		    "rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
}
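
/*
 * Like the other *_getfh routines in this file, rfs_getattr_getfh simply
 * hands back the filehandle embedded in the decoded arguments; the NFS
 * dispatch code uses it to locate and authorize the export before the
 * service routine itself is called.
 */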
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;
	caller_context_t ct;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);
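
	/*
	 * CC_DONTBLOCK asks any VOP that is monitored for NFSv4
	 * delegation conflicts to fail with EAGAIN and set
	 * CC_WOULDBLOCK rather than wait for a delegation recall;
	 * the request is then dropped (T_WOULDBLOCK) and the client
	 * retransmits it later.
	 */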
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
			    "vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "delegated");
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");

	/*
	 * Trusted Extensions doesn't support NFSv2.  MOUNT
	 * will reject v2 clients.  Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			    "rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
	    "rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;
	caller_context_t ct;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "isdir");
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "delegated");
		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			    "vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			    "vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			    "vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			    "vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");

	if (error) {
		freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		    "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12
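
/*
 * MAX_IOVECS is the number of iovec entries kept on the stack in
 * rfs_write_sync(); the DEBUG counters below record how often a
 * request's mblk chain fit into that array (hits) versus needing a
 * kmem_alloc'd array (misses).
 */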
#ifdef DEBUG
static int	rfs_write_sync_hits = 0;
static int	rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			    "rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "delegated");
		return;
	}

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
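		/*
		 * mblk_to_iov() walks the request's mblk chain and fills
		 * in one iovec entry per mblk so that the whole chain
		 * can be written with a single VOP_WRITE.
		 */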
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for his cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
	else
		ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t	rfs_async_write_lock;
static int	rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int	rfs_write_hits = 0;
static int	rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}
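
	/*
	 * Write clustering: concurrent WRITE requests against the same
	 * file are queued on a list keyed by filehandle.  The first
	 * thread to service a cluster takes the vnode write lock and
	 * handles every request queued on the cluster with gathered
	 * iovecs; the other threads simply wait on the cluster's
	 * condition variable for their status to be filled in.
	 */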
	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
	    "rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&rfs_async_write_lock);
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "delegated");
		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				    "vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				    "vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "async");

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
			    "vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}
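
/*
 * NFS version 2 has no separate MKNOD procedure, so special files are
 * created through CREATE with the file type encoded in the mode bits
 * (see the IFMT definitions at the top of this file) and the device
 * number passed in the size field.
 */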
/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *realvp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(dvp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can wack the protocol
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE ?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
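	/*
	 * To work around that bug, when no size change is requested we
	 * first look the name up and, if it already exists, derive the
	 * access mode from its current mode bits so that re-creating an
	 * existing read-only file does not fail with EACCES.
	 */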
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ?
				    VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If file size is being modified on an already existing file
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0, NULL)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove any
		 * setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
		    NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
				VN_RELE(vp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 *
			 * if an underlying vp exists, pass it to VOP_FSYNC
			 */
			if (VOP_REALVP(vp, &realvp, NULL) == 0)
				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
			else
				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}
	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr, NULL);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * rename a file
 * Give a file (from) a new name (to).
 */
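/*
 * Note that an NFSv2 RENAME may not cross exports: the "to" directory
 * filehandle must resolve to the same export as the "from" directory
 * (the checkexport() comparison below), otherwise NFSERR_XDEV is
 * returned.
 */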

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
		    "rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");

	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
}
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
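
/*
 * Editorial note (not in the original source): rfs_link() below applies
 * the same export check as rfs_rename() above -- checkexport() maps the
 * target directory's file handle back to an export, and a handle from a
 * different export yields NFSERR_XDEV.  So a hypothetical request to
 * link a file under one export to a name under another export is
 * rejected, even though both handles may be valid individually.
 */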

/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "from stale");
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "to stale");
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "not dir");
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
}
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
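
/*
 * Editorial note (not in the original source): the asymmetric fsync in
 * rfs_link() above is deliberate.  The target directory (tovp) gained an
 * entry, so its data and metadata are synced in full; the source file
 * (fromvp) only had metadata changed by the new link (link count,
 * ctime), which is why it is synced with FNODSYNC, the convention this
 * file uses elsewhere to force out metadata only.
 */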

/*
 * Symbolically link to a file.
 * Create a file (from) with the given attributes which is a symbolic
 * link to the given path name (tnm).
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;
	vnode_t *svp;
	int lerror;

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
	    NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr, NULL, NULL, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr, NULL);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
}
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
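
/*
 * Editorial note (not in the original source): rfs_symlink() above and
 * rfs_mkdir() below both insist on AT_MODE and fail with NFSERR_INVAL
 * without it.  In the v2 sattr a field of all ones means "do not set",
 * so a SYMLINK or MKDIR request that leaves sa_mode at (uint32_t)-1
 * never gets AT_MODE set by sattr_to_vattr() and is treated as
 * malformed here.
 */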

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;
	vnode_t *vp;
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
}
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
}
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "notdir");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
}
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}
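
/*
 * Editorial note (not in the original source): a sketch of the READDIR
 * buffer accounting above, with illustrative numbers.  A client asking
 * for rda_count = 1048576 bytes is clamped to NFS_MAXDATA; the whole
 * (clamped) count is allocated up front and VOP_READDIR() fills in as
 * much as fits.  If uio_resid comes back untouched, no entry remained
 * at the given offset and EOF is reported with rd_size = 0.  The
 * allocation is deliberately not freed here -- rfs_rddirfree() is the
 * paired release, presumably invoked once the reply has been sent.
 */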

/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		    "rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
}
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
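
/*
 * Editorial note (not in the original source): in the v2 sattr a field
 * of all ones means "do not set".  sattr_to_vattr() below turns each
 * meaningful field into a vattr mask bit plus value.  For example, a
 * request carrying sa_mode = 0644, sa_uid = -1, sa_gid = -1,
 * sa_size = 0 and both times at -1 yields AT_MODE|AT_SIZE with
 * va_mode = 0644 and va_size = 0 (a truncation).  Times arrive in
 * microseconds and are scaled to the nanoseconds that vattr carries.
 */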

static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * The NFS protocol defines times as unsigned, so don't
		 * sign-extend unless the sysadmin has set
		 * nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short)-1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are values bigger than 32 bits supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
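
/*
 * Editorial note (not in the original source), with illustrative values:
 * a 5 GB regular file cannot be expressed in the 32-bit v2 fattr, so
 * vattr_to_nattr() above fails with EFBIG and the client sees the mapped
 * v2 error; a block or character device with an equally outsized
 * va_size is instead clamped to MAXOFF32_T, since its size is not
 * meaningful to clients.  Times survive only at microsecond
 * granularity: tv_nsec = 123456789 goes over the wire as
 * tv_usec = 123456.
 */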
The 2993 * over-the-wire special file types will be changing with the 2994 * NFS Protocol Revision. 2995 */ 2996 if (vap->va_type == VFIFO) 2997 NA_SETFIFO(na); 2998 return (0); 2999 } 3000 3001 /* 3002 * acl v2 support: returns approximate permission. 3003 * default: returns minimal permission (more restrictive) 3004 * aclok: returns maximal permission (less restrictive) 3005 * This routine changes the permissions that are alaredy in *va. 3006 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES, 3007 * CLASS_OBJ is always the same as GROUP_OBJ entry. 3008 */ 3009 static void 3010 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr) 3011 { 3012 vsecattr_t vsa; 3013 int aclcnt; 3014 aclent_t *aclentp; 3015 mode_t mask_perm; 3016 mode_t grp_perm; 3017 mode_t other_perm; 3018 mode_t other_orig; 3019 int error; 3020 3021 /* dont care default acl */ 3022 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT); 3023 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL); 3024 3025 if (!error) { 3026 aclcnt = vsa.vsa_aclcnt; 3027 if (aclcnt > MIN_ACL_ENTRIES) { 3028 /* non-trivial ACL */ 3029 aclentp = vsa.vsa_aclentp; 3030 if (exi->exi_export.ex_flags & EX_ACLOK) { 3031 /* maximal permissions */ 3032 grp_perm = 0; 3033 other_perm = 0; 3034 for (; aclcnt > 0; aclcnt--, aclentp++) { 3035 switch (aclentp->a_type) { 3036 case USER_OBJ: 3037 break; 3038 case USER: 3039 grp_perm |= 3040 aclentp->a_perm << 3; 3041 other_perm |= aclentp->a_perm; 3042 break; 3043 case GROUP_OBJ: 3044 grp_perm |= 3045 aclentp->a_perm << 3; 3046 break; 3047 case GROUP: 3048 other_perm |= aclentp->a_perm; 3049 break; 3050 case OTHER_OBJ: 3051 other_orig = aclentp->a_perm; 3052 break; 3053 case CLASS_OBJ: 3054 mask_perm = aclentp->a_perm; 3055 break; 3056 default: 3057 break; 3058 } 3059 } 3060 grp_perm &= mask_perm << 3; 3061 other_perm &= mask_perm; 3062 other_perm |= other_orig; 3063 3064 } else { 3065 /* minimal permissions */ 3066 grp_perm = 070; 3067 other_perm = 07; 3068 for (; aclcnt > 0; aclcnt--, aclentp++) { 3069 switch (aclentp->a_type) { 3070 case USER_OBJ: 3071 break; 3072 case USER: 3073 case CLASS_OBJ: 3074 grp_perm &= 3075 aclentp->a_perm << 3; 3076 other_perm &= 3077 aclentp->a_perm; 3078 break; 3079 case GROUP_OBJ: 3080 grp_perm &= 3081 aclentp->a_perm << 3; 3082 break; 3083 case GROUP: 3084 other_perm &= 3085 aclentp->a_perm; 3086 break; 3087 case OTHER_OBJ: 3088 other_perm &= 3089 aclentp->a_perm; 3090 break; 3091 default: 3092 break; 3093 } 3094 } 3095 } 3096 /* copy to va */ 3097 va->va_mode &= ~077; 3098 va->va_mode |= grp_perm | other_perm; 3099 } 3100 if (vsa.vsa_aclcnt) 3101 kmem_free(vsa.vsa_aclentp, 3102 vsa.vsa_aclcnt * sizeof (aclent_t)); 3103 } 3104 } 3105 3106 void 3107 rfs_srvrinit(void) 3108 { 3109 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL); 3110 nfs2_srv_caller_id = fs_new_caller_id(); 3111 } 3112 3113 void 3114 rfs_srvrfini(void) 3115 { 3116 mutex_destroy(&rfs_async_write_lock); 3117 } 3118