/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

u_longlong_t nfs2_srv_caller_id;

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		    "rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
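	/*
	 * (Note: rfs4_delegated_getattr() is used here rather than a
	 * bare VOP_GETATTR() so that attributes are fetched correctly
	 * even when an NFSv4 client holds a delegation on the file.)
	 */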
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;
	caller_context_t ct;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
			    "vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "delegated");
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
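/*
 * (When the public filehandle is presented, da_name may be a
 * multi-component WebNFS path rather than a single directory entry;
 * that case is handled below via rfs_publicfh_mclookup().)
 */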
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");

	/*
	 * Trusted Extensions doesn't support NFSv2.  MOUNT
	 * will reject v2 clients.  Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			    "rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookup beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
	    "rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;
	caller_context_t ct;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "isdir");
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "delegated");
		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
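	/*
	 * (NFSv2 is stateless: the server never sees the client's
	 * open(), only individual READ requests, each of which is
	 * permission-checked anew.  Without this exception the owner
	 * of a file with no read permission, e.g. mode 0200, could
	 * never read back data it legitimately has open.)
	 */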
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			    "vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			    "vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			    "vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			    "vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's round up the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");

	if (error) {
		freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		    "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling, might have
 * to also be done in rfs_write (which clusters write requests).
 */
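/*
 * (NFS version 2 writes are synchronous: the data, and any metadata
 * needed to retrieve it again, must be on stable storage before the
 * reply is sent.  That is why VOP_WRITE is called with FSYNC below.)
 */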
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			    "rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
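	/*
	 * (The ordering matters: nbl_start_crit() must be taken before
	 * VOP_RWLOCK() everywhere both are acquired; threads taking
	 * them in opposite orders could deadlock against each other.)
	 */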
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "delegated");
		return;
	}

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for his cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
	else
		ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
	    "rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
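	/*
	 * (RFSWRITE_INITVAL, (enum nfsstat)-1, serves as a sentinel
	 * meaning "not yet processed"; waiting cluster members below
	 * poll for ns_status to change in order to detect that their
	 * request has been handled.)
	 */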
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&rfs_async_write_lock);
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
		    "rfs_write_end:(%S)", "delegated");
		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				    "vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				    "vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of separate calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
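		/*
		 * (E.g., two contiguous 8K requests at offsets 0 and
		 * 8192 have been gathered above into a single uio with
		 * iovcnt == 2 and uio_resid == 16K, so one VOP_WRITE
		 * below covers both.)
		 */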
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
		    "vop_write_start:(%S)", "async");

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
			    "vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
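/*
 * (NFS version 2 has no separate MKNOD procedure; special files are
 * created through CREATE by encoding the file type in the mode bits,
 * as handled in the IFCHR/IFBLK/IFSOCK cases below.)
 */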
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
		    "rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can whack the protocol
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
			 */
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE ?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ?
				    VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If file size is being modified on an already existing file
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0, NULL)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid, then remove
		 * any setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
		    NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
				VN_RELE(vp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
			    "vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
			    "vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr, NULL);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
		    "rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
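/*
 * (Both the source and target directories must lie within the same
 * exported filesystem; if the target filehandle resolves to a
 * different export, the request fails with NFSERR_XDEV below.)
 */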
2096 */
2097 void
2098 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2099 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2100 {
2101 int error = 0;
2102 vnode_t *fromvp;
2103 vnode_t *tovp;
2104 struct exportinfo *to_exi;
2105 fhandle_t *fh;
2106 vnode_t *srcvp;
2107 vnode_t *targvp;
2108 int in_crit = 0;
2109 
2110 TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2111 
2112 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2113 if (fromvp == NULL) {
2114 *status = NFSERR_STALE;
2115 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2116 "rfs_rename_end:(%S)", "from stale");
2117 return;
2118 }
2119 
2120 fh = args->rna_to.da_fhandle;
2121 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2122 if (to_exi == NULL) {
2123 VN_RELE(fromvp);
2124 *status = NFSERR_ACCES;
2125 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2126 "rfs_rename_end:(%S)", "cross device");
2127 return;
2128 }
2129 exi_rele(to_exi);
2130 
2131 if (to_exi != exi) {
2132 VN_RELE(fromvp);
2133 *status = NFSERR_XDEV;
2134 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2135 "rfs_rename_end:(%S)", "cross device");
2136 return;
2137 }
2138 
2139 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2140 if (tovp == NULL) {
2141 VN_RELE(fromvp);
2142 *status = NFSERR_STALE;
2143 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2144 "rfs_rename_end:(%S)", "to stale");
2145 return;
2146 }
2147 
2148 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2149 VN_RELE(tovp);
2150 VN_RELE(fromvp);
2151 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2152 "rfs_rename_end:(%S)", "not dir");
2153 *status = NFSERR_NOTDIR;
2154 return;
2155 }
2156 
2157 /*
2158 * Disallow NULL paths
2159 */
2160 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2161 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2162 VN_RELE(tovp);
2163 VN_RELE(fromvp);
2164 *status = NFSERR_ACCES;
2165 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2166 "rfs_rename_end:(%S)", "access");
2167 return;
2168 }
2169 
2170 if (rdonly(exi, req)) {
2171 VN_RELE(tovp);
2172 VN_RELE(fromvp);
2173 *status = NFSERR_ROFS;
2174 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2175 "rfs_rename_end:(%S)", "rofs");
2176 return;
2177 }
2178 
2179 /*
2180 * Check for a conflict with a non-blocking mandatory share reservation.
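 * The source file is looked up first so that the delegation
 * and mandatory-lock checks below can be applied to the file
 * itself rather than to its parent directory.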
2181 */ 2182 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0, 2183 NULL, cr, NULL, NULL, NULL); 2184 if (error != 0) { 2185 VN_RELE(tovp); 2186 VN_RELE(fromvp); 2187 *status = puterrno(error); 2188 return; 2189 } 2190 2191 /* Check for delegations on the source file */ 2192 2193 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) { 2194 VN_RELE(tovp); 2195 VN_RELE(fromvp); 2196 VN_RELE(srcvp); 2197 curthread->t_flag |= T_WOULDBLOCK; 2198 return; 2199 } 2200 2201 /* Check for delegation on the file being renamed over, if it exists */ 2202 2203 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE && 2204 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr, 2205 NULL, NULL, NULL) == 0) { 2206 2207 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) { 2208 VN_RELE(tovp); 2209 VN_RELE(fromvp); 2210 VN_RELE(srcvp); 2211 VN_RELE(targvp); 2212 curthread->t_flag |= T_WOULDBLOCK; 2213 return; 2214 } 2215 VN_RELE(targvp); 2216 } 2217 2218 2219 if (nbl_need_check(srcvp)) { 2220 nbl_start_crit(srcvp, RW_READER); 2221 in_crit = 1; 2222 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) { 2223 error = EACCES; 2224 goto out; 2225 } 2226 } 2227 2228 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:"); 2229 error = VOP_RENAME(fromvp, args->rna_from.da_name, 2230 tovp, args->rna_to.da_name, cr, NULL, 0); 2231 TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:"); 2232 2233 if (error == 0) { 2234 char *tmp; 2235 2236 /* fix the path name for the renamed file */ 2237 mutex_enter(&srcvp->v_lock); 2238 tmp = srcvp->v_path; 2239 srcvp->v_path = NULL; 2240 mutex_exit(&srcvp->v_lock); 2241 vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name, 2242 strlen(args->rna_to.da_name)); 2243 if (tmp != NULL) 2244 kmem_free(tmp, strlen(tmp) + 1); 2245 } 2246 2247 /* 2248 * Force modified data and metadata out to stable storage. 2249 */ 2250 (void) VOP_FSYNC(tovp, 0, cr, NULL); 2251 (void) VOP_FSYNC(fromvp, 0, cr, NULL); 2252 2253 out: 2254 if (in_crit) 2255 nbl_end_crit(srcvp); 2256 VN_RELE(srcvp); 2257 VN_RELE(tovp); 2258 VN_RELE(fromvp); 2259 2260 *status = puterrno(error); 2261 2262 TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done"); 2263 } 2264 void * 2265 rfs_rename_getfh(struct nfsrnmargs *args) 2266 { 2267 return (args->rna_from.da_fhandle); 2268 } 2269 2270 /* 2271 * Link to a file. 2272 * Create a file (to) which is a hard link to the given file (from). 
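 * As with rename, the target directory handle must resolve
 * within the same export as the source file; otherwise the
 * request fails with NFSERR_XDEV.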
2273 */
2274 void
2275 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2276 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2277 {
2278 int error;
2279 vnode_t *fromvp;
2280 vnode_t *tovp;
2281 struct exportinfo *to_exi;
2282 fhandle_t *fh;
2283 
2284 TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
2285 
2286 fromvp = nfs_fhtovp(args->la_from, exi);
2287 if (fromvp == NULL) {
2288 *status = NFSERR_STALE;
2289 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2290 "rfs_link_end:(%S)", "from stale");
2291 return;
2292 }
2293 
2294 fh = args->la_to.da_fhandle;
2295 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2296 if (to_exi == NULL) {
2297 VN_RELE(fromvp);
2298 *status = NFSERR_ACCES;
2299 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2300 "rfs_link_end:(%S)", "cross device");
2301 return;
2302 }
2303 exi_rele(to_exi);
2304 
2305 if (to_exi != exi) {
2306 VN_RELE(fromvp);
2307 *status = NFSERR_XDEV;
2308 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2309 "rfs_link_end:(%S)", "cross device");
2310 return;
2311 }
2312 
2313 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2314 if (tovp == NULL) {
2315 VN_RELE(fromvp);
2316 *status = NFSERR_STALE;
2317 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2318 "rfs_link_end:(%S)", "to stale");
2319 return;
2320 }
2321 
2322 if (tovp->v_type != VDIR) {
2323 VN_RELE(tovp);
2324 VN_RELE(fromvp);
2325 *status = NFSERR_NOTDIR;
2326 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2327 "rfs_link_end:(%S)", "not dir");
2328 return;
2329 }
2330 /*
2331 * Disallow NULL paths
2332 */
2333 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2334 VN_RELE(tovp);
2335 VN_RELE(fromvp);
2336 *status = NFSERR_ACCES;
2337 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2338 "rfs_link_end:(%S)", "access");
2339 return;
2340 }
2341 
2342 if (rdonly(exi, req)) {
2343 VN_RELE(tovp);
2344 VN_RELE(fromvp);
2345 *status = NFSERR_ROFS;
2346 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2347 "rfs_link_end:(%S)", "rofs");
2348 return;
2349 }
2350 
2351 TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
2352 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2353 TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
2354 
2355 /*
2356 * Force modified data and metadata out to stable storage.
2357 */
2358 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2359 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2360 
2361 VN_RELE(tovp);
2362 VN_RELE(fromvp);
2363 
2364 *status = puterrno(error);
2365 
2366 TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
2367 }
2368 void *
2369 rfs_link_getfh(struct nfslinkargs *args)
2370 {
2371 return (args->la_from);
2372 }
2373 
2374 /*
2375 * Symbolically link to a file.
2376 * Create a file (from) with the given attributes which is a symbolic
2377 * link to the given path name (tnm).
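 * The link text is stored verbatim; the server makes no
 * attempt to interpret or resolve it.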
2378 */ 2379 void 2380 rfs_symlink(struct nfsslargs *args, enum nfsstat *status, 2381 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2382 { 2383 int error; 2384 struct vattr va; 2385 vnode_t *vp; 2386 vnode_t *svp; 2387 int lerror; 2388 2389 TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:"); 2390 2391 /* 2392 * Disallow NULL paths 2393 */ 2394 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') { 2395 *status = NFSERR_ACCES; 2396 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2397 "rfs_symlink_end:(%S)", "access"); 2398 return; 2399 } 2400 2401 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi); 2402 if (vp == NULL) { 2403 *status = NFSERR_STALE; 2404 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2405 "rfs_symlink_end:(%S)", "stale"); 2406 return; 2407 } 2408 2409 if (rdonly(exi, req)) { 2410 VN_RELE(vp); 2411 *status = NFSERR_ROFS; 2412 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2413 "rfs_symlink_end:(%S)", "rofs"); 2414 return; 2415 } 2416 2417 error = sattr_to_vattr(args->sla_sa, &va); 2418 if (error) { 2419 VN_RELE(vp); 2420 *status = puterrno(error); 2421 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2422 "rfs_symlink_end:(%S)", "sattr"); 2423 return; 2424 } 2425 2426 if (!(va.va_mask & AT_MODE)) { 2427 VN_RELE(vp); 2428 *status = NFSERR_INVAL; 2429 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, 2430 "rfs_symlink_end:(%S)", "no mode"); 2431 return; 2432 } 2433 2434 va.va_type = VLNK; 2435 va.va_mask |= AT_TYPE; 2436 2437 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:"); 2438 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr, 2439 NULL, 0); 2440 TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:"); 2441 2442 /* 2443 * Force new data and metadata out to stable storage. 2444 */ 2445 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:"); 2446 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 2447 0, NULL, cr, NULL, NULL, NULL); 2448 TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:"); 2449 if (!lerror) { 2450 (void) VOP_FSYNC(svp, 0, cr, NULL); 2451 VN_RELE(svp); 2452 } 2453 2454 /* 2455 * Force modified data and metadata out to stable storage. 2456 */ 2457 (void) VOP_FSYNC(vp, 0, cr, NULL); 2458 2459 VN_RELE(vp); 2460 2461 *status = puterrno(error); 2462 2463 TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done"); 2464 } 2465 void * 2466 rfs_symlink_getfh(struct nfsslargs *args) 2467 { 2468 return (args->sla_from.da_fhandle); 2469 } 2470 2471 /* 2472 * Make a directory. 2473 * Create a directory with the given name, parent directory, and attributes. 2474 * Returns a file handle and attributes for the new directory. 
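 * Note that in this routine "vp" is the parent directory and
 * "dvp" is the newly created directory, the reverse of the
 * naming used by rfs_create() above.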
2475 */
2476 void
2477 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2478 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2479 {
2480 int error;
2481 struct vattr va;
2482 vnode_t *dvp = NULL;
2483 vnode_t *vp;
2484 char *name = args->ca_da.da_name;
2485 
2486 TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
2487 
2488 /*
2489 * Disallow NULL paths
2490 */
2491 if (name == NULL || *name == '\0') {
2492 dr->dr_status = NFSERR_ACCES;
2493 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2494 "rfs_mkdir_end:(%S)", "access");
2495 return;
2496 }
2497 
2498 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2499 if (vp == NULL) {
2500 dr->dr_status = NFSERR_STALE;
2501 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2502 "rfs_mkdir_end:(%S)", "stale");
2503 return;
2504 }
2505 
2506 if (rdonly(exi, req)) {
2507 VN_RELE(vp);
2508 dr->dr_status = NFSERR_ROFS;
2509 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2510 "rfs_mkdir_end:(%S)", "rofs");
2511 return;
2512 }
2513 
2514 error = sattr_to_vattr(args->ca_sa, &va);
2515 if (error) {
2516 VN_RELE(vp);
2517 dr->dr_status = puterrno(error);
2518 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2519 "rfs_mkdir_end:(%S)", "sattr");
2520 return;
2521 }
2522 
2523 if (!(va.va_mask & AT_MODE)) {
2524 VN_RELE(vp);
2525 dr->dr_status = NFSERR_INVAL;
2526 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2527 "rfs_mkdir_end:(%S)", "no mode");
2528 return;
2529 }
2530 
2531 va.va_type = VDIR;
2532 va.va_mask |= AT_TYPE;
2533 
2534 TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
2535 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2536 TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
2537 
2538 if (!error) {
2539 /*
2540 * Attributes of the newly created directory should
2541 * be returned to the client.
2542 */
2543 va.va_mask = AT_ALL; /* We want everything */
2544 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
2545 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2546 TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
2547 /* check for overflows */
2548 if (!error) {
/*
* va now holds the new directory's attributes, so pass
* the new directory (dvp), not the parent, to acl_perm().
*/
2549 acl_perm(dvp, exi, &va, cr);
2550 error = vattr_to_nattr(&va, &dr->dr_attr);
2551 if (!error) {
2552 error = makefh(&dr->dr_fhandle, dvp, exi);
2553 }
2554 }
2555 /*
2556 * Force new data and metadata out to stable storage.
2557 */
2558 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2559 VN_RELE(dvp);
2560 }
2561 
2562 /*
2563 * Force modified data and metadata out to stable storage.
2564 */
2565 (void) VOP_FSYNC(vp, 0, cr, NULL);
2566 
2567 VN_RELE(vp);
2568 
2569 dr->dr_status = puterrno(error);
2570 
2571 TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
2572 }
2573 void *
2574 rfs_mkdir_getfh(struct nfscreatargs *args)
2575 {
2576 return (args->ca_da.da_fhandle);
2577 }
2578 
2579 /*
2580 * Remove a directory.
2581 * Remove the given directory name from the given parent directory.
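 * Unlike file removal, no delegation check is needed here:
 * NFSv4 delegations are granted only on regular files, so a
 * directory can never be delegated to a client.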
2582 */ 2583 void 2584 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status, 2585 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2586 { 2587 int error; 2588 vnode_t *vp; 2589 2590 TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:"); 2591 2592 /* 2593 * Disallow NULL paths 2594 */ 2595 if (da->da_name == NULL || *da->da_name == '\0') { 2596 *status = NFSERR_ACCES; 2597 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2598 "rfs_rmdir_end:(%S)", "access"); 2599 return; 2600 } 2601 2602 vp = nfs_fhtovp(da->da_fhandle, exi); 2603 if (vp == NULL) { 2604 *status = NFSERR_STALE; 2605 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2606 "rfs_rmdir_end:(%S)", "stale"); 2607 return; 2608 } 2609 2610 if (rdonly(exi, req)) { 2611 VN_RELE(vp); 2612 *status = NFSERR_ROFS; 2613 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, 2614 "rfs_rmdir_end:(%S)", "rofs"); 2615 return; 2616 } 2617 2618 /* 2619 * VOP_RMDIR now takes a new third argument (the current 2620 * directory of the process). That's because someone 2621 * wants to return EINVAL if one tries to remove ".". 2622 * Of course, NFS servers have no idea what their 2623 * clients' current directories are. We fake it by 2624 * supplying a vnode known to exist and illegal to 2625 * remove. 2626 */ 2627 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:"); 2628 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0); 2629 TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:"); 2630 2631 /* 2632 * Force modified data and metadata out to stable storage. 2633 */ 2634 (void) VOP_FSYNC(vp, 0, cr, NULL); 2635 2636 VN_RELE(vp); 2637 2638 /* 2639 * System V defines rmdir to return EEXIST, not ENOTEMPTY, 2640 * if the directory is not empty. A System V NFS server 2641 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit 2642 * over the wire. 
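 * Clients, in turn, are expected to translate NFSERR_NOTEMPTY
 * back into their local equivalent of ENOTEMPTY.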
2643 */ 2644 if (error == EEXIST) 2645 *status = NFSERR_NOTEMPTY; 2646 else 2647 *status = puterrno(error); 2648 2649 TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done"); 2650 } 2651 void * 2652 rfs_rmdir_getfh(struct nfsdiropargs *da) 2653 { 2654 return (da->da_fhandle); 2655 } 2656 2657 /* ARGSUSED */ 2658 void 2659 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd, 2660 struct exportinfo *exi, struct svc_req *req, cred_t *cr) 2661 { 2662 int error; 2663 int iseof; 2664 struct iovec iov; 2665 struct uio uio; 2666 vnode_t *vp; 2667 2668 TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:"); 2669 2670 vp = nfs_fhtovp(&rda->rda_fh, exi); 2671 if (vp == NULL) { 2672 rd->rd_entries = NULL; 2673 rd->rd_status = NFSERR_STALE; 2674 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2675 "rfs_readdir_end:(%S)", "stale"); 2676 return; 2677 } 2678 2679 if (vp->v_type != VDIR) { 2680 VN_RELE(vp); 2681 rd->rd_entries = NULL; 2682 rd->rd_status = NFSERR_NOTDIR; 2683 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, 2684 "rfs_readdir_end:(%S)", "notdir"); 2685 return; 2686 } 2687 2688 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:"); 2689 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 2690 TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:"); 2691 2692 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:"); 2693 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL); 2694 TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:"); 2695 if (error) { 2696 rd->rd_entries = NULL; 2697 goto bad; 2698 } 2699 2700 if (rda->rda_count == 0) { 2701 rd->rd_entries = NULL; 2702 rd->rd_size = 0; 2703 rd->rd_eof = FALSE; 2704 goto bad; 2705 } 2706 2707 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA); 2708 2709 /* 2710 * Allocate data for entries. This will be freed by rfs_rddirfree. 2711 */ 2712 rd->rd_bufsize = (uint_t)rda->rda_count; 2713 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP); 2714 2715 /* 2716 * Set up io vector to read directory data 2717 */ 2718 iov.iov_base = (caddr_t)rd->rd_entries; 2719 iov.iov_len = rda->rda_count; 2720 uio.uio_iov = &iov; 2721 uio.uio_iovcnt = 1; 2722 uio.uio_segflg = UIO_SYSSPACE; 2723 uio.uio_extflg = UIO_COPY_CACHED; 2724 uio.uio_loffset = (offset_t)rda->rda_offset; 2725 uio.uio_resid = rda->rda_count; 2726 2727 /* 2728 * read directory 2729 */ 2730 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:"); 2731 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0); 2732 TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:"); 2733 2734 /* 2735 * Clean up 2736 */ 2737 if (!error) { 2738 /* 2739 * set size and eof 2740 */ 2741 if (uio.uio_resid == rda->rda_count) { 2742 rd->rd_size = 0; 2743 rd->rd_eof = TRUE; 2744 } else { 2745 rd->rd_size = (uint32_t)(rda->rda_count - 2746 uio.uio_resid); 2747 rd->rd_eof = iseof ? TRUE : FALSE; 2748 } 2749 } 2750 2751 bad: 2752 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:"); 2753 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 2754 TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:"); 2755 2756 #if 0 /* notyet */ 2757 /* 2758 * Don't do this. It causes local disk writes when just 2759 * reading the file and the overhead is deemed larger 2760 * than the benefit. 2761 */ 2762 /* 2763 * Force modified metadata out to stable storage. 
2764 */ 2765 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL); 2766 #endif 2767 2768 VN_RELE(vp); 2769 2770 rd->rd_status = puterrno(error); 2771 2772 TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done"); 2773 } 2774 void * 2775 rfs_readdir_getfh(struct nfsrddirargs *rda) 2776 { 2777 return (&rda->rda_fh); 2778 } 2779 void 2780 rfs_rddirfree(struct nfsrddirres *rd) 2781 { 2782 if (rd->rd_entries != NULL) 2783 kmem_free(rd->rd_entries, rd->rd_bufsize); 2784 } 2785 2786 /* ARGSUSED */ 2787 void 2788 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi, 2789 struct svc_req *req, cred_t *cr) 2790 { 2791 int error; 2792 struct statvfs64 sb; 2793 vnode_t *vp; 2794 2795 TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:"); 2796 2797 vp = nfs_fhtovp(fh, exi); 2798 if (vp == NULL) { 2799 fs->fs_status = NFSERR_STALE; 2800 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, 2801 "rfs_statfs_end:(%S)", "stale"); 2802 return; 2803 } 2804 2805 error = VFS_STATVFS(vp->v_vfsp, &sb); 2806 2807 if (!error) { 2808 fs->fs_tsize = nfstsize(); 2809 fs->fs_bsize = sb.f_frsize; 2810 fs->fs_blocks = sb.f_blocks; 2811 fs->fs_bfree = sb.f_bfree; 2812 fs->fs_bavail = sb.f_bavail; 2813 } 2814 2815 VN_RELE(vp); 2816 2817 fs->fs_status = puterrno(error); 2818 2819 TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done"); 2820 } 2821 void * 2822 rfs_statfs_getfh(fhandle_t *fh) 2823 { 2824 return (fh); 2825 } 2826 2827 static int 2828 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap) 2829 { 2830 vap->va_mask = 0; 2831 2832 /* 2833 * There was a sign extension bug in some VFS based systems 2834 * which stored the mode as a short. When it would get 2835 * assigned to a u_long, no sign extension would occur. 2836 * It needed to, but this wasn't noticed because sa_mode 2837 * would then get assigned back to the short, thus ignoring 2838 * the upper 16 bits of sa_mode. 2839 * 2840 * To make this implementation work for both broken 2841 * clients and good clients, we check for both versions 2842 * of the mode. 2843 */ 2844 if (sa->sa_mode != (uint32_t)((ushort_t)-1) && 2845 sa->sa_mode != (uint32_t)-1) { 2846 vap->va_mask |= AT_MODE; 2847 vap->va_mode = sa->sa_mode; 2848 } 2849 if (sa->sa_uid != (uint32_t)-1) { 2850 vap->va_mask |= AT_UID; 2851 vap->va_uid = sa->sa_uid; 2852 } 2853 if (sa->sa_gid != (uint32_t)-1) { 2854 vap->va_mask |= AT_GID; 2855 vap->va_gid = sa->sa_gid; 2856 } 2857 if (sa->sa_size != (uint32_t)-1) { 2858 vap->va_mask |= AT_SIZE; 2859 vap->va_size = sa->sa_size; 2860 } 2861 if (sa->sa_atime.tv_sec != (int32_t)-1 && 2862 sa->sa_atime.tv_usec != (int32_t)-1) { 2863 #ifndef _LP64 2864 /* return error if time overflow */ 2865 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec)) 2866 return (EOVERFLOW); 2867 #endif 2868 vap->va_mask |= AT_ATIME; 2869 /* 2870 * nfs protocol defines times as unsigned so don't extend sign, 2871 * unless sysadmin set nfs_allow_preepoch_time. 2872 */ 2873 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec); 2874 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000); 2875 } 2876 if (sa->sa_mtime.tv_sec != (int32_t)-1 && 2877 sa->sa_mtime.tv_usec != (int32_t)-1) { 2878 #ifndef _LP64 2879 /* return error if time overflow */ 2880 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec)) 2881 return (EOVERFLOW); 2882 #endif 2883 vap->va_mask |= AT_MTIME; 2884 /* 2885 * nfs protocol defines times as unsigned so don't extend sign, 2886 * unless sysadmin set nfs_allow_preepoch_time. 
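 * As with atime above, the wire format carries microseconds,
 * so the value is scaled to the nanoseconds that the vattr
 * expects.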
2887 */
2888 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2889 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2890 }
2891 return (0);
2892 }
2893 
2894 static enum nfsftype vt_to_nf[] = {
2895 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2896 };
2897 
2898 /*
2899 * check the following fields for overflow: nodeid, size, and time.
2900 * There could be a problem when converting 64-bit LP64 fields
2901 * into 32-bit ones. Return an error if there is an overflow.
2902 */
2903 int
2904 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2905 {
2906 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2907 na->na_type = vt_to_nf[vap->va_type];
2908 
2909 if (vap->va_mode == (unsigned short) -1)
2910 na->na_mode = (uint32_t)-1;
2911 else
2912 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2913 
2914 if (vap->va_uid == (unsigned short)(-1))
2915 na->na_uid = (uint32_t)(-1);
2916 else if (vap->va_uid == UID_NOBODY)
2917 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2918 else
2919 na->na_uid = vap->va_uid;
2920 
2921 if (vap->va_gid == (unsigned short)(-1))
2922 na->na_gid = (uint32_t)-1;
2923 else if (vap->va_gid == GID_NOBODY)
2924 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2925 else
2926 na->na_gid = vap->va_gid;
2927 
2928 /*
2929 * Do we need to check fsid for overflow? It is 64-bit in the
2930 * vattr, but are values wider than 32 bits actually in use?
2931 */
2932 na->na_fsid = vap->va_fsid;
2933 
2934 na->na_nodeid = vap->va_nodeid;
2935 
2936 /*
2937 * Check to make sure that the nodeid is representable over the
2938 * wire without losing bits.
2939 */
2940 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2941 return (EFBIG);
2942 na->na_nlink = vap->va_nlink;
2943 
2944 /*
2945 * Check for big files here, instead of at the caller. See
2946 * comments in cstat for large special file explanation.
2947 */
2948 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2949 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2950 return (EFBIG);
2951 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2952 /* UNKNOWN_SIZE | OVERFLOW */
2953 na->na_size = MAXOFF32_T;
2954 } else
2955 na->na_size = vap->va_size;
2956 } else
2957 na->na_size = vap->va_size;
2958 
2959 /*
2960 * If the vnode times overflow the 32-bit times that NFS2
2961 * uses on the wire then return an error.
2962 */
2963 if (!NFS_VAP_TIME_OK(vap)) {
2964 return (EOVERFLOW);
2965 }
2966 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2967 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2968 
2969 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2970 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2971 
2972 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2973 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2974 
2975 /*
2976 * If the dev_t will fit into 16 bits then compress
2977 * it, otherwise leave it alone. See comments in
2978 * nfs_client.c.
2979 */
2980 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2981 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2982 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2983 else
2984 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2985 
2986 na->na_blocks = vap->va_nblocks;
2987 na->na_blocksize = vap->va_blksize;
2988 
2989 /*
2990 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2991 * over-the-wire protocols for named-pipe vnodes. It remaps the
2992 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2993 *
2994 * BUYER BEWARE:
2995 * If you are porting NFS to a non-Sun server, you probably
2996 * don't want to include the following block of code. The
2997 * over-the-wire special file types will be changing with the
2998 * NFS Protocol Revision.
2999 */
3000 if (vap->va_type == VFIFO)
3001 NA_SETFIFO(na);
3002 return (0);
3003 }
3004 
3005 /*
3006 * ACL v2 support: returns approximate permissions.
3007 * default: returns minimal permission (more restrictive)
3008 * aclok: returns maximal permission (less restrictive)
3009 * This routine changes the permissions that are already in *va.
3010 * If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3011 * the CLASS_OBJ entry is always the same as the GROUP_OBJ entry.
3012 */
3013 static void
3014 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3015 {
3016 vsecattr_t vsa;
3017 int aclcnt;
3018 aclent_t *aclentp;
3019 mode_t mask_perm = 0; /* set by the CLASS_OBJ entry below */
3020 mode_t grp_perm;
3021 mode_t other_perm;
3022 mode_t other_orig = 0; /* set by the OTHER_OBJ entry below */
3023 int error;
3024 
3025 /* don't care about the default ACL */
3026 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3027 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3028 
3029 if (!error) {
3030 aclcnt = vsa.vsa_aclcnt;
3031 if (aclcnt > MIN_ACL_ENTRIES) {
3032 /* non-trivial ACL */
3033 aclentp = vsa.vsa_aclentp;
3034 if (exi->exi_export.ex_flags & EX_ACLOK) {
3035 /* maximal permissions */
3036 grp_perm = 0;
3037 other_perm = 0;
3038 for (; aclcnt > 0; aclcnt--, aclentp++) {
3039 switch (aclentp->a_type) {
3040 case USER_OBJ:
3041 break;
3042 case USER:
3043 grp_perm |=
3044 aclentp->a_perm << 3;
3045 other_perm |= aclentp->a_perm;
3046 break;
3047 case GROUP_OBJ:
3048 grp_perm |=
3049 aclentp->a_perm << 3;
3050 break;
3051 case GROUP:
3052 other_perm |= aclentp->a_perm;
3053 break;
3054 case OTHER_OBJ:
3055 other_orig = aclentp->a_perm;
3056 break;
3057 case CLASS_OBJ:
3058 mask_perm = aclentp->a_perm;
3059 break;
3060 default:
3061 break;
3062 }
3063 }
3064 grp_perm &= mask_perm << 3;
3065 other_perm &= mask_perm;
3066 other_perm |= other_orig;
3067 
3068 } else {
3069 /* minimal permissions */
3070 grp_perm = 070;
3071 other_perm = 07;
3072 for (; aclcnt > 0; aclcnt--, aclentp++) {
3073 switch (aclentp->a_type) {
3074 case USER_OBJ:
3075 break;
3076 case USER:
3077 case CLASS_OBJ:
3078 grp_perm &=
3079 aclentp->a_perm << 3;
3080 other_perm &=
3081 aclentp->a_perm;
3082 break;
3083 case GROUP_OBJ:
3084 grp_perm &=
3085 aclentp->a_perm << 3;
3086 break;
3087 case GROUP:
3088 other_perm &=
3089 aclentp->a_perm;
3090 break;
3091 case OTHER_OBJ:
3092 other_perm &=
3093 aclentp->a_perm;
3094 break;
3095 default:
3096 break;
3097 }
3098 }
3099 }
3100 /* copy to va */
3101 va->va_mode &= ~077;
3102 va->va_mode |= grp_perm | other_perm;
3103 }
3104 if (vsa.vsa_aclcnt)
3105 kmem_free(vsa.vsa_aclentp,
3106 vsa.vsa_aclcnt * sizeof (aclent_t));
3107 }
3108 }
3109 
3110 void
3111 rfs_srvrinit(void)
3112 {
3113 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3114 nfs2_srv_caller_id = fs_new_caller_id();
3115 }
3116 
3117 void
3118 rfs_srvrfini(void)
3119 {
3120 mutex_destroy(&rfs_async_write_lock);
3121 }
3122