/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/statvfs.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/dirent.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/svc.h>

#include <nfs/nfs.h>
#include <nfs/export.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>

#include <sys/strsubr.h>

/*
 * These are the interface routines for the server side of the
 * Network File System.  See the NFS version 2 protocol specification
 * for a description of this interface.
 */

static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
			"rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.
	 */
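	/*
	 * Note (assumption, not in the original): rfs4_delegated_getattr()
	 * is used below instead of a bare VOP_GETATTR() so that the
	 * attributes returned reflect any state held by an NFSv4 client
	 * with a delegation on this file (e.g. a file size the delegated
	 * client has extended but not yet written back).
	 */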
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}

/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			"rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files; let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also, the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also(2), check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
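	/*
	 * Illustrative note (assumption, not from the original source):
	 * with l_len == 0 the flock64 passed to VOP_SPACE below describes
	 * "from l_start to end of file", so F_FREESP at l_start = va_size
	 * covers both directions of a size change:
	 *
	 *	old size > va_size	-> blocks past va_size are freed
	 *	old size < va_size	-> the file is extended to va_size
	 */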
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
				"rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
				error = EACCES;
			}
		}

		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
				"vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
					(offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}

/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
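/*
 * Illustrative example (assumption, not in the original): with the
 * public filehandle a WebNFS client may send an entire path as the
 * "name" argument, e.g.
 *
 *	LOOKUP(public_fh, "export/web/index.html")
 *
 * which rfs_publicfh_mclookup() below evaluates component by component,
 * following symbolic links as needed, possibly ending in a different
 * (but still exported) filesystem.
 */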
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");

	/*
	 * Trusted Extensions doesn't support NFSv2.  MOUNT
	 * will reject v2 clients.  Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
				"rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookups beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			"rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
					&sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
							sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
							exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released.  Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's the public fh, not an overloaded (0x81) fh, and the
	 * client's flavor is invalid, set the WebNFS status to
	 * WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "stale");
		return;
	}

	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
			"rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link.  UFS returns EINVAL if this is the case,
	 * so we do the mapping here.  BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		"rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
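			/*
			 * Worked example (assumption, not in the original):
			 * a binary with mode 0711 run by a non-owner is
			 * demand-paged over the wire as READ requests, so a
			 * client that may VEXEC the file must be allowed to
			 * read it even though VREAD just failed above.
			 */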
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
				"vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
				"vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
				"vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
				"vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
				"rfs_read_end:(%S)", "access error");
			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "mand lock");
		return;
	}

	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode.  So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
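	/*
	 * Note (assumption, not in the original): NFSv2 READ replies
	 * carry a full set of post-operation attributes, so the client
	 * can revalidate its cached pages from this reply without
	 * issuing a separate GETATTR.
	 */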
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			"vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			"vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			"rfs_read_end:(%S)", "read error");
		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
}

/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}

void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}

#define	MAX_IOVECS	12

#ifdef DEBUG
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling, might have
 * to also be done in rfs_write (which clusters write requests).
 */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "rofs");
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
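	/*
	 * Note (assumption, not in the original): rfs4_check_delegated()
	 * also initiates recall of any conflicting delegation.  Dropping
	 * the reply (via T_WOULDBLOCK) makes the v2 client retransmit,
	 * and by then the v4 client has normally returned the delegation.
	 */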
	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "delegated");
		return;
	}

	va.va_mask = AT_UID|AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "getattr error");
		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VWRITE, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "mand lock");
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	if (wa->wa_data) {
		iov[0].iov_base = wa->wa_data;
		iov[0].iov_len = wa->wa_count;
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
	} else {
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "iov sync");
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
		curthread->t_cred = savecred;
		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for his cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
}

struct rfs_async_write {
	struct nfswriteargs *wa;
	struct nfsattrstat *ns;
	struct svc_req *req;
	cred_t *cr;
	kthread_t *thread;
	struct rfs_async_write *list;
};

struct rfs_async_write_list {
	fhandle_t *fhp;
	kcondvar_t cv;
	struct rfs_async_write *list;
	struct rfs_async_write_list *next;
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif

/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 */
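/*
 * Illustrative sketch (assumption, not in the original): each
 * rfs_async_write_list node represents one file, keyed by filehandle,
 * and carries the requests clustered for that file sorted by offset:
 *
 *	rfs_async_write_head --> [fh A] --> [fh B] --> NULL
 *	                           |
 *	                           +--> req(off 0) --> req(off 8K) --> NULL
 */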
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;

	if (!rfs_write_async) {
		rfs_write_sync(wa, ns, exi, req, cr);
		return;
	}

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
		"rfs_write_start:(%S)", "async");

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a
	 * value of 0 is considered OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&rfs_async_write_lock);
	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &rfs_async_write_lock);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "cluster child");
		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (rfs_async_write_head == NULL) {
		rfs_async_write_head = nlp;
	} else {
		lp = rfs_async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "stale");
		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&rfs_async_write_lock);
		if (rfs_async_write_head == nlp)
			rfs_async_write_head = nlp->next;
		else {
			lp = rfs_async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&rfs_async_write_lock);
		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
			"rfs_write_end:(%S)", "isdir");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&rfs_async_write_lock);
	if (rfs_async_write_head == nlp)
		rfs_async_write_head = nlp->next;
	else {
		lp = rfs_async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&rfs_async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
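	/*
	 * Worked example (assumption, not in the original): two requests
	 * at offsets 0 and 16K, 8K each.  The list is sorted by offset,
	 * so off = 0, and len grows to 24K; the VOP_PUTPAGE() done after
	 * the writes then covers the entire dirtied range, including the
	 * hole between the two requests.
	 */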
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(exi, rp->req)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &va, 0, rp->cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
					"vop_access_start:");
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
					"vop_access_end:");
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of separate calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovecs which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
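		/*
		 * Illustrative note (assumption, not in the original):
		 * wa_count may be smaller than the total data in the mblk
		 * chain (e.g. trailing RPC padding), so the running rcount
		 * below trims the final iovec.  E.g. two 4K mblks with
		 * wa_count = 6K: iov[0].iov_len = 4K (rcount drops to 2K),
		 * iov[1].iov_len = 4K then += (2K - 4K) -> 2K, and the
		 * loop stops.
		 */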
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data) {
				iovp->iov_base = trp->wa->wa_data;
				iovp->iov_len = trp->wa->wa_count;
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
			"vop_write_start:(%S)", "async");

		/*
		 * Check to see if the v4 side of the server has
		 * delegated this file.  If so, then we mark thread
		 * as wouldblock so the response is dropped.
		 */
		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
			curthread->t_flag |= T_WOULDBLOCK;
			error = EACCES; /* just to have an error */
			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
				"rfs_write_end:(%S)", "delegated");
		} else {
			/*
			 * We're changing creds because VM may fault
			 * and we need the cred of the current
			 * thread to be used if quota checking is
			 * enabled.
			 */
			savecred = curthread->t_cred;
			curthread->t_cred = cr;
			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
			curthread->t_cred = savecred;
			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
				"vop_write_end:");
		}

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for his cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, rp->cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
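	/*
	 * Note (assumption, not in the original): NFSv2 writes must be
	 * stable before the reply is sent, so VOP_PUTPAGE() pushes the
	 * dirty pages and VOP_FSYNC(FNODSYNC) the metadata.  Doing this
	 * once per cluster, rather than once per request, is the main
	 * payoff of the clustering above.
	 */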
	if (data_written) {
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
		if (!error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
				"vop_fsync_start:");
			error = VOP_FSYNC(vp, FNODSYNC, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&rfs_async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&rfs_async_write_lock);

	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
}

void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}

/*
 * Create a file.
 * Creates a file with given attributes and returns those attributes
 * and an fhandle for the new file.
 */
void
rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int lookuperr;
	int in_crit = 0;
	struct vattr va;
	vnode_t *vp;
	vnode_t *dvp;
	char *name = args->ca_da.da_name;
	vnode_t *tvp = NULL;
	int mode;
	int lookup_ok;
	bool_t trunc;

	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "access");
		return;
	}

	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (dvp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "stale");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "sattr");
		return;
	}

	/*
	 * Must specify the mode.
	 */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
			"rfs_create_end:(%S)", "no mode");
		return;
	}

	/*
	 * This is a completely gross hack to make mknod
	 * work over the wire until we can wack the protocol
	 */
	if ((va.va_mode & IFMT) == IFCHR) {
		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
			va.va_type = VFIFO;	/* xtra kludge for named pipe */
		else {
			va.va_type = VCHR;
			/*
			 * uncompress the received dev_t
			 * if the top half is zero indicating a request
			 * from an `older style' OS.
			 */
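			/*
			 * Illustrative example (assumption, not in the
			 * original): an old-style client encodes a 16-bit
			 * dev, e.g. 0x1234 = major 0x12, minor 0x34, in the
			 * low half of sa_size; nfsv2_expdev() is assumed to
			 * expand that into a native dev_t.  A value with
			 * bits in the top half is already a native dev_t.
			 */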
			if ((va.va_size & 0xffff0000) == 0)
				va.va_rdev = nfsv2_expdev(va.va_size);
			else
				va.va_rdev = (dev_t)va.va_size;
		}
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFBLK) {
		va.va_type = VBLK;
		/*
		 * uncompress the received dev_t
		 * if the top half is zero indicating a request
		 * from an `older style' OS.
		 */
		if ((va.va_size & 0xffff0000) == 0)
			va.va_rdev = nfsv2_expdev(va.va_size);
		else
			va.va_rdev = (dev_t)va.va_size;
		va.va_mask &= ~AT_SIZE;
	} else if ((va.va_mode & IFMT) == IFSOCK) {
		va.va_type = VSOCK;
	} else
		va.va_type = VREG;
	va.va_mode &= ~IFMT;
	va.va_mask |= AT_TYPE;

	/*
	 * Why was the choice made to use VWRITE as the mode to the
	 * call to VOP_CREATE?  This results in a bug.  When a client
	 * opens a file that already exists and is RDONLY, the second
	 * open fails with an EACCES because of the mode.
	 * bug ID 1054648.
	 */
	lookup_ok = 0;
	mode = VWRITE;
	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
		if (!error) {
			struct vattr at;

			lookup_ok = 1;
			at.va_mask = AT_MODE;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(tvp, &at, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			if (!error)
				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
			VN_RELE(tvp);
			tvp = NULL;
		}
	}

	if (!lookup_ok) {
		if (rdonly(exi, req)) {
			error = EROFS;
		} else if (va.va_type != VREG && va.va_type != VFIFO &&
		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
			error = EPERM;
		} else {
			error = 0;
		}
	}

	/*
	 * If file size is being modified on an already existing file
	 * make sure that there are no conflicting non-blocking mandatory
	 * locks in the region being manipulated.  Return EACCES if there
	 * are conflicting locks.
	 */
	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);

		if (!lookuperr &&
		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
			VN_RELE(tvp);
			curthread->t_flag |= T_WOULDBLOCK;
			goto out;
		}

		if (!lookuperr && nbl_need_check(tvp)) {
			/*
			 * The file exists.  Now check if it has any
			 * conflicting non-blocking mandatory locks
			 * in the region being changed.
			 */
			struct vattr bva;
			u_offset_t offset;
			ssize_t length;

			nbl_start_crit(tvp, RW_READER);
			in_crit = 1;

			bva.va_mask = AT_SIZE;
			error = VOP_GETATTR(tvp, &bva, 0, cr);
			if (!error) {
				if (va.va_size < bva.va_size) {
					offset = va.va_size;
					length = bva.va_size - va.va_size;
				} else {
					offset = bva.va_size;
					length = va.va_size - bva.va_size;
				}
				if (length) {
					if (nbl_conflict(tvp, NBL_WRITE,
					    offset, length, 0)) {
						error = EACCES;
					}
				}
			}
			if (error) {
				nbl_end_crit(tvp);
				VN_RELE(tvp);
				in_crit = 0;
			}
		} else if (tvp != NULL) {
			VN_RELE(tvp);
		}
	}

	if (!error) {
		/*
		 * If the filesystem is shared with nosuid then remove any
		 * setuid/setgid bits on create.
		 */
		if (va.va_type == VREG &&
		    exi->exi_export.ex_flags & EX_NOSUID)
			va.va_mode &= ~(VSUID | VSGID);

		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");

		if (!error) {

			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
				trunc = TRUE;
			else
				trunc = FALSE;

			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
				VN_RELE(vp);
				curthread->t_flag |= T_WOULDBLOCK;
				goto out;
			}
			va.va_mask = AT_ALL;
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
				"vop_getattr_start:");
			error = VOP_GETATTR(vp, &va, 0, cr);
			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
				"vop_getattr_end:");
			/* check for overflows */
			if (!error) {
				acl_perm(vp, exi, &va, cr);
				error = vattr_to_nattr(&va, &dr->dr_attr);
				if (!error) {
					error = makefh(&dr->dr_fhandle, vp,
							exi);
				}
			}
			/*
			 * Force modified metadata out to stable storage.
			 */
			(void) VOP_FSYNC(vp, FNODSYNC, cr);
			VN_RELE(vp);
		}

		if (in_crit) {
			nbl_end_crit(tvp);
			VN_RELE(tvp);
		}
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(dvp, 0, cr);

out:

	VN_RELE(dvp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
}
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a file.
 * Remove named file from parent directory.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
			"rfs_remove_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0, NULL, cr);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to a v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
	error = VOP_REMOVE(vp, da->da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
}

void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/*
 * Rename a file.
 * Give a file (from) a new name (to).
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;
	vnode_t *targvp;
	int in_crit = 0;

	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "from stale");
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "to stale");
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "not dir");
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
			"rfs_rename_end:(%S)", "rofs");
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
	    == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
			error = EACCES;
			goto out;
		}
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");

	if (error == 0) {
		char *tmp;

		/* fix the path name for the renamed file */
		mutex_enter(&srcvp->v_lock);
		tmp = srcvp->v_path;
		srcvp->v_path = NULL;
		mutex_exit(&srcvp->v_lock);
		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));
		if (tmp != NULL)
			kmem_free(tmp, strlen(tmp) + 1);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, 0, cr);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
}
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}

/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
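 *
 * In terms of the wire arguments, la_from is the file handle of the
 * existing file, and la_to (a directory file handle plus component
 * name) names the new directory entry to create.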
 */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;
	vnode_t *tovp;
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "from stale");
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "to stale");
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "not dir");
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "access");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
}
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}

/*
 * Symbolically link to a file.
 * Create a file (from) with the given attributes which is a symbolic
 * link to the given path name (tnm).
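 *
 * Note that VOP_SYMLINK does not hand back the new vnode, which is
 * why the code below looks the link up again after creating it in
 * order to force it out to stable storage.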
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;
	vnode_t *svp;
	int lerror;

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
}
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}

/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
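 *
 * MKDIR reuses the CREATE argument structure (nfscreatargs); as with
 * SYMLINK, the client must supply the mode bits or the request is
 * rejected with NFSERR_INVAL below.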
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;
	vnode_t *vp;
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
}
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}

/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
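 *
 * Unlike REMOVE and RENAME above, no delegation check is made here;
 * v4 delegations are granted only on regular files, so removing a
 * directory cannot conflict with one.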
 */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "access");
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "stale");
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
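	 * (In the version 2 protocol the two are distinct status
	 * values: NFSERR_EXIST is 17, NFSERR_NOTEMPTY is 66.)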
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
}
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}

/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "stale");
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "notdir");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
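	 * (This would be the same FNODSYNC metadata sync that
	 * rfs_create() and rfs_link() above already do.)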
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
}
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
void
rfs_rddirfree(struct nfsrddirres *rd)
{
	if (rd->rd_entries != NULL)
		kmem_free(rd->rd_entries, rd->rd_bufsize);
}

/* ARGSUSED */
void
rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct statvfs64 sb;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");

	vp = nfs_fhtovp(fh, exi);
	if (vp == NULL) {
		fs->fs_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
		    "rfs_statfs_end:(%S)", "stale");
		return;
	}

	error = VFS_STATVFS(vp->v_vfsp, &sb);

	if (!error) {
		fs->fs_tsize = nfstsize();
		fs->fs_bsize = sb.f_frsize;
		fs->fs_blocks = sb.f_blocks;
		fs->fs_bfree = sb.f_bfree;
		fs->fs_bavail = sb.f_bavail;
	}

	VN_RELE(vp);

	fs->fs_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
}
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}

static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
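		 *
		 * (On 32-bit kernels time_t is only 32 bits wide, which
		 * is why the NFS2_TIME_OK check above rejects wire times
		 * outside its range with EOVERFLOW.)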
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}

static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short)-1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are values bigger than 32 bits supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}

/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has a minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * the CLASS_OBJ entry is always the same as the GROUP_OBJ entry.
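 *
 * For illustration (a hypothetical ACL), with EX_ACLOK set and entries
 *	USER_OBJ:rwx, USER:r--, GROUP_OBJ:r-x, CLASS_OBJ:rw-, OTHER_OBJ:r--
 * the maximal computation below yields group bits
 * (r--|r-x) & rw- = r-- and other bits (r-- & rw-) | r-- = r--.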
The 2942 * over-the-wire special file types will be changing with the 2943 * NFS Protocol Revision. 2944 */ 2945 if (vap->va_type == VFIFO) 2946 NA_SETFIFO(na); 2947 return (0); 2948 } 2949 2950 /* 2951 * acl v2 support: returns approximate permission. 2952 * default: returns minimal permission (more restrictive) 2953 * aclok: returns maximal permission (less restrictive) 2954 * This routine changes the permissions that are alaredy in *va. 2955 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES, 2956 * CLASS_OBJ is always the same as GROUP_OBJ entry. 2957 */ 2958 static void 2959 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr) 2960 { 2961 vsecattr_t vsa; 2962 int aclcnt; 2963 aclent_t *aclentp; 2964 mode_t mask_perm; 2965 mode_t grp_perm; 2966 mode_t other_perm; 2967 mode_t other_orig; 2968 int error; 2969 2970 /* dont care default acl */ 2971 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT); 2972 error = VOP_GETSECATTR(vp, &vsa, 0, cr); 2973 2974 if (!error) { 2975 aclcnt = vsa.vsa_aclcnt; 2976 if (aclcnt > MIN_ACL_ENTRIES) { 2977 /* non-trivial ACL */ 2978 aclentp = vsa.vsa_aclentp; 2979 if (exi->exi_export.ex_flags & EX_ACLOK) { 2980 /* maximal permissions */ 2981 grp_perm = 0; 2982 other_perm = 0; 2983 for (; aclcnt > 0; aclcnt--, aclentp++) { 2984 switch (aclentp->a_type) { 2985 case USER_OBJ: 2986 break; 2987 case USER: 2988 grp_perm |= 2989 aclentp->a_perm << 3; 2990 other_perm |= aclentp->a_perm; 2991 break; 2992 case GROUP_OBJ: 2993 grp_perm |= 2994 aclentp->a_perm << 3; 2995 break; 2996 case GROUP: 2997 other_perm |= aclentp->a_perm; 2998 break; 2999 case OTHER_OBJ: 3000 other_orig = aclentp->a_perm; 3001 break; 3002 case CLASS_OBJ: 3003 mask_perm = aclentp->a_perm; 3004 break; 3005 default: 3006 break; 3007 } 3008 } 3009 grp_perm &= mask_perm << 3; 3010 other_perm &= mask_perm; 3011 other_perm |= other_orig; 3012 3013 } else { 3014 /* minimal permissions */ 3015 grp_perm = 070; 3016 other_perm = 07; 3017 for (; aclcnt > 0; aclcnt--, aclentp++) { 3018 switch (aclentp->a_type) { 3019 case USER_OBJ: 3020 break; 3021 case USER: 3022 case CLASS_OBJ: 3023 grp_perm &= 3024 aclentp->a_perm << 3; 3025 other_perm &= 3026 aclentp->a_perm; 3027 break; 3028 case GROUP_OBJ: 3029 grp_perm &= 3030 aclentp->a_perm << 3; 3031 break; 3032 case GROUP: 3033 other_perm &= 3034 aclentp->a_perm; 3035 break; 3036 case OTHER_OBJ: 3037 other_perm &= 3038 aclentp->a_perm; 3039 break; 3040 default: 3041 break; 3042 } 3043 } 3044 } 3045 /* copy to va */ 3046 va->va_mode &= ~077; 3047 va->va_mode |= grp_perm | other_perm; 3048 } 3049 if (vsa.vsa_aclcnt) 3050 kmem_free(vsa.vsa_aclentp, 3051 vsa.vsa_aclcnt * sizeof (aclent_t)); 3052 } 3053 } 3054 3055 void 3056 rfs_srvrinit(void) 3057 { 3058 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL); 3059 } 3060 3061 void 3062 rfs_srvrfini(void) 3063 { 3064 mutex_destroy(&rfs_async_write_lock); 3065 } 3066