/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred_impl.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock for the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information about the file that
 * could be reused.  The exclusive lock for the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode, using it, and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference, or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test whether the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * for the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock on the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and do not correspond to any one particular zone.
183 */ 184 #ifdef DEBUG 185 static struct clstat_debug { 186 kstat_named_t nrnode; /* number of allocated rnodes */ 187 kstat_named_t access; /* size of access cache */ 188 kstat_named_t dirent; /* size of readdir cache */ 189 kstat_named_t dirents; /* size of readdir buf cache */ 190 kstat_named_t reclaim; /* number of reclaims */ 191 kstat_named_t clreclaim; /* number of cl reclaims */ 192 kstat_named_t f_reclaim; /* number of free reclaims */ 193 kstat_named_t a_reclaim; /* number of active reclaims */ 194 kstat_named_t r_reclaim; /* number of rnode reclaims */ 195 kstat_named_t rpath; /* bytes used to store rpaths */ 196 } clstat_debug = { 197 { "nrnode", KSTAT_DATA_UINT64 }, 198 { "access", KSTAT_DATA_UINT64 }, 199 { "dirent", KSTAT_DATA_UINT64 }, 200 { "dirents", KSTAT_DATA_UINT64 }, 201 { "reclaim", KSTAT_DATA_UINT64 }, 202 { "clreclaim", KSTAT_DATA_UINT64 }, 203 { "f_reclaim", KSTAT_DATA_UINT64 }, 204 { "a_reclaim", KSTAT_DATA_UINT64 }, 205 { "r_reclaim", KSTAT_DATA_UINT64 }, 206 { "r_path", KSTAT_DATA_UINT64 }, 207 }; 208 #endif /* DEBUG */ 209 210 /* 211 * We keep a global list of per-zone client data, so we can clean up all zones 212 * if we get low on memory. 213 */ 214 static list_t nfs_clnt_list; 215 static kmutex_t nfs_clnt_list_lock; 216 static zone_key_t nfsclnt_zone_key; 217 218 static struct kmem_cache *chtab_cache; 219 220 /* 221 * Some servers do not properly update the attributes of the 222 * directory when changes are made. To allow interoperability 223 * with these broken servers, the nfs_disable_rddir_cache 224 * parameter must be set in /etc/system 225 */ 226 int nfs_disable_rddir_cache = 0; 227 228 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 229 struct chtab **); 230 void clfree(CLIENT *, struct chtab *); 231 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 232 struct chtab **, struct nfs_clnt *); 233 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 234 struct chtab **, struct nfs_clnt *); 235 static void clreclaim(void *); 236 static int nfs_feedback(int, int, mntinfo_t *); 237 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 238 caddr_t, cred_t *, int *, enum clnt_stat *, int, 239 failinfo_t *); 240 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 241 caddr_t, cred_t *, int *, int, failinfo_t *); 242 static void rinactive(rnode_t *, cred_t *); 243 static int rtablehash(nfs_fhandle *); 244 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 245 struct vnodeops *, 246 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 247 cred_t *), 248 int (*)(const void *, const void *), int *, cred_t *, 249 char *, char *); 250 static void rp_rmfree(rnode_t *); 251 static void rp_addhash(rnode_t *); 252 static void rp_rmhash_locked(rnode_t *); 253 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 254 static void destroy_rnode(rnode_t *); 255 static void rddir_cache_free(rddir_cache *); 256 static int nfs_free_data_reclaim(rnode_t *); 257 static int nfs_active_data_reclaim(rnode_t *); 258 static int nfs_free_reclaim(void); 259 static int nfs_active_reclaim(void); 260 static int nfs_rnode_reclaim(void); 261 static void nfs_reclaim(void *); 262 static int failover_safe(failinfo_t *); 263 static void failover_newserver(mntinfo_t *mi); 264 static void failover_thread(mntinfo_t *mi); 265 static int failover_wait(mntinfo_t *); 266 static int failover_remap(failinfo_t *); 267 static int failover_lookup(char *, vnode_t *, 268 int (*)(vnode_t *, 
char *, vnode_t **, 269 struct pathname *, int, vnode_t *, cred_t *, int), 270 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 271 vnode_t **); 272 static void nfs_free_r_path(rnode_t *); 273 static void nfs_set_vroot(vnode_t *); 274 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 275 276 /* 277 * from rpcsec module (common/rpcsec) 278 */ 279 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 280 extern void sec_clnt_freeh(AUTH *); 281 extern void sec_clnt_freeinfo(struct sec_data *); 282 283 /* 284 * used in mount policy 285 */ 286 extern ts_label_t *getflabel_cipso(vfs_t *); 287 288 /* 289 * EIO or EINTR are not recoverable errors. 290 */ 291 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 292 293 /* 294 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 295 */ 296 static int 297 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 298 struct chtab **chp, struct nfs_clnt *nfscl) 299 { 300 struct chhead *ch, *newch; 301 struct chhead **plistp; 302 struct chtab *cp; 303 int error; 304 k_sigset_t smask; 305 306 if (newcl == NULL || chp == NULL || ci == NULL) 307 return (EINVAL); 308 309 *newcl = NULL; 310 *chp = NULL; 311 312 /* 313 * Find an unused handle or create one 314 */ 315 newch = NULL; 316 nfscl->nfscl_stat.clgets.value.ui64++; 317 top: 318 /* 319 * Find the correct entry in the cache to check for free 320 * client handles. The search is based on the RPC program 321 * number, program version number, dev_t for the transport 322 * device, and the protocol family. 323 */ 324 mutex_enter(&nfscl->nfscl_chtable_lock); 325 plistp = &nfscl->nfscl_chtable; 326 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 327 if (ch->ch_prog == ci->cl_prog && 328 ch->ch_vers == ci->cl_vers && 329 ch->ch_dev == svp->sv_knconf->knc_rdev && 330 (strcmp(ch->ch_protofmly, 331 svp->sv_knconf->knc_protofmly) == 0)) 332 break; 333 plistp = &ch->ch_next; 334 } 335 336 /* 337 * If we didn't find a cache entry for this quadruple, then 338 * create one. If we don't have one already preallocated, 339 * then drop the cache lock, create one, and then start over. 340 * If we did have a preallocated entry, then just add it to 341 * the front of the list. 342 */ 343 if (ch == NULL) { 344 if (newch == NULL) { 345 mutex_exit(&nfscl->nfscl_chtable_lock); 346 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 347 newch->ch_timesused = 0; 348 newch->ch_prog = ci->cl_prog; 349 newch->ch_vers = ci->cl_vers; 350 newch->ch_dev = svp->sv_knconf->knc_rdev; 351 newch->ch_protofmly = kmem_alloc( 352 strlen(svp->sv_knconf->knc_protofmly) + 1, 353 KM_SLEEP); 354 (void) strcpy(newch->ch_protofmly, 355 svp->sv_knconf->knc_protofmly); 356 newch->ch_list = NULL; 357 goto top; 358 } 359 ch = newch; 360 newch = NULL; 361 ch->ch_next = nfscl->nfscl_chtable; 362 nfscl->nfscl_chtable = ch; 363 /* 364 * We found a cache entry, but if it isn't on the front of the 365 * list, then move it to the front of the list to try to take 366 * advantage of locality of operations. 367 */ 368 } else if (ch != nfscl->nfscl_chtable) { 369 *plistp = ch->ch_next; 370 ch->ch_next = nfscl->nfscl_chtable; 371 nfscl->nfscl_chtable = ch; 372 } 373 374 /* 375 * If there was a free client handle cached, then remove it 376 * from the list, init it, and use it. 
377 */ 378 if (ch->ch_list != NULL) { 379 cp = ch->ch_list; 380 ch->ch_list = cp->ch_list; 381 mutex_exit(&nfscl->nfscl_chtable_lock); 382 if (newch != NULL) { 383 kmem_free(newch->ch_protofmly, 384 strlen(newch->ch_protofmly) + 1); 385 kmem_free(newch, sizeof (*newch)); 386 } 387 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 388 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 389 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 390 &cp->ch_client->cl_auth); 391 if (error || cp->ch_client->cl_auth == NULL) { 392 CLNT_DESTROY(cp->ch_client); 393 kmem_cache_free(chtab_cache, cp); 394 return ((error != 0) ? error : EINTR); 395 } 396 ch->ch_timesused++; 397 *newcl = cp->ch_client; 398 *chp = cp; 399 return (0); 400 } 401 402 /* 403 * There weren't any free client handles which fit, so allocate 404 * a new one and use that. 405 */ 406 #ifdef DEBUG 407 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 408 #endif 409 mutex_exit(&nfscl->nfscl_chtable_lock); 410 411 nfscl->nfscl_stat.cltoomany.value.ui64++; 412 if (newch != NULL) { 413 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 414 kmem_free(newch, sizeof (*newch)); 415 } 416 417 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 418 cp->ch_head = ch; 419 420 sigintr(&smask, (int)ci->cl_flags & MI_INT); 421 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 422 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 423 sigunintr(&smask); 424 425 if (error != 0) { 426 kmem_cache_free(chtab_cache, cp); 427 #ifdef DEBUG 428 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 429 #endif 430 /* 431 * Warning is unnecessary if error is EINTR. 432 */ 433 if (error != EINTR) { 434 nfs_cmn_err(error, CE_WARN, 435 "clget: couldn't create handle: %m\n"); 436 } 437 return (error); 438 } 439 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 440 auth_destroy(cp->ch_client->cl_auth); 441 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 442 &cp->ch_client->cl_auth); 443 if (error || cp->ch_client->cl_auth == NULL) { 444 CLNT_DESTROY(cp->ch_client); 445 kmem_cache_free(chtab_cache, cp); 446 #ifdef DEBUG 447 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 448 #endif 449 return ((error != 0) ? error : EINTR); 450 } 451 ch->ch_timesused++; 452 *newcl = cp->ch_client; 453 ASSERT(cp->ch_client->cl_nosignal == FALSE); 454 *chp = cp; 455 return (0); 456 } 457 458 int 459 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 460 struct chtab **chp) 461 { 462 struct nfs_clnt *nfscl; 463 464 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 465 ASSERT(nfscl != NULL); 466 467 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 468 } 469 470 static int 471 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 472 struct chtab **chp, struct nfs_clnt *nfscl) 473 { 474 clinfo_t ci; 475 int error; 476 477 /* 478 * Set read buffer size to rsize 479 * and add room for RPC headers. 480 */ 481 ci.cl_readsize = mi->mi_tsize; 482 if (ci.cl_readsize != 0) 483 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 484 485 /* 486 * If soft mount and server is down just try once. 487 * meaning: do not retransmit. 488 */ 489 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 490 ci.cl_retrans = 0; 491 else 492 ci.cl_retrans = mi->mi_retrans; 493 494 ci.cl_prog = NFS_ACL_PROGRAM; 495 ci.cl_vers = mi->mi_vers; 496 ci.cl_flags = mi->mi_flags; 497 498 /* 499 * clget calls sec_clnt_geth() to get an auth handle. 
For RPCSEC_GSS 500 * security flavor, the client tries to establish a security context 501 * by contacting the server. If the connection is timed out or reset, 502 * e.g. server reboot, we will try again. 503 */ 504 do { 505 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 506 507 if (error == 0) 508 break; 509 510 /* 511 * For forced unmount or zone shutdown, bail out, no retry. 512 */ 513 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 514 error = EIO; 515 break; 516 } 517 518 /* do not retry for softmount */ 519 if (!(mi->mi_flags & MI_HARD)) 520 break; 521 522 /* let the caller deal with the failover case */ 523 if (FAILOVER_MOUNT(mi)) 524 break; 525 526 } while (error == ETIMEDOUT || error == ECONNRESET); 527 528 return (error); 529 } 530 531 static int 532 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 533 struct chtab **chp, struct nfs_clnt *nfscl) 534 { 535 clinfo_t ci; 536 int error; 537 538 /* 539 * Set read buffer size to rsize 540 * and add room for RPC headers. 541 */ 542 ci.cl_readsize = mi->mi_tsize; 543 if (ci.cl_readsize != 0) 544 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 545 546 /* 547 * If soft mount and server is down just try once. 548 * meaning: do not retransmit. 549 */ 550 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 551 ci.cl_retrans = 0; 552 else 553 ci.cl_retrans = mi->mi_retrans; 554 555 ci.cl_prog = mi->mi_prog; 556 ci.cl_vers = mi->mi_vers; 557 ci.cl_flags = mi->mi_flags; 558 559 /* 560 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 561 * security flavor, the client tries to establish a security context 562 * by contacting the server. If the connection is timed out or reset, 563 * e.g. server reboot, we will try again. 564 */ 565 do { 566 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 567 568 if (error == 0) 569 break; 570 571 /* 572 * For forced unmount or zone shutdown, bail out, no retry. 573 */ 574 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 575 error = EIO; 576 break; 577 } 578 579 /* do not retry for softmount */ 580 if (!(mi->mi_flags & MI_HARD)) 581 break; 582 583 /* let the caller deal with the failover case */ 584 if (FAILOVER_MOUNT(mi)) 585 break; 586 587 } while (error == ETIMEDOUT || error == ECONNRESET); 588 589 return (error); 590 } 591 592 static void 593 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 594 { 595 if (cl->cl_auth != NULL) { 596 sec_clnt_freeh(cl->cl_auth); 597 cl->cl_auth = NULL; 598 } 599 600 /* 601 * Timestamp this cache entry so that we know when it was last 602 * used. 603 */ 604 cp->ch_freed = gethrestime_sec(); 605 606 /* 607 * Add the free client handle to the front of the list. 608 * This way, the list will be sorted in youngest to oldest 609 * order. 
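 *
 * For reference, a typical consumer pairs clget() and clfree() around a
 * single RPC.  The sketch below is illustrative only (clinfo/server
 * argument setup and error handling are elided), not code that is
 * compiled here:
 *
 *	CLIENT *cl;
 *	struct chtab *ch;
 *
 *	if (clget(&ci, svp, cr, &cl, &ch) == 0) {
 *		status = CLNT_CALL(cl, which, xdrargs, argsp,
 *		    xdrres, resp, wait);
 *		clfree(cl, ch);
 *	}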
610 */ 611 mutex_enter(&nfscl->nfscl_chtable_lock); 612 cp->ch_list = cp->ch_head->ch_list; 613 cp->ch_head->ch_list = cp; 614 mutex_exit(&nfscl->nfscl_chtable_lock); 615 } 616 617 void 618 clfree(CLIENT *cl, struct chtab *cp) 619 { 620 struct nfs_clnt *nfscl; 621 622 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 623 ASSERT(nfscl != NULL); 624 625 clfree_impl(cl, cp, nfscl); 626 } 627 628 #define CL_HOLDTIME 60 /* time to hold client handles */ 629 630 static void 631 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 632 { 633 struct chhead *ch; 634 struct chtab *cp; /* list of objects that can be reclaimed */ 635 struct chtab *cpe; 636 struct chtab *cpl; 637 struct chtab **cpp; 638 #ifdef DEBUG 639 int n = 0; 640 #endif 641 642 /* 643 * Need to reclaim some memory, so step through the cache 644 * looking through the lists for entries which can be freed. 645 */ 646 cp = NULL; 647 648 mutex_enter(&nfscl->nfscl_chtable_lock); 649 650 /* 651 * Here we step through each non-NULL quadruple and start to 652 * construct the reclaim list pointed to by cp. Note that 653 * cp will contain all eligible chtab entries. When this traversal 654 * completes, chtab entries from the last quadruple will be at the 655 * front of cp and entries from previously inspected quadruples have 656 * been appended to the rear of cp. 657 */ 658 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 659 if (ch->ch_list == NULL) 660 continue; 661 /* 662 * Search each list for entries older then 663 * cl_holdtime seconds. The lists are maintained 664 * in youngest to oldest order so that when the 665 * first entry is found which is old enough, then 666 * all of the rest of the entries on the list will 667 * be old enough as well. 668 */ 669 cpl = ch->ch_list; 670 cpp = &ch->ch_list; 671 while (cpl != NULL && 672 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 673 cpp = &cpl->ch_list; 674 cpl = cpl->ch_list; 675 } 676 if (cpl != NULL) { 677 *cpp = NULL; 678 if (cp != NULL) { 679 cpe = cpl; 680 while (cpe->ch_list != NULL) 681 cpe = cpe->ch_list; 682 cpe->ch_list = cp; 683 } 684 cp = cpl; 685 } 686 } 687 688 mutex_exit(&nfscl->nfscl_chtable_lock); 689 690 /* 691 * If cp is empty, then there is nothing to reclaim here. 692 */ 693 if (cp == NULL) 694 return; 695 696 /* 697 * Step through the list of entries to free, destroying each client 698 * handle and kmem_free'ing the memory for each entry. 699 */ 700 while (cp != NULL) { 701 #ifdef DEBUG 702 n++; 703 #endif 704 CLNT_DESTROY(cp->ch_client); 705 cpl = cp->ch_list; 706 kmem_cache_free(chtab_cache, cp); 707 cp = cpl; 708 } 709 710 #ifdef DEBUG 711 /* 712 * Update clalloc so that nfsstat shows the current number 713 * of allocated client handles. 714 */ 715 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 716 #endif 717 } 718 719 /* ARGSUSED */ 720 static void 721 clreclaim(void *all) 722 { 723 struct nfs_clnt *nfscl; 724 725 #ifdef DEBUG 726 clstat_debug.clreclaim.value.ui64++; 727 #endif 728 /* 729 * The system is low on memory; go through and try to reclaim some from 730 * every zone on the system. 
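 *
 * (clreclaim() is installed as the kmem reclaim callback when
 * chtab_cache is created, later in this file, so the allocator drives
 * it whenever system memory runs low.)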
731 */ 732 mutex_enter(&nfs_clnt_list_lock); 733 nfscl = list_head(&nfs_clnt_list); 734 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 735 clreclaim_zone(nfscl, CL_HOLDTIME); 736 mutex_exit(&nfs_clnt_list_lock); 737 } 738 739 /* 740 * Minimum time-out values indexed by call type 741 * These units are in "eights" of a second to avoid multiplies 742 */ 743 static unsigned int minimum_timeo[] = { 744 6, 7, 10 745 }; 746 747 /* 748 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 749 */ 750 #define MAXTIMO (20*hz) 751 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 752 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 753 754 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 755 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 756 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 757 758 /* 759 * Function called when rfscall notices that we have been 760 * re-transmitting, or when we get a response without retransmissions. 761 * Return 1 if the transfer size was adjusted down - 0 if no change. 762 */ 763 static int 764 nfs_feedback(int flag, int which, mntinfo_t *mi) 765 { 766 int kind; 767 int r = 0; 768 769 mutex_enter(&mi->mi_lock); 770 if (flag == FEEDBACK_REXMIT1) { 771 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 772 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 773 goto done; 774 if (mi->mi_curread > MIN_NFS_TSIZE) { 775 mi->mi_curread /= 2; 776 if (mi->mi_curread < MIN_NFS_TSIZE) 777 mi->mi_curread = MIN_NFS_TSIZE; 778 r = 1; 779 } 780 781 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 782 mi->mi_curwrite /= 2; 783 if (mi->mi_curwrite < MIN_NFS_TSIZE) 784 mi->mi_curwrite = MIN_NFS_TSIZE; 785 r = 1; 786 } 787 } else if (flag == FEEDBACK_OK) { 788 kind = mi->mi_timer_type[which]; 789 if (kind == 0 || 790 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 791 goto done; 792 if (kind == 1) { 793 if (mi->mi_curread >= mi->mi_tsize) 794 goto done; 795 mi->mi_curread += MIN_NFS_TSIZE; 796 if (mi->mi_curread > mi->mi_tsize/2) 797 mi->mi_curread = mi->mi_tsize; 798 } else if (kind == 2) { 799 if (mi->mi_curwrite >= mi->mi_stsize) 800 goto done; 801 mi->mi_curwrite += MIN_NFS_TSIZE; 802 if (mi->mi_curwrite > mi->mi_stsize/2) 803 mi->mi_curwrite = mi->mi_stsize; 804 } 805 } 806 done: 807 mutex_exit(&mi->mi_lock); 808 return (r); 809 } 810 811 #ifdef DEBUG 812 static int rfs2call_hits = 0; 813 static int rfs2call_misses = 0; 814 #endif 815 816 int 817 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 818 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 819 enum nfsstat *statusp, int flags, failinfo_t *fi) 820 { 821 int rpcerror; 822 enum clnt_stat rpc_status; 823 824 ASSERT(statusp != NULL); 825 826 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 827 cr, douprintf, &rpc_status, flags, fi); 828 if (!rpcerror) { 829 /* 830 * See crnetadjust() for comments. 
831 */ 832 if (*statusp == NFSERR_ACCES && 833 (cr = crnetadjust(cr)) != NULL) { 834 #ifdef DEBUG 835 rfs2call_hits++; 836 #endif 837 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 838 resp, cr, douprintf, NULL, flags, fi); 839 crfree(cr); 840 #ifdef DEBUG 841 if (*statusp == NFSERR_ACCES) 842 rfs2call_misses++; 843 #endif 844 } 845 } else if (rpc_status == RPC_PROCUNAVAIL) { 846 *statusp = NFSERR_OPNOTSUPP; 847 rpcerror = 0; 848 } 849 850 return (rpcerror); 851 } 852 853 #define NFS3_JUKEBOX_DELAY 10 * hz 854 855 static clock_t nfs3_jukebox_delay = 0; 856 857 #ifdef DEBUG 858 static int rfs3call_hits = 0; 859 static int rfs3call_misses = 0; 860 #endif 861 862 int 863 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 864 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 865 nfsstat3 *statusp, int flags, failinfo_t *fi) 866 { 867 int rpcerror; 868 int user_informed; 869 870 user_informed = 0; 871 do { 872 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 873 cr, douprintf, NULL, flags, fi); 874 if (!rpcerror) { 875 cred_t *crr; 876 if (*statusp == NFS3ERR_JUKEBOX) { 877 if (ttoproc(curthread) == &p0) { 878 rpcerror = EAGAIN; 879 break; 880 } 881 if (!user_informed) { 882 user_informed = 1; 883 uprintf( 884 "file temporarily unavailable on the server, retrying...\n"); 885 } 886 delay(nfs3_jukebox_delay); 887 } 888 /* 889 * See crnetadjust() for comments. 890 */ 891 else if (*statusp == NFS3ERR_ACCES && 892 (crr = crnetadjust(cr)) != NULL) { 893 #ifdef DEBUG 894 rfs3call_hits++; 895 #endif 896 rpcerror = rfscall(mi, which, xdrargs, argsp, 897 xdrres, resp, crr, douprintf, 898 NULL, flags, fi); 899 900 crfree(crr); 901 #ifdef DEBUG 902 if (*statusp == NFS3ERR_ACCES) 903 rfs3call_misses++; 904 #endif 905 } 906 } 907 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 908 909 return (rpcerror); 910 } 911 912 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 913 #define INC_READERS(mi) { \ 914 mi->mi_readers++; \ 915 } 916 #define DEC_READERS(mi) { \ 917 mi->mi_readers--; \ 918 if (mi->mi_readers == 0) \ 919 cv_broadcast(&mi->mi_failover_cv); \ 920 } 921 922 static int 923 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 924 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 925 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 926 { 927 CLIENT *client; 928 struct chtab *ch; 929 cred_t *cr = icr; 930 enum clnt_stat status; 931 struct rpc_err rpcerr; 932 struct timeval wait; 933 int timeo; /* in units of hz */ 934 int my_rsize, my_wsize; 935 bool_t tryagain; 936 bool_t cred_cloned = FALSE; 937 k_sigset_t smask; 938 servinfo_t *svp; 939 struct nfs_clnt *nfscl; 940 zoneid_t zoneid = getzoneid(); 941 #ifdef DEBUG 942 char *bufp; 943 #endif 944 945 946 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 947 "rfscall_start:which %d mi %p", which, mi); 948 949 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 950 ASSERT(nfscl != NULL); 951 952 nfscl->nfscl_stat.calls.value.ui64++; 953 mi->mi_reqs[which].value.ui64++; 954 955 rpcerr.re_status = RPC_SUCCESS; 956 957 /* 958 * In case of forced unmount or zone shutdown, return EIO. 959 */ 960 961 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 962 rpcerr.re_status = RPC_FAILED; 963 rpcerr.re_errno = EIO; 964 return (rpcerr.re_errno); 965 } 966 967 /* 968 * Remember the transfer sizes in case 969 * nfs_feedback changes them underneath us. 
970 */ 971 my_rsize = mi->mi_curread; 972 my_wsize = mi->mi_curwrite; 973 974 /* 975 * NFS client failover support 976 * 977 * If this rnode is not in sync with the current server (VALID_FH), 978 * we'd like to do a remap to get in sync. We can be interrupted 979 * in failover_remap(), and if so we'll bail. Otherwise, we'll 980 * use the best info we have to try the RPC. Part of that is 981 * unconditionally updating the filehandle copy kept for V3. 982 * 983 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 984 * rw_enter(); we're trying to keep the current server from being 985 * changed on us until we're done with the remapping and have a 986 * matching client handle. We don't want to sending a filehandle 987 * to the wrong host. 988 */ 989 failoverretry: 990 if (FAILOVER_MOUNT(mi)) { 991 mutex_enter(&mi->mi_lock); 992 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 993 if (failover_wait(mi)) { 994 mutex_exit(&mi->mi_lock); 995 return (EINTR); 996 } 997 } 998 INC_READERS(mi); 999 mutex_exit(&mi->mi_lock); 1000 if (fi) { 1001 if (!VALID_FH(fi) && 1002 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1003 int remaperr; 1004 1005 svp = mi->mi_curr_serv; 1006 remaperr = failover_remap(fi); 1007 if (remaperr != 0) { 1008 #ifdef DEBUG 1009 if (remaperr != EINTR) 1010 nfs_cmn_err(remaperr, CE_WARN, 1011 "rfscall couldn't failover: %m"); 1012 #endif 1013 mutex_enter(&mi->mi_lock); 1014 DEC_READERS(mi); 1015 mutex_exit(&mi->mi_lock); 1016 /* 1017 * If failover_remap returns ETIMEDOUT 1018 * and the filesystem is hard mounted 1019 * we have to retry the call with a new 1020 * server. 1021 */ 1022 if ((mi->mi_flags & MI_HARD) && 1023 IS_RECOVERABLE_ERROR(remaperr)) { 1024 if (svp == mi->mi_curr_serv) 1025 failover_newserver(mi); 1026 rpcerr.re_status = RPC_SUCCESS; 1027 goto failoverretry; 1028 } 1029 rpcerr.re_errno = remaperr; 1030 return (remaperr); 1031 } 1032 } 1033 if (fi->fhp && fi->copyproc) 1034 (*fi->copyproc)(fi->fhp, fi->vp); 1035 } 1036 } 1037 1038 /* For TSOL, use a new cred which has net_mac_aware flag */ 1039 if (!cred_cloned && is_system_labeled()) { 1040 cred_cloned = TRUE; 1041 cr = crdup(icr); 1042 (void) setpflags(NET_MAC_AWARE, 1, cr); 1043 } 1044 1045 /* 1046 * clget() calls clnt_tli_kinit() which clears the xid, so we 1047 * are guaranteed to reprocess the retry as a new request. 1048 */ 1049 svp = mi->mi_curr_serv; 1050 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1051 1052 if (FAILOVER_MOUNT(mi)) { 1053 mutex_enter(&mi->mi_lock); 1054 DEC_READERS(mi); 1055 mutex_exit(&mi->mi_lock); 1056 1057 if ((rpcerr.re_errno == ETIMEDOUT || 1058 rpcerr.re_errno == ECONNRESET) && 1059 failover_safe(fi)) { 1060 if (svp == mi->mi_curr_serv) 1061 failover_newserver(mi); 1062 goto failoverretry; 1063 } 1064 } 1065 if (rpcerr.re_errno != 0) 1066 return (rpcerr.re_errno); 1067 1068 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1069 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1070 timeo = (mi->mi_timeo * hz) / 10; 1071 } else { 1072 mutex_enter(&mi->mi_lock); 1073 timeo = CLNT_SETTIMERS(client, 1074 &(mi->mi_timers[mi->mi_timer_type[which]]), 1075 &(mi->mi_timers[NFS_CALLTYPES]), 1076 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1077 (void (*)())NULL, (caddr_t)mi, 0); 1078 mutex_exit(&mi->mi_lock); 1079 } 1080 1081 /* 1082 * If hard mounted fs, retry call forever unless hard error occurs. 
1083 */ 1084 do { 1085 tryagain = FALSE; 1086 1087 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1088 status = RPC_FAILED; 1089 rpcerr.re_status = RPC_FAILED; 1090 rpcerr.re_errno = EIO; 1091 break; 1092 } 1093 1094 TICK_TO_TIMEVAL(timeo, &wait); 1095 1096 /* 1097 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1098 * and SIGTERM. (Preserving the existing masks). 1099 * Mask out SIGINT if mount option nointr is specified. 1100 */ 1101 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1102 if (!(mi->mi_flags & MI_INT)) 1103 client->cl_nosignal = TRUE; 1104 1105 /* 1106 * If there is a current signal, then don't bother 1107 * even trying to send out the request because we 1108 * won't be able to block waiting for the response. 1109 * Simply assume RPC_INTR and get on with it. 1110 */ 1111 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1112 status = RPC_INTR; 1113 else { 1114 status = CLNT_CALL(client, which, xdrargs, argsp, 1115 xdrres, resp, wait); 1116 } 1117 1118 if (!(mi->mi_flags & MI_INT)) 1119 client->cl_nosignal = FALSE; 1120 /* 1121 * restore original signal mask 1122 */ 1123 sigunintr(&smask); 1124 1125 switch (status) { 1126 case RPC_SUCCESS: 1127 if ((mi->mi_flags & MI_DYNAMIC) && 1128 mi->mi_timer_type[which] != 0 && 1129 (mi->mi_curread != my_rsize || 1130 mi->mi_curwrite != my_wsize)) 1131 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1132 break; 1133 1134 case RPC_INTR: 1135 /* 1136 * There is no way to recover from this error, 1137 * even if mount option nointr is specified. 1138 * SIGKILL, for example, cannot be blocked. 1139 */ 1140 rpcerr.re_status = RPC_INTR; 1141 rpcerr.re_errno = EINTR; 1142 break; 1143 1144 case RPC_UDERROR: 1145 /* 1146 * If the NFS server is local (vold) and 1147 * it goes away then we get RPC_UDERROR. 1148 * This is a retryable error, so we would 1149 * loop, so check to see if the specific 1150 * error was ECONNRESET, indicating that 1151 * target did not exist at all. If so, 1152 * return with RPC_PROGUNAVAIL and 1153 * ECONNRESET to indicate why. 1154 */ 1155 CLNT_GETERR(client, &rpcerr); 1156 if (rpcerr.re_errno == ECONNRESET) { 1157 rpcerr.re_status = RPC_PROGUNAVAIL; 1158 rpcerr.re_errno = ECONNRESET; 1159 break; 1160 } 1161 /*FALLTHROUGH*/ 1162 1163 default: /* probably RPC_TIMEDOUT */ 1164 if (IS_UNRECOVERABLE_RPC(status)) 1165 break; 1166 1167 /* 1168 * increment server not responding count 1169 */ 1170 mutex_enter(&mi->mi_lock); 1171 mi->mi_noresponse++; 1172 mutex_exit(&mi->mi_lock); 1173 #ifdef DEBUG 1174 nfscl->nfscl_stat.noresponse.value.ui64++; 1175 #endif 1176 1177 if (!(mi->mi_flags & MI_HARD)) { 1178 if (!(mi->mi_flags & MI_SEMISOFT) || 1179 (mi->mi_ss_call_type[which] == 0)) 1180 break; 1181 } 1182 1183 /* 1184 * The call is in progress (over COTS). 1185 * Try the CLNT_CALL again, but don't 1186 * print a noisy error message. 1187 */ 1188 if (status == RPC_INPROGRESS) { 1189 tryagain = TRUE; 1190 break; 1191 } 1192 1193 if (flags & RFSCALL_SOFT) 1194 break; 1195 1196 /* 1197 * On zone shutdown, just move on. 1198 */ 1199 if (zone_status_get(curproc->p_zone) >= 1200 ZONE_IS_SHUTTING_DOWN) { 1201 rpcerr.re_status = RPC_FAILED; 1202 rpcerr.re_errno = EIO; 1203 break; 1204 } 1205 1206 /* 1207 * NFS client failover support 1208 * 1209 * If the current server just failed us, we'll 1210 * start the process of finding a new server. 1211 * After that, we can just retry. 
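 *
 * When failover is not possible we simply retry below with an
 * exponentially backed-off timeout: backoff() doubles timeo on
 * each pass and caps it at MAXTIMO (20 * hz).  For example, with
 * hz = 100 a 1 second timeout grows 1s, 2s, 4s, 8s, 16s and then
 * stays pinned at 20s.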
1212 */ 1213 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1214 if (svp == mi->mi_curr_serv) 1215 failover_newserver(mi); 1216 clfree_impl(client, ch, nfscl); 1217 goto failoverretry; 1218 } 1219 1220 tryagain = TRUE; 1221 timeo = backoff(timeo); 1222 mutex_enter(&mi->mi_lock); 1223 if (!(mi->mi_flags & MI_PRINTED)) { 1224 mi->mi_flags |= MI_PRINTED; 1225 mutex_exit(&mi->mi_lock); 1226 #ifdef DEBUG 1227 zprintf(zoneid, 1228 "NFS%d server %s not responding still trying\n", 1229 mi->mi_vers, svp->sv_hostname); 1230 #else 1231 zprintf(zoneid, 1232 "NFS server %s not responding still trying\n", 1233 svp->sv_hostname); 1234 #endif 1235 } else 1236 mutex_exit(&mi->mi_lock); 1237 if (*douprintf && nfs_has_ctty()) { 1238 *douprintf = 0; 1239 if (!(mi->mi_flags & MI_NOPRINT)) 1240 #ifdef DEBUG 1241 uprintf( 1242 "NFS%d server %s not responding still trying\n", 1243 mi->mi_vers, svp->sv_hostname); 1244 #else 1245 uprintf( 1246 "NFS server %s not responding still trying\n", 1247 svp->sv_hostname); 1248 #endif 1249 } 1250 1251 /* 1252 * If doing dynamic adjustment of transfer 1253 * size and if it's a read or write call 1254 * and if the transfer size changed while 1255 * retransmitting or if the feedback routine 1256 * changed the transfer size, 1257 * then exit rfscall so that the transfer 1258 * size can be adjusted at the vnops level. 1259 */ 1260 if ((mi->mi_flags & MI_DYNAMIC) && 1261 mi->mi_timer_type[which] != 0 && 1262 (mi->mi_curread != my_rsize || 1263 mi->mi_curwrite != my_wsize || 1264 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1265 /* 1266 * On read or write calls, return 1267 * back to the vnode ops level if 1268 * the transfer size changed. 1269 */ 1270 clfree_impl(client, ch, nfscl); 1271 if (cred_cloned) 1272 crfree(cr); 1273 return (ENFS_TRYAGAIN); 1274 } 1275 } 1276 } while (tryagain); 1277 1278 if (status != RPC_SUCCESS) { 1279 /* 1280 * Let soft mounts use the timed out message. 1281 */ 1282 if (status == RPC_INPROGRESS) 1283 status = RPC_TIMEDOUT; 1284 nfscl->nfscl_stat.badcalls.value.ui64++; 1285 if (status != RPC_INTR) { 1286 mutex_enter(&mi->mi_lock); 1287 mi->mi_flags |= MI_DOWN; 1288 mutex_exit(&mi->mi_lock); 1289 CLNT_GETERR(client, &rpcerr); 1290 #ifdef DEBUG 1291 bufp = clnt_sperror(client, svp->sv_hostname); 1292 zprintf(zoneid, "NFS%d %s failed for %s\n", 1293 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1294 if (nfs_has_ctty()) { 1295 if (!(mi->mi_flags & MI_NOPRINT)) { 1296 uprintf("NFS%d %s failed for %s\n", 1297 mi->mi_vers, mi->mi_rfsnames[which], 1298 bufp); 1299 } 1300 } 1301 kmem_free(bufp, MAXPATHLEN); 1302 #else 1303 zprintf(zoneid, 1304 "NFS %s failed for server %s: error %d (%s)\n", 1305 mi->mi_rfsnames[which], svp->sv_hostname, 1306 status, clnt_sperrno(status)); 1307 if (nfs_has_ctty()) { 1308 if (!(mi->mi_flags & MI_NOPRINT)) { 1309 uprintf( 1310 "NFS %s failed for server %s: error %d (%s)\n", 1311 mi->mi_rfsnames[which], 1312 svp->sv_hostname, status, 1313 clnt_sperrno(status)); 1314 } 1315 } 1316 #endif 1317 /* 1318 * when CLNT_CALL() fails with RPC_AUTHERROR, 1319 * re_errno is set appropriately depending on 1320 * the authentication error 1321 */ 1322 if (status == RPC_VERSMISMATCH || 1323 status == RPC_PROGVERSMISMATCH) 1324 rpcerr.re_errno = EIO; 1325 } 1326 } else { 1327 /* 1328 * Test the value of mi_down and mi_printed without 1329 * holding the mi_lock mutex. If they are both zero, 1330 * then it is okay to skip the down and printed 1331 * processing. This saves on a mutex_enter and 1332 * mutex_exit pair for a normal, successful RPC. 
1333 * This was just complete overhead. 1334 */ 1335 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1336 mutex_enter(&mi->mi_lock); 1337 mi->mi_flags &= ~MI_DOWN; 1338 if (mi->mi_flags & MI_PRINTED) { 1339 mi->mi_flags &= ~MI_PRINTED; 1340 mutex_exit(&mi->mi_lock); 1341 #ifdef DEBUG 1342 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1343 zprintf(zoneid, "NFS%d server %s ok\n", 1344 mi->mi_vers, svp->sv_hostname); 1345 #else 1346 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1347 zprintf(zoneid, "NFS server %s ok\n", 1348 svp->sv_hostname); 1349 #endif 1350 } else 1351 mutex_exit(&mi->mi_lock); 1352 } 1353 1354 if (*douprintf == 0) { 1355 if (!(mi->mi_flags & MI_NOPRINT)) 1356 #ifdef DEBUG 1357 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1358 uprintf("NFS%d server %s ok\n", 1359 mi->mi_vers, svp->sv_hostname); 1360 #else 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1362 uprintf("NFS server %s ok\n", svp->sv_hostname); 1363 #endif 1364 *douprintf = 1; 1365 } 1366 } 1367 1368 clfree_impl(client, ch, nfscl); 1369 if (cred_cloned) 1370 crfree(cr); 1371 1372 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1373 1374 if (rpc_status != NULL) 1375 *rpc_status = rpcerr.re_status; 1376 1377 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1378 rpcerr.re_errno); 1379 1380 return (rpcerr.re_errno); 1381 } 1382 1383 #ifdef DEBUG 1384 static int acl2call_hits = 0; 1385 static int acl2call_misses = 0; 1386 #endif 1387 1388 int 1389 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1390 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1391 enum nfsstat *statusp, int flags, failinfo_t *fi) 1392 { 1393 int rpcerror; 1394 1395 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1396 cr, douprintf, flags, fi); 1397 if (!rpcerror) { 1398 /* 1399 * See comments with crnetadjust(). 1400 */ 1401 if (*statusp == NFSERR_ACCES && 1402 (cr = crnetadjust(cr)) != NULL) { 1403 #ifdef DEBUG 1404 acl2call_hits++; 1405 #endif 1406 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1407 resp, cr, douprintf, flags, fi); 1408 crfree(cr); 1409 #ifdef DEBUG 1410 if (*statusp == NFSERR_ACCES) 1411 acl2call_misses++; 1412 #endif 1413 } 1414 } 1415 1416 return (rpcerror); 1417 } 1418 1419 #ifdef DEBUG 1420 static int acl3call_hits = 0; 1421 static int acl3call_misses = 0; 1422 #endif 1423 1424 int 1425 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1426 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1427 nfsstat3 *statusp, int flags, failinfo_t *fi) 1428 { 1429 int rpcerror; 1430 int user_informed; 1431 1432 user_informed = 0; 1433 1434 do { 1435 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1436 cr, douprintf, flags, fi); 1437 if (!rpcerror) { 1438 cred_t *crr; 1439 if (*statusp == NFS3ERR_JUKEBOX) { 1440 if (!user_informed) { 1441 user_informed = 1; 1442 uprintf( 1443 "file temporarily unavailable on the server, retrying...\n"); 1444 } 1445 delay(nfs3_jukebox_delay); 1446 } 1447 /* 1448 * See crnetadjust() for comments. 
1449 */ 1450 else if (*statusp == NFS3ERR_ACCES && 1451 (crr = crnetadjust(cr)) != NULL) { 1452 #ifdef DEBUG 1453 acl3call_hits++; 1454 #endif 1455 rpcerror = aclcall(mi, which, xdrargs, argsp, 1456 xdrres, resp, crr, douprintf, flags, fi); 1457 1458 crfree(crr); 1459 #ifdef DEBUG 1460 if (*statusp == NFS3ERR_ACCES) 1461 acl3call_misses++; 1462 #endif 1463 } 1464 } 1465 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1466 1467 return (rpcerror); 1468 } 1469 1470 static int 1471 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1472 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1473 int flags, failinfo_t *fi) 1474 { 1475 CLIENT *client; 1476 struct chtab *ch; 1477 cred_t *cr = icr; 1478 bool_t cred_cloned = FALSE; 1479 enum clnt_stat status; 1480 struct rpc_err rpcerr; 1481 struct timeval wait; 1482 int timeo; /* in units of hz */ 1483 #if 0 /* notyet */ 1484 int my_rsize, my_wsize; 1485 #endif 1486 bool_t tryagain; 1487 k_sigset_t smask; 1488 servinfo_t *svp; 1489 struct nfs_clnt *nfscl; 1490 zoneid_t zoneid = getzoneid(); 1491 #ifdef DEBUG 1492 char *bufp; 1493 #endif 1494 1495 #if 0 /* notyet */ 1496 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1497 "rfscall_start:which %d mi %p", which, mi); 1498 #endif 1499 1500 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1501 ASSERT(nfscl != NULL); 1502 1503 nfscl->nfscl_stat.calls.value.ui64++; 1504 mi->mi_aclreqs[which].value.ui64++; 1505 1506 rpcerr.re_status = RPC_SUCCESS; 1507 1508 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1509 rpcerr.re_status = RPC_FAILED; 1510 rpcerr.re_errno = EIO; 1511 return (rpcerr.re_errno); 1512 } 1513 1514 #if 0 /* notyet */ 1515 /* 1516 * Remember the transfer sizes in case 1517 * nfs_feedback changes them underneath us. 1518 */ 1519 my_rsize = mi->mi_curread; 1520 my_wsize = mi->mi_curwrite; 1521 #endif 1522 1523 /* 1524 * NFS client failover support 1525 * 1526 * If this rnode is not in sync with the current server (VALID_FH), 1527 * we'd like to do a remap to get in sync. We can be interrupted 1528 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1529 * use the best info we have to try the RPC. Part of that is 1530 * unconditionally updating the filehandle copy kept for V3. 1531 * 1532 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1533 * rw_enter(); we're trying to keep the current server from being 1534 * changed on us until we're done with the remapping and have a 1535 * matching client handle. We don't want to sending a filehandle 1536 * to the wrong host. 1537 */ 1538 failoverretry: 1539 if (FAILOVER_MOUNT(mi)) { 1540 mutex_enter(&mi->mi_lock); 1541 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1542 if (failover_wait(mi)) { 1543 mutex_exit(&mi->mi_lock); 1544 return (EINTR); 1545 } 1546 } 1547 INC_READERS(mi); 1548 mutex_exit(&mi->mi_lock); 1549 if (fi) { 1550 if (!VALID_FH(fi) && 1551 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1552 int remaperr; 1553 1554 svp = mi->mi_curr_serv; 1555 remaperr = failover_remap(fi); 1556 if (remaperr != 0) { 1557 #ifdef DEBUG 1558 if (remaperr != EINTR) 1559 nfs_cmn_err(remaperr, CE_WARN, 1560 "aclcall couldn't failover: %m"); 1561 #endif 1562 mutex_enter(&mi->mi_lock); 1563 DEC_READERS(mi); 1564 mutex_exit(&mi->mi_lock); 1565 1566 /* 1567 * If failover_remap returns ETIMEDOUT 1568 * and the filesystem is hard mounted 1569 * we have to retry the call with a new 1570 * server. 
1571 */ 1572 if ((mi->mi_flags & MI_HARD) && 1573 IS_RECOVERABLE_ERROR(remaperr)) { 1574 if (svp == mi->mi_curr_serv) 1575 failover_newserver(mi); 1576 rpcerr.re_status = RPC_SUCCESS; 1577 goto failoverretry; 1578 } 1579 return (remaperr); 1580 } 1581 } 1582 if (fi->fhp && fi->copyproc) 1583 (*fi->copyproc)(fi->fhp, fi->vp); 1584 } 1585 } 1586 1587 /* For TSOL, use a new cred which has net_mac_aware flag */ 1588 if (!cred_cloned && is_system_labeled()) { 1589 cred_cloned = TRUE; 1590 cr = crdup(icr); 1591 (void) setpflags(NET_MAC_AWARE, 1, cr); 1592 } 1593 1594 /* 1595 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1596 * are guaranteed to reprocess the retry as a new request. 1597 */ 1598 svp = mi->mi_curr_serv; 1599 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1600 if (FAILOVER_MOUNT(mi)) { 1601 mutex_enter(&mi->mi_lock); 1602 DEC_READERS(mi); 1603 mutex_exit(&mi->mi_lock); 1604 1605 if ((rpcerr.re_errno == ETIMEDOUT || 1606 rpcerr.re_errno == ECONNRESET) && 1607 failover_safe(fi)) { 1608 if (svp == mi->mi_curr_serv) 1609 failover_newserver(mi); 1610 goto failoverretry; 1611 } 1612 } 1613 if (rpcerr.re_errno != 0) { 1614 if (cred_cloned) 1615 crfree(cr); 1616 return (rpcerr.re_errno); 1617 } 1618 1619 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1620 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1621 timeo = (mi->mi_timeo * hz) / 10; 1622 } else { 1623 mutex_enter(&mi->mi_lock); 1624 timeo = CLNT_SETTIMERS(client, 1625 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1626 &(mi->mi_timers[NFS_CALLTYPES]), 1627 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1628 (void (*)()) 0, (caddr_t)mi, 0); 1629 mutex_exit(&mi->mi_lock); 1630 } 1631 1632 /* 1633 * If hard mounted fs, retry call forever unless hard error occurs. 1634 */ 1635 do { 1636 tryagain = FALSE; 1637 1638 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1639 status = RPC_FAILED; 1640 rpcerr.re_status = RPC_FAILED; 1641 rpcerr.re_errno = EIO; 1642 break; 1643 } 1644 1645 TICK_TO_TIMEVAL(timeo, &wait); 1646 1647 /* 1648 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1649 * and SIGTERM. (Preserving the existing masks). 1650 * Mask out SIGINT if mount option nointr is specified. 1651 */ 1652 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1653 if (!(mi->mi_flags & MI_INT)) 1654 client->cl_nosignal = TRUE; 1655 1656 /* 1657 * If there is a current signal, then don't bother 1658 * even trying to send out the request because we 1659 * won't be able to block waiting for the response. 1660 * Simply assume RPC_INTR and get on with it. 1661 */ 1662 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1663 status = RPC_INTR; 1664 else { 1665 status = CLNT_CALL(client, which, xdrargs, argsp, 1666 xdrres, resp, wait); 1667 } 1668 1669 if (!(mi->mi_flags & MI_INT)) 1670 client->cl_nosignal = FALSE; 1671 /* 1672 * restore original signal mask 1673 */ 1674 sigunintr(&smask); 1675 1676 switch (status) { 1677 case RPC_SUCCESS: 1678 #if 0 /* notyet */ 1679 if ((mi->mi_flags & MI_DYNAMIC) && 1680 mi->mi_timer_type[which] != 0 && 1681 (mi->mi_curread != my_rsize || 1682 mi->mi_curwrite != my_wsize)) 1683 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1684 #endif 1685 break; 1686 1687 /* 1688 * Unfortunately, there are servers in the world which 1689 * are not coded correctly. They are not prepared to 1690 * handle RPC requests to the NFS port which are not 1691 * NFS requests. Thus, they may try to process the 1692 * NFS_ACL request as if it were an NFS request. This 1693 * does not work. 
Generally, an error will be generated 1694 * on the client because it will not be able to decode 1695 * the response from the server. However, it seems 1696 * possible that the server may not be able to decode 1697 * the arguments. Thus, the criteria for deciding 1698 * whether the server supports NFS_ACL or not is whether 1699 * the following RPC errors are returned from CLNT_CALL. 1700 */ 1701 case RPC_CANTDECODERES: 1702 case RPC_PROGUNAVAIL: 1703 case RPC_CANTDECODEARGS: 1704 case RPC_PROGVERSMISMATCH: 1705 mutex_enter(&mi->mi_lock); 1706 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1707 mutex_exit(&mi->mi_lock); 1708 break; 1709 1710 /* 1711 * If the server supports NFS_ACL but not the new ops 1712 * for extended attributes, make sure we don't retry. 1713 */ 1714 case RPC_PROCUNAVAIL: 1715 mutex_enter(&mi->mi_lock); 1716 mi->mi_flags &= ~MI_EXTATTR; 1717 mutex_exit(&mi->mi_lock); 1718 break; 1719 1720 case RPC_INTR: 1721 /* 1722 * There is no way to recover from this error, 1723 * even if mount option nointr is specified. 1724 * SIGKILL, for example, cannot be blocked. 1725 */ 1726 rpcerr.re_status = RPC_INTR; 1727 rpcerr.re_errno = EINTR; 1728 break; 1729 1730 case RPC_UDERROR: 1731 /* 1732 * If the NFS server is local (vold) and 1733 * it goes away then we get RPC_UDERROR. 1734 * This is a retryable error, so we would 1735 * loop, so check to see if the specific 1736 * error was ECONNRESET, indicating that 1737 * target did not exist at all. If so, 1738 * return with RPC_PROGUNAVAIL and 1739 * ECONNRESET to indicate why. 1740 */ 1741 CLNT_GETERR(client, &rpcerr); 1742 if (rpcerr.re_errno == ECONNRESET) { 1743 rpcerr.re_status = RPC_PROGUNAVAIL; 1744 rpcerr.re_errno = ECONNRESET; 1745 break; 1746 } 1747 /*FALLTHROUGH*/ 1748 1749 default: /* probably RPC_TIMEDOUT */ 1750 if (IS_UNRECOVERABLE_RPC(status)) 1751 break; 1752 1753 /* 1754 * increment server not responding count 1755 */ 1756 mutex_enter(&mi->mi_lock); 1757 mi->mi_noresponse++; 1758 mutex_exit(&mi->mi_lock); 1759 #ifdef DEBUG 1760 nfscl->nfscl_stat.noresponse.value.ui64++; 1761 #endif 1762 1763 if (!(mi->mi_flags & MI_HARD)) { 1764 if (!(mi->mi_flags & MI_SEMISOFT) || 1765 (mi->mi_acl_ss_call_type[which] == 0)) 1766 break; 1767 } 1768 1769 /* 1770 * The call is in progress (over COTS). 1771 * Try the CLNT_CALL again, but don't 1772 * print a noisy error message. 1773 */ 1774 if (status == RPC_INPROGRESS) { 1775 tryagain = TRUE; 1776 break; 1777 } 1778 1779 if (flags & RFSCALL_SOFT) 1780 break; 1781 1782 /* 1783 * On zone shutdown, just move on. 1784 */ 1785 if (zone_status_get(curproc->p_zone) >= 1786 ZONE_IS_SHUTTING_DOWN) { 1787 rpcerr.re_status = RPC_FAILED; 1788 rpcerr.re_errno = EIO; 1789 break; 1790 } 1791 1792 /* 1793 * NFS client failover support 1794 * 1795 * If the current server just failed us, we'll 1796 * start the process of finding a new server. 1797 * After that, we can just retry. 
1798 */ 1799 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1800 if (svp == mi->mi_curr_serv) 1801 failover_newserver(mi); 1802 clfree_impl(client, ch, nfscl); 1803 goto failoverretry; 1804 } 1805 1806 tryagain = TRUE; 1807 timeo = backoff(timeo); 1808 mutex_enter(&mi->mi_lock); 1809 if (!(mi->mi_flags & MI_PRINTED)) { 1810 mi->mi_flags |= MI_PRINTED; 1811 mutex_exit(&mi->mi_lock); 1812 #ifdef DEBUG 1813 zprintf(zoneid, 1814 "NFS_ACL%d server %s not responding still trying\n", 1815 mi->mi_vers, svp->sv_hostname); 1816 #else 1817 zprintf(zoneid, 1818 "NFS server %s not responding still trying\n", 1819 svp->sv_hostname); 1820 #endif 1821 } else 1822 mutex_exit(&mi->mi_lock); 1823 if (*douprintf && nfs_has_ctty()) { 1824 *douprintf = 0; 1825 if (!(mi->mi_flags & MI_NOPRINT)) 1826 #ifdef DEBUG 1827 uprintf( 1828 "NFS_ACL%d server %s not responding still trying\n", 1829 mi->mi_vers, svp->sv_hostname); 1830 #else 1831 uprintf( 1832 "NFS server %s not responding still trying\n", 1833 svp->sv_hostname); 1834 #endif 1835 } 1836 1837 #if 0 /* notyet */ 1838 /* 1839 * If doing dynamic adjustment of transfer 1840 * size and if it's a read or write call 1841 * and if the transfer size changed while 1842 * retransmitting or if the feedback routine 1843 * changed the transfer size, 1844 * then exit rfscall so that the transfer 1845 * size can be adjusted at the vnops level. 1846 */ 1847 if ((mi->mi_flags & MI_DYNAMIC) && 1848 mi->mi_acl_timer_type[which] != 0 && 1849 (mi->mi_curread != my_rsize || 1850 mi->mi_curwrite != my_wsize || 1851 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1852 /* 1853 * On read or write calls, return 1854 * back to the vnode ops level if 1855 * the transfer size changed. 1856 */ 1857 clfree_impl(client, ch, nfscl); 1858 if (cred_cloned) 1859 crfree(cr); 1860 return (ENFS_TRYAGAIN); 1861 } 1862 #endif 1863 } 1864 } while (tryagain); 1865 1866 if (status != RPC_SUCCESS) { 1867 /* 1868 * Let soft mounts use the timed out message. 
1869 */ 1870 if (status == RPC_INPROGRESS) 1871 status = RPC_TIMEDOUT; 1872 nfscl->nfscl_stat.badcalls.value.ui64++; 1873 if (status == RPC_CANTDECODERES || 1874 status == RPC_PROGUNAVAIL || 1875 status == RPC_PROCUNAVAIL || 1876 status == RPC_CANTDECODEARGS || 1877 status == RPC_PROGVERSMISMATCH) 1878 CLNT_GETERR(client, &rpcerr); 1879 else if (status != RPC_INTR) { 1880 mutex_enter(&mi->mi_lock); 1881 mi->mi_flags |= MI_DOWN; 1882 mutex_exit(&mi->mi_lock); 1883 CLNT_GETERR(client, &rpcerr); 1884 #ifdef DEBUG 1885 bufp = clnt_sperror(client, svp->sv_hostname); 1886 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1887 mi->mi_vers, mi->mi_aclnames[which], bufp); 1888 if (nfs_has_ctty()) { 1889 if (!(mi->mi_flags & MI_NOPRINT)) { 1890 uprintf("NFS_ACL%d %s failed for %s\n", 1891 mi->mi_vers, mi->mi_aclnames[which], 1892 bufp); 1893 } 1894 } 1895 kmem_free(bufp, MAXPATHLEN); 1896 #else 1897 zprintf(zoneid, 1898 "NFS %s failed for server %s: error %d (%s)\n", 1899 mi->mi_aclnames[which], svp->sv_hostname, 1900 status, clnt_sperrno(status)); 1901 if (nfs_has_ctty()) { 1902 if (!(mi->mi_flags & MI_NOPRINT)) 1903 uprintf( 1904 "NFS %s failed for server %s: error %d (%s)\n", 1905 mi->mi_aclnames[which], 1906 svp->sv_hostname, status, 1907 clnt_sperrno(status)); 1908 } 1909 #endif 1910 /* 1911 * when CLNT_CALL() fails with RPC_AUTHERROR, 1912 * re_errno is set appropriately depending on 1913 * the authentication error 1914 */ 1915 if (status == RPC_VERSMISMATCH || 1916 status == RPC_PROGVERSMISMATCH) 1917 rpcerr.re_errno = EIO; 1918 } 1919 } else { 1920 /* 1921 * Test the value of mi_down and mi_printed without 1922 * holding the mi_lock mutex. If they are both zero, 1923 * then it is okay to skip the down and printed 1924 * processing. This saves on a mutex_enter and 1925 * mutex_exit pair for a normal, successful RPC. 1926 * This was just complete overhead. 
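 *
 * A stale read here is harmless: at worst, clearing MI_DOWN and
 * MI_PRINTED (and printing the "server ok" message) is deferred
 * until a later successful call, since the flags are only updated
 * while holding mi_lock below.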
1927 */ 1928 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1929 mutex_enter(&mi->mi_lock); 1930 mi->mi_flags &= ~MI_DOWN; 1931 if (mi->mi_flags & MI_PRINTED) { 1932 mi->mi_flags &= ~MI_PRINTED; 1933 mutex_exit(&mi->mi_lock); 1934 #ifdef DEBUG 1935 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1936 mi->mi_vers, svp->sv_hostname); 1937 #else 1938 zprintf(zoneid, "NFS server %s ok\n", 1939 svp->sv_hostname); 1940 #endif 1941 } else 1942 mutex_exit(&mi->mi_lock); 1943 } 1944 1945 if (*douprintf == 0) { 1946 if (!(mi->mi_flags & MI_NOPRINT)) 1947 #ifdef DEBUG 1948 uprintf("NFS_ACL%d server %s ok\n", 1949 mi->mi_vers, svp->sv_hostname); 1950 #else 1951 uprintf("NFS server %s ok\n", svp->sv_hostname); 1952 #endif 1953 *douprintf = 1; 1954 } 1955 } 1956 1957 clfree_impl(client, ch, nfscl); 1958 if (cred_cloned) 1959 crfree(cr); 1960 1961 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1962 1963 #if 0 /* notyet */ 1964 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1965 rpcerr.re_errno); 1966 #endif 1967 1968 return (rpcerr.re_errno); 1969 } 1970 1971 int 1972 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1973 { 1974 uint_t mask = vap->va_mask; 1975 1976 if (!(mask & AT_MODE)) 1977 sa->sa_mode = (uint32_t)-1; 1978 else 1979 sa->sa_mode = vap->va_mode; 1980 if (!(mask & AT_UID)) 1981 sa->sa_uid = (uint32_t)-1; 1982 else 1983 sa->sa_uid = (uint32_t)vap->va_uid; 1984 if (!(mask & AT_GID)) 1985 sa->sa_gid = (uint32_t)-1; 1986 else 1987 sa->sa_gid = (uint32_t)vap->va_gid; 1988 if (!(mask & AT_SIZE)) 1989 sa->sa_size = (uint32_t)-1; 1990 else 1991 sa->sa_size = (uint32_t)vap->va_size; 1992 if (!(mask & AT_ATIME)) 1993 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 1994 else { 1995 /* check time validity */ 1996 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1997 return (EOVERFLOW); 1998 } 1999 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2000 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2001 } 2002 if (!(mask & AT_MTIME)) 2003 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2004 else { 2005 /* check time validity */ 2006 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2007 return (EOVERFLOW); 2008 } 2009 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2010 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2011 } 2012 return (0); 2013 } 2014 2015 int 2016 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2017 { 2018 uint_t mask = vap->va_mask; 2019 2020 if (!(mask & AT_MODE)) 2021 sa->mode.set_it = FALSE; 2022 else { 2023 sa->mode.set_it = TRUE; 2024 sa->mode.mode = (mode3)vap->va_mode; 2025 } 2026 if (!(mask & AT_UID)) 2027 sa->uid.set_it = FALSE; 2028 else { 2029 sa->uid.set_it = TRUE; 2030 sa->uid.uid = (uid3)vap->va_uid; 2031 } 2032 if (!(mask & AT_GID)) 2033 sa->gid.set_it = FALSE; 2034 else { 2035 sa->gid.set_it = TRUE; 2036 sa->gid.gid = (gid3)vap->va_gid; 2037 } 2038 if (!(mask & AT_SIZE)) 2039 sa->size.set_it = FALSE; 2040 else { 2041 sa->size.set_it = TRUE; 2042 sa->size.size = (size3)vap->va_size; 2043 } 2044 if (!(mask & AT_ATIME)) 2045 sa->atime.set_it = DONT_CHANGE; 2046 else { 2047 /* check time validity */ 2048 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2049 return (EOVERFLOW); 2050 } 2051 sa->atime.set_it = SET_TO_CLIENT_TIME; 2052 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2053 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2054 } 2055 if (!(mask & AT_MTIME)) 2056 sa->mtime.set_it = DONT_CHANGE; 2057 else { 2058 /* check time validity */ 2059 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2060 return (EOVERFLOW); 2061 } 2062 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2063 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2064 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2065 } 2066 return (0); 2067 } 2068 2069 void 2070 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2071 { 2072 2073 da->da_fhandle = VTOFH(dvp); 2074 da->da_name = nm; 2075 da->da_flags = 0; 2076 } 2077 2078 void 2079 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2080 { 2081 2082 da->dirp = VTOFH3(dvp); 2083 da->name = nm; 2084 } 2085 2086 int 2087 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2088 { 2089 int error; 2090 rnode_t *rp; 2091 struct vattr va; 2092 2093 va.va_mask = AT_MODE | AT_GID; 2094 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2095 if (error) 2096 return (error); 2097 2098 /* 2099 * To determine the expected group-id of the created file: 2100 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2101 * GRPID option, and the directory's set-gid bit is clear, 2102 * then use the process's gid. 2103 * 2) Otherwise, set the group-id to the gid of the parent directory. 2104 */ 2105 rp = VTOR(dvp); 2106 mutex_enter(&rp->r_statelock); 2107 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2108 *gidp = crgetgid(cr); 2109 else 2110 *gidp = va.va_gid; 2111 mutex_exit(&rp->r_statelock); 2112 return (0); 2113 } 2114 2115 int 2116 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2117 { 2118 int error; 2119 struct vattr va; 2120 2121 va.va_mask = AT_MODE; 2122 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2123 if (error) 2124 return (error); 2125 2126 /* 2127 * Modify the expected mode (om) so that the set-gid bit matches 2128 * that of the parent directory (dvp). 2129 */ 2130 if (va.va_mode & VSGID) 2131 *omp |= VSGID; 2132 else 2133 *omp &= ~VSGID; 2134 return (0); 2135 } 2136 2137 void 2138 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2139 { 2140 2141 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2142 if (!(vp->v_flag & VSWAPLIKE)) { 2143 mutex_enter(&vp->v_lock); 2144 vp->v_flag |= VSWAPLIKE; 2145 mutex_exit(&vp->v_lock); 2146 } 2147 } else { 2148 if (vp->v_flag & VSWAPLIKE) { 2149 mutex_enter(&vp->v_lock); 2150 vp->v_flag &= ~VSWAPLIKE; 2151 mutex_exit(&vp->v_lock); 2152 } 2153 } 2154 } 2155 2156 /* 2157 * Free the resources associated with an rnode. 2158 */ 2159 static void 2160 rinactive(rnode_t *rp, cred_t *cr) 2161 { 2162 vnode_t *vp; 2163 cred_t *cred; 2164 char *contents; 2165 int size; 2166 vsecattr_t *vsp; 2167 int error; 2168 nfs3_pathconf_info *info; 2169 2170 /* 2171 * Before freeing anything, wait until all asynchronous 2172 * activity is done on this rnode. This will allow all 2173 * asynchronous read ahead and write behind i/o's to 2174 * finish. 2175 */ 2176 mutex_enter(&rp->r_statelock); 2177 while (rp->r_count > 0) 2178 cv_wait(&rp->r_cv, &rp->r_statelock); 2179 mutex_exit(&rp->r_statelock); 2180 2181 /* 2182 * Flush and invalidate all pages associated with the vnode. 
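 *
 * Only dirty pages are pushed here (RDIRTY set and no error already
 * latched); if the write-behind fails with ENOSPC or EDQUOT, the
 * error is saved in r_error below, presumably so that it is not
 * silently lost when the pages are subsequently invalidated.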
2183 */ 2184 vp = RTOV(rp); 2185 if (vn_has_cached_data(vp)) { 2186 ASSERT(vp->v_type != VCHR); 2187 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2188 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2189 if (error && (error == ENOSPC || error == EDQUOT)) { 2190 mutex_enter(&rp->r_statelock); 2191 if (!rp->r_error) 2192 rp->r_error = error; 2193 mutex_exit(&rp->r_statelock); 2194 } 2195 } 2196 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2197 } 2198 2199 /* 2200 * Free any held credentials and caches which may be associated 2201 * with this rnode. 2202 */ 2203 mutex_enter(&rp->r_statelock); 2204 cred = rp->r_cred; 2205 rp->r_cred = NULL; 2206 contents = rp->r_symlink.contents; 2207 size = rp->r_symlink.size; 2208 rp->r_symlink.contents = NULL; 2209 vsp = rp->r_secattr; 2210 rp->r_secattr = NULL; 2211 info = rp->r_pathconf; 2212 rp->r_pathconf = NULL; 2213 mutex_exit(&rp->r_statelock); 2214 2215 /* 2216 * Free the held credential. 2217 */ 2218 if (cred != NULL) 2219 crfree(cred); 2220 2221 /* 2222 * Free the access cache entries. 2223 */ 2224 (void) nfs_access_purge_rp(rp); 2225 2226 /* 2227 * Free the readdir cache entries. 2228 */ 2229 if (HAVE_RDDIR_CACHE(rp)) 2230 nfs_purge_rddir_cache(vp); 2231 2232 /* 2233 * Free the symbolic link cache. 2234 */ 2235 if (contents != NULL) { 2236 2237 kmem_free((void *)contents, size); 2238 } 2239 2240 /* 2241 * Free any cached ACL. 2242 */ 2243 if (vsp != NULL) 2244 nfs_acl_free(vsp); 2245 2246 /* 2247 * Free any cached pathconf information. 2248 */ 2249 if (info != NULL) 2250 kmem_free(info, sizeof (*info)); 2251 } 2252 2253 /* 2254 * Return a vnode for the given NFS Version 2 file handle. 2255 * If no rnode exists for this fhandle, create one and put it 2256 * into the hash queues. If the rnode for this fhandle 2257 * already exists, return it. 2258 * 2259 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2260 */ 2261 vnode_t * 2262 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2263 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2264 { 2265 int newnode; 2266 int index; 2267 vnode_t *vp; 2268 nfs_fhandle nfh; 2269 vattr_t va; 2270 2271 nfh.fh_len = NFS_FHSIZE; 2272 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2273 2274 index = rtablehash(&nfh); 2275 rw_enter(&rtable[index].r_lock, RW_READER); 2276 2277 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2278 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2279 2280 if (attr != NULL) { 2281 if (!newnode) { 2282 rw_exit(&rtable[index].r_lock); 2283 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2284 } else { 2285 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2286 vp->v_type = VBAD; 2287 else 2288 vp->v_type = n2v_type(attr); 2289 /* 2290 * A translation here seems to be necessary 2291 * because this function can be called 2292 * with `attr' that has come from the wire, 2293 * and been operated on by vattr_to_nattr(). 2294 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2295 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2296 * ->makenfsnode(). 2297 */ 2298 if ((attr->na_rdev & 0xffff0000) == 0) 2299 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2300 else 2301 vp->v_rdev = expldev(n2v_rdev(attr)); 2302 nfs_attrcache(vp, attr, t); 2303 rw_exit(&rtable[index].r_lock); 2304 } 2305 } else { 2306 if (newnode) { 2307 PURGE_ATTRCACHE(vp); 2308 } 2309 rw_exit(&rtable[index].r_lock); 2310 } 2311 2312 return (vp); 2313 } 2314 2315 /* 2316 * Return a vnode for the given NFS Version 3 file handle. 
2317 * If no rnode exists for this fhandle, create one and put it 2318 * into the hash queues. If the rnode for this fhandle 2319 * already exists, return it. 2320 * 2321 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2322 */ 2323 vnode_t * 2324 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2325 cred_t *cr, char *dnm, char *nm) 2326 { 2327 int newnode; 2328 int index; 2329 vnode_t *vp; 2330 2331 index = rtablehash((nfs_fhandle *)fh); 2332 rw_enter(&rtable[index].r_lock, RW_READER); 2333 2334 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2335 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2336 dnm, nm); 2337 2338 if (vap == NULL) { 2339 if (newnode) { 2340 PURGE_ATTRCACHE(vp); 2341 } 2342 rw_exit(&rtable[index].r_lock); 2343 return (vp); 2344 } 2345 2346 if (!newnode) { 2347 rw_exit(&rtable[index].r_lock); 2348 nfs_attr_cache(vp, vap, t, cr); 2349 } else { 2350 rnode_t *rp = VTOR(vp); 2351 2352 vp->v_type = vap->va_type; 2353 vp->v_rdev = vap->va_rdev; 2354 2355 mutex_enter(&rp->r_statelock); 2356 if (rp->r_mtime <= t) 2357 nfs_attrcache_va(vp, vap); 2358 mutex_exit(&rp->r_statelock); 2359 rw_exit(&rtable[index].r_lock); 2360 } 2361 2362 return (vp); 2363 } 2364 2365 vnode_t * 2366 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2367 cred_t *cr, char *dnm, char *nm) 2368 { 2369 int newnode; 2370 int index; 2371 vnode_t *vp; 2372 vattr_t va; 2373 2374 index = rtablehash((nfs_fhandle *)fh); 2375 rw_enter(&rtable[index].r_lock, RW_READER); 2376 2377 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2378 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2379 dnm, nm); 2380 2381 if (attr == NULL) { 2382 if (newnode) { 2383 PURGE_ATTRCACHE(vp); 2384 } 2385 rw_exit(&rtable[index].r_lock); 2386 return (vp); 2387 } 2388 2389 if (!newnode) { 2390 rw_exit(&rtable[index].r_lock); 2391 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2392 } else { 2393 if (attr->type < NF3REG || attr->type > NF3FIFO) 2394 vp->v_type = VBAD; 2395 else 2396 vp->v_type = nf3_to_vt[attr->type]; 2397 vp->v_rdev = makedevice(attr->rdev.specdata1, 2398 attr->rdev.specdata2); 2399 nfs3_attrcache(vp, attr, t); 2400 rw_exit(&rtable[index].r_lock); 2401 } 2402 2403 return (vp); 2404 } 2405 2406 /* 2407 * Read this comment before making changes to rtablehash()! 2408 * This is a hash function in which seemingly obvious and harmless 2409 * changes can cause escalations costing million dollars! 2410 * Know what you are doing. 2411 * 2412 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2413 * algorithm is currently detailed here: 2414 * 2415 * http://burtleburtle.net/bob/hash/doobs.html 2416 * 2417 * Of course, the above link may not be valid by the time you are reading 2418 * this, but suffice it to say that the one-at-a-time algorithm works well in 2419 * almost all cases. If you are changing the algorithm be sure to verify that 2420 * the hash algorithm still provides even distribution in all cases and with 2421 * any server returning filehandles in whatever order (sequential or random). 
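 *
 * An illustrative sketch of the mixing done below, assuming the
 * filehandle bytes are fed in one at a time (the function body is
 * the authoritative version):
 *
 *	for each byte b of fh->fh_buf:
 *		hash += b;
 *		hash += hash << 10;
 *		hash ^= hash >> 6;
 *	hash += hash << 3;
 *	hash ^= hash >> 11;
 *	hash += hash << 15;
 *	bucket = hash & rtablemask;	(rtablemask is rtablesize - 1,
 *					a power of two minus one, so
 *					this selects hash mod rtablesize)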
2422 */ 2423 static int 2424 rtablehash(nfs_fhandle *fh) 2425 { 2426 ulong_t hash, len, i; 2427 char *key; 2428 2429 key = fh->fh_buf; 2430 len = (ulong_t)fh->fh_len; 2431 for (hash = 0, i = 0; i < len; i++) { 2432 hash += key[i]; 2433 hash += (hash << 10); 2434 hash ^= (hash >> 6); 2435 } 2436 hash += (hash << 3); 2437 hash ^= (hash >> 11); 2438 hash += (hash << 15); 2439 return (hash & rtablemask); 2440 } 2441 2442 static vnode_t * 2443 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2444 struct vnodeops *vops, 2445 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2446 int (*compar)(const void *, const void *), 2447 int *newnode, cred_t *cr, char *dnm, char *nm) 2448 { 2449 rnode_t *rp; 2450 rnode_t *trp; 2451 vnode_t *vp; 2452 mntinfo_t *mi; 2453 2454 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2455 2456 mi = VFTOMI(vfsp); 2457 start: 2458 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2459 vp = RTOV(rp); 2460 nfs_set_vroot(vp); 2461 *newnode = 0; 2462 return (vp); 2463 } 2464 rw_exit(&rhtp->r_lock); 2465 2466 mutex_enter(&rpfreelist_lock); 2467 if (rpfreelist != NULL && rnew >= nrnode) { 2468 rp = rpfreelist; 2469 rp_rmfree(rp); 2470 mutex_exit(&rpfreelist_lock); 2471 2472 vp = RTOV(rp); 2473 2474 if (rp->r_flags & RHASHED) { 2475 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2476 mutex_enter(&vp->v_lock); 2477 if (vp->v_count > 1) { 2478 vp->v_count--; 2479 mutex_exit(&vp->v_lock); 2480 rw_exit(&rp->r_hashq->r_lock); 2481 rw_enter(&rhtp->r_lock, RW_READER); 2482 goto start; 2483 } 2484 mutex_exit(&vp->v_lock); 2485 rp_rmhash_locked(rp); 2486 rw_exit(&rp->r_hashq->r_lock); 2487 } 2488 2489 rinactive(rp, cr); 2490 2491 mutex_enter(&vp->v_lock); 2492 if (vp->v_count > 1) { 2493 vp->v_count--; 2494 mutex_exit(&vp->v_lock); 2495 rw_enter(&rhtp->r_lock, RW_READER); 2496 goto start; 2497 } 2498 mutex_exit(&vp->v_lock); 2499 vn_invalid(vp); 2500 /* 2501 * destroy old locks before bzero'ing and 2502 * recreating the locks below. 2503 */ 2504 nfs_rw_destroy(&rp->r_rwlock); 2505 nfs_rw_destroy(&rp->r_lkserlock); 2506 mutex_destroy(&rp->r_statelock); 2507 cv_destroy(&rp->r_cv); 2508 cv_destroy(&rp->r_commit.c_cv); 2509 nfs_free_r_path(rp); 2510 avl_destroy(&rp->r_dir); 2511 /* 2512 * Make sure that if rnode is recycled then 2513 * VFS count is decremented properly before 2514 * reuse. 
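 *
 * The VFS_RELE() below balances the VFS_HOLD() taken further down
 * when this rnode was first set up for its previous filesystem, so
 * recycling an rnode does not leak a vfs reference.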
2515 */ 2516 VFS_RELE(vp->v_vfsp); 2517 vn_reinit(vp); 2518 } else { 2519 vnode_t *new_vp; 2520 2521 mutex_exit(&rpfreelist_lock); 2522 2523 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2524 new_vp = vn_alloc(KM_SLEEP); 2525 2526 atomic_add_long((ulong_t *)&rnew, 1); 2527 #ifdef DEBUG 2528 clstat_debug.nrnode.value.ui64++; 2529 #endif 2530 vp = new_vp; 2531 } 2532 2533 bzero(rp, sizeof (*rp)); 2534 rp->r_vnode = vp; 2535 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2536 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2537 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2538 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2539 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2540 rp->r_fh.fh_len = fh->fh_len; 2541 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2542 rp->r_server = mi->mi_curr_serv; 2543 if (FAILOVER_MOUNT(mi)) { 2544 /* 2545 * If replicated servers, stash pathnames 2546 */ 2547 if (dnm != NULL && nm != NULL) { 2548 char *s, *p; 2549 uint_t len; 2550 2551 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2552 rp->r_path = kmem_alloc(len, KM_SLEEP); 2553 #ifdef DEBUG 2554 clstat_debug.rpath.value.ui64 += len; 2555 #endif 2556 s = rp->r_path; 2557 for (p = dnm; *p; p++) 2558 *s++ = *p; 2559 *s++ = '/'; 2560 for (p = nm; *p; p++) 2561 *s++ = *p; 2562 *s = '\0'; 2563 } else { 2564 /* special case for root */ 2565 rp->r_path = kmem_alloc(2, KM_SLEEP); 2566 #ifdef DEBUG 2567 clstat_debug.rpath.value.ui64 += 2; 2568 #endif 2569 *rp->r_path = '.'; 2570 *(rp->r_path + 1) = '\0'; 2571 } 2572 } 2573 VFS_HOLD(vfsp); 2574 rp->r_putapage = putapage; 2575 rp->r_hashq = rhtp; 2576 rp->r_flags = RREADDIRPLUS; 2577 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2578 offsetof(rddir_cache, tree)); 2579 vn_setops(vp, vops); 2580 vp->v_data = (caddr_t)rp; 2581 vp->v_vfsp = vfsp; 2582 vp->v_type = VNON; 2583 nfs_set_vroot(vp); 2584 2585 /* 2586 * There is a race condition if someone else 2587 * alloc's the rnode while no locks are held, so we 2588 * check again and recover if found. 2589 */ 2590 rw_enter(&rhtp->r_lock, RW_WRITER); 2591 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2592 vp = RTOV(trp); 2593 nfs_set_vroot(vp); 2594 *newnode = 0; 2595 rw_exit(&rhtp->r_lock); 2596 rp_addfree(rp, cr); 2597 rw_enter(&rhtp->r_lock, RW_READER); 2598 return (vp); 2599 } 2600 rp_addhash(rp); 2601 *newnode = 1; 2602 return (vp); 2603 } 2604 2605 static void 2606 nfs_set_vroot(vnode_t *vp) 2607 { 2608 rnode_t *rp; 2609 nfs_fhandle *rootfh; 2610 2611 rp = VTOR(vp); 2612 rootfh = &rp->r_server->sv_fhandle; 2613 if (rootfh->fh_len == rp->r_fh.fh_len && 2614 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2615 if (!(vp->v_flag & VROOT)) { 2616 mutex_enter(&vp->v_lock); 2617 vp->v_flag |= VROOT; 2618 mutex_exit(&vp->v_lock); 2619 } 2620 } 2621 } 2622 2623 static void 2624 nfs_free_r_path(rnode_t *rp) 2625 { 2626 char *path; 2627 size_t len; 2628 2629 path = rp->r_path; 2630 if (path) { 2631 rp->r_path = NULL; 2632 len = strlen(path) + 1; 2633 kmem_free(path, len); 2634 #ifdef DEBUG 2635 clstat_debug.rpath.value.ui64 -= len; 2636 #endif 2637 } 2638 } 2639 2640 /* 2641 * Put an rnode on the free list. 2642 * 2643 * Rnodes which were allocated above and beyond the normal limit 2644 * are immediately freed. 
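 *
 * For illustration: the freelist is a circular, doubly linked list
 * threaded through r_freef/r_freeb, with rpfreelist pointing at its
 * head; the head is the first rnode that make_rnode() will recycle.
 * A sketch with three free rnodes A, B and C (A at the head):
 *
 *	rpfreelist -> A <-> B <-> C -> (wraps back to A)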
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
    vnode_t *vp;
    struct vfs *vfsp;

    vp = RTOV(rp);
    ASSERT(vp->v_count >= 1);
    ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

    /*
     * If we have too many rnodes allocated and there are no
     * references to this rnode, or if the rnode is no longer
     * accessible because it does not reside in the hash queues,
     * or if an i/o error occurred while writing to the file,
     * then just free it instead of putting it on the rnode
     * freelist.
     */
    vfsp = vp->v_vfsp;
    if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
        (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
        if (rp->r_flags & RHASHED) {
            rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
            mutex_enter(&vp->v_lock);
            if (vp->v_count > 1) {
                vp->v_count--;
                mutex_exit(&vp->v_lock);
                rw_exit(&rp->r_hashq->r_lock);
                return;
            }
            mutex_exit(&vp->v_lock);
            rp_rmhash_locked(rp);
            rw_exit(&rp->r_hashq->r_lock);
        }

        rinactive(rp, cr);

        /*
         * Recheck the vnode reference count. We need to
         * make sure that another reference has not been
         * acquired while we were not holding v_lock. The
         * rnode is not in the rnode hash queues, so the
         * only way for a reference to have been acquired
         * is for a VOP_PUTPAGE because the rnode was marked
         * with RDIRTY or for a modified page. This
         * reference may have been acquired before our call
         * to rinactive. The i/o may have been completed,
         * thus allowing rinactive to complete, but the
         * reference to the vnode may not have been released
         * yet. In any case, the rnode can not be destroyed
         * until the other references to this vnode have been
         * released. The other references will take care of
         * either destroying the rnode or placing it on the
         * rnode freelist. If there are no other references,
         * then the rnode may be safely destroyed.
         */
        mutex_enter(&vp->v_lock);
        if (vp->v_count > 1) {
            vp->v_count--;
            mutex_exit(&vp->v_lock);
            return;
        }
        mutex_exit(&vp->v_lock);

        destroy_rnode(rp);
        return;
    }

    /*
     * Lock the hash queue and then recheck the reference count
     * to ensure that no other threads have acquired a reference,
     * which would indicate that the rnode should not be placed on
     * the freelist. If another reference has been acquired, then
     * just release this one and let the other thread complete
     * the processing of adding this rnode to the freelist.
     */
    rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

    mutex_enter(&vp->v_lock);
    if (vp->v_count > 1) {
        vp->v_count--;
        mutex_exit(&vp->v_lock);
        rw_exit(&rp->r_hashq->r_lock);
        return;
    }
    mutex_exit(&vp->v_lock);

    /*
     * If there is no cached data or metadata for this file, then
     * put the rnode on the front of the freelist so that it will
     * be reused before other rnodes which may have cached data or
     * metadata associated with them.
2738 */ 2739 mutex_enter(&rpfreelist_lock); 2740 if (rpfreelist == NULL) { 2741 rp->r_freef = rp; 2742 rp->r_freeb = rp; 2743 rpfreelist = rp; 2744 } else { 2745 rp->r_freef = rpfreelist; 2746 rp->r_freeb = rpfreelist->r_freeb; 2747 rpfreelist->r_freeb->r_freef = rp; 2748 rpfreelist->r_freeb = rp; 2749 if (!vn_has_cached_data(vp) && 2750 !HAVE_RDDIR_CACHE(rp) && 2751 rp->r_symlink.contents == NULL && 2752 rp->r_secattr == NULL && 2753 rp->r_pathconf == NULL) 2754 rpfreelist = rp; 2755 } 2756 mutex_exit(&rpfreelist_lock); 2757 2758 rw_exit(&rp->r_hashq->r_lock); 2759 } 2760 2761 /* 2762 * Remove an rnode from the free list. 2763 * 2764 * The caller must be holding rpfreelist_lock and the rnode 2765 * must be on the freelist. 2766 */ 2767 static void 2768 rp_rmfree(rnode_t *rp) 2769 { 2770 2771 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2772 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2773 2774 if (rp == rpfreelist) { 2775 rpfreelist = rp->r_freef; 2776 if (rp == rpfreelist) 2777 rpfreelist = NULL; 2778 } 2779 2780 rp->r_freeb->r_freef = rp->r_freef; 2781 rp->r_freef->r_freeb = rp->r_freeb; 2782 2783 rp->r_freef = rp->r_freeb = NULL; 2784 } 2785 2786 /* 2787 * Put a rnode in the hash table. 2788 * 2789 * The caller must be holding the exclusive hash queue lock. 2790 */ 2791 static void 2792 rp_addhash(rnode_t *rp) 2793 { 2794 2795 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2796 ASSERT(!(rp->r_flags & RHASHED)); 2797 2798 rp->r_hashf = rp->r_hashq->r_hashf; 2799 rp->r_hashq->r_hashf = rp; 2800 rp->r_hashb = (rnode_t *)rp->r_hashq; 2801 rp->r_hashf->r_hashb = rp; 2802 2803 mutex_enter(&rp->r_statelock); 2804 rp->r_flags |= RHASHED; 2805 mutex_exit(&rp->r_statelock); 2806 } 2807 2808 /* 2809 * Remove a rnode from the hash table. 2810 * 2811 * The caller must be holding the hash queue lock. 2812 */ 2813 static void 2814 rp_rmhash_locked(rnode_t *rp) 2815 { 2816 2817 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2818 ASSERT(rp->r_flags & RHASHED); 2819 2820 rp->r_hashb->r_hashf = rp->r_hashf; 2821 rp->r_hashf->r_hashb = rp->r_hashb; 2822 2823 mutex_enter(&rp->r_statelock); 2824 rp->r_flags &= ~RHASHED; 2825 mutex_exit(&rp->r_statelock); 2826 } 2827 2828 /* 2829 * Remove a rnode from the hash table. 2830 * 2831 * The caller must not be holding the hash queue lock. 2832 */ 2833 void 2834 rp_rmhash(rnode_t *rp) 2835 { 2836 2837 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2838 rp_rmhash_locked(rp); 2839 rw_exit(&rp->r_hashq->r_lock); 2840 } 2841 2842 /* 2843 * Lookup a rnode by fhandle. 2844 * 2845 * The caller must be holding the hash queue lock, either shared or exclusive. 2846 */ 2847 static rnode_t * 2848 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2849 { 2850 rnode_t *rp; 2851 vnode_t *vp; 2852 2853 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2854 2855 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2856 vp = RTOV(rp); 2857 if (vp->v_vfsp == vfsp && 2858 rp->r_fh.fh_len == fh->fh_len && 2859 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2860 /* 2861 * remove rnode from free list, if necessary. 2862 */ 2863 if (rp->r_freef != NULL) { 2864 mutex_enter(&rpfreelist_lock); 2865 /* 2866 * If the rnode is on the freelist, 2867 * then remove it and use that reference 2868 * as the new reference. Otherwise, 2869 * need to increment the reference count. 
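 *
 * The r_freef test is repeated under rpfreelist_lock because the
 * unlocked check above is only a hint; another thread may have
 * already pulled this rnode off the freelist, in which case we
 * must take our own hold on the vnode instead.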
2870 */ 2871 if (rp->r_freef != NULL) { 2872 rp_rmfree(rp); 2873 mutex_exit(&rpfreelist_lock); 2874 } else { 2875 mutex_exit(&rpfreelist_lock); 2876 VN_HOLD(vp); 2877 } 2878 } else 2879 VN_HOLD(vp); 2880 return (rp); 2881 } 2882 } 2883 return (NULL); 2884 } 2885 2886 /* 2887 * Return 1 if there is a active vnode belonging to this vfs in the 2888 * rtable cache. 2889 * 2890 * Several of these checks are done without holding the usual 2891 * locks. This is safe because destroy_rtable(), rp_addfree(), 2892 * etc. will redo the necessary checks before actually destroying 2893 * any rnodes. 2894 */ 2895 int 2896 check_rtable(struct vfs *vfsp) 2897 { 2898 int index; 2899 rnode_t *rp; 2900 vnode_t *vp; 2901 2902 for (index = 0; index < rtablesize; index++) { 2903 rw_enter(&rtable[index].r_lock, RW_READER); 2904 for (rp = rtable[index].r_hashf; 2905 rp != (rnode_t *)(&rtable[index]); 2906 rp = rp->r_hashf) { 2907 vp = RTOV(rp); 2908 if (vp->v_vfsp == vfsp) { 2909 if (rp->r_freef == NULL || 2910 (vn_has_cached_data(vp) && 2911 (rp->r_flags & RDIRTY)) || 2912 rp->r_count > 0) { 2913 rw_exit(&rtable[index].r_lock); 2914 return (1); 2915 } 2916 } 2917 } 2918 rw_exit(&rtable[index].r_lock); 2919 } 2920 return (0); 2921 } 2922 2923 /* 2924 * Destroy inactive vnodes from the hash queues which belong to this 2925 * vfs. It is essential that we destroy all inactive vnodes during a 2926 * forced unmount as well as during a normal unmount. 2927 */ 2928 void 2929 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2930 { 2931 int index; 2932 rnode_t *rp; 2933 rnode_t *rlist; 2934 rnode_t *r_hashf; 2935 vnode_t *vp; 2936 2937 rlist = NULL; 2938 2939 for (index = 0; index < rtablesize; index++) { 2940 rw_enter(&rtable[index].r_lock, RW_WRITER); 2941 for (rp = rtable[index].r_hashf; 2942 rp != (rnode_t *)(&rtable[index]); 2943 rp = r_hashf) { 2944 /* save the hash pointer before destroying */ 2945 r_hashf = rp->r_hashf; 2946 vp = RTOV(rp); 2947 if (vp->v_vfsp == vfsp) { 2948 mutex_enter(&rpfreelist_lock); 2949 if (rp->r_freef != NULL) { 2950 rp_rmfree(rp); 2951 mutex_exit(&rpfreelist_lock); 2952 rp_rmhash_locked(rp); 2953 rp->r_hashf = rlist; 2954 rlist = rp; 2955 } else 2956 mutex_exit(&rpfreelist_lock); 2957 } 2958 } 2959 rw_exit(&rtable[index].r_lock); 2960 } 2961 2962 for (rp = rlist; rp != NULL; rp = rlist) { 2963 rlist = rp->r_hashf; 2964 /* 2965 * This call to rp_addfree will end up destroying the 2966 * rnode, but in a safe way with the appropriate set 2967 * of checks done. 2968 */ 2969 rp_addfree(rp, cr); 2970 } 2971 2972 } 2973 2974 /* 2975 * This routine destroys all the resources associated with the rnode 2976 * and then the rnode itself. 
2977 */ 2978 static void 2979 destroy_rnode(rnode_t *rp) 2980 { 2981 vnode_t *vp; 2982 vfs_t *vfsp; 2983 2984 vp = RTOV(rp); 2985 vfsp = vp->v_vfsp; 2986 2987 ASSERT(vp->v_count == 1); 2988 ASSERT(rp->r_count == 0); 2989 ASSERT(rp->r_lmpl == NULL); 2990 ASSERT(rp->r_mapcnt == 0); 2991 ASSERT(!(rp->r_flags & RHASHED)); 2992 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2993 atomic_add_long((ulong_t *)&rnew, -1); 2994 #ifdef DEBUG 2995 clstat_debug.nrnode.value.ui64--; 2996 #endif 2997 nfs_rw_destroy(&rp->r_rwlock); 2998 nfs_rw_destroy(&rp->r_lkserlock); 2999 mutex_destroy(&rp->r_statelock); 3000 cv_destroy(&rp->r_cv); 3001 cv_destroy(&rp->r_commit.c_cv); 3002 if (rp->r_flags & RDELMAPLIST) 3003 list_destroy(&rp->r_indelmap); 3004 nfs_free_r_path(rp); 3005 avl_destroy(&rp->r_dir); 3006 vn_invalid(vp); 3007 vn_free(vp); 3008 kmem_cache_free(rnode_cache, rp); 3009 VFS_RELE(vfsp); 3010 } 3011 3012 /* 3013 * Flush all vnodes in this (or every) vfs. 3014 * Used by nfs_sync and by nfs_unmount. 3015 */ 3016 void 3017 rflush(struct vfs *vfsp, cred_t *cr) 3018 { 3019 int index; 3020 rnode_t *rp; 3021 vnode_t *vp, **vplist; 3022 long num, cnt; 3023 3024 /* 3025 * Check to see whether there is anything to do. 3026 */ 3027 num = rnew; 3028 if (num == 0) 3029 return; 3030 3031 /* 3032 * Allocate a slot for all currently active rnodes on the 3033 * supposition that they all may need flushing. 3034 */ 3035 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3036 cnt = 0; 3037 3038 /* 3039 * Walk the hash queues looking for rnodes with page 3040 * lists associated with them. Make a list of these 3041 * files. 3042 */ 3043 for (index = 0; index < rtablesize; index++) { 3044 rw_enter(&rtable[index].r_lock, RW_READER); 3045 for (rp = rtable[index].r_hashf; 3046 rp != (rnode_t *)(&rtable[index]); 3047 rp = rp->r_hashf) { 3048 vp = RTOV(rp); 3049 /* 3050 * Don't bother sync'ing a vp if it 3051 * is part of virtual swap device or 3052 * if VFS is read-only 3053 */ 3054 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3055 continue; 3056 /* 3057 * If flushing all mounted file systems or 3058 * the vnode belongs to this vfs, has pages 3059 * and is marked as either dirty or mmap'd, 3060 * hold and add this vnode to the list of 3061 * vnodes to flush. 3062 */ 3063 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3064 vn_has_cached_data(vp) && 3065 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3066 VN_HOLD(vp); 3067 vplist[cnt++] = vp; 3068 if (cnt == num) { 3069 rw_exit(&rtable[index].r_lock); 3070 goto toomany; 3071 } 3072 } 3073 } 3074 rw_exit(&rtable[index].r_lock); 3075 } 3076 toomany: 3077 3078 /* 3079 * Flush and release all of the files on the list. 3080 */ 3081 while (cnt-- > 0) { 3082 vp = vplist[cnt]; 3083 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3084 VN_RELE(vp); 3085 } 3086 3087 /* 3088 * Free the space allocated to hold the list. 3089 */ 3090 kmem_free(vplist, num * sizeof (*vplist)); 3091 } 3092 3093 /* 3094 * This probably needs to be larger than or equal to 3095 * log2(sizeof (struct rnode)) due to the way that rnodes are 3096 * allocated. 
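 *
 * A worked example, purely for illustration (the real size of
 * struct rnode varies with release and architecture): if rnodes
 * were 512-byte allocations, successive rnode addresses would
 * differ by multiples of 512, so their low 9 bits carry no
 * information. Shifting right by ACACHE_SHIFT_BITS (9) discards
 * those bits before the uid is added and the sum is masked with
 * acachemask in acachehash() below.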
3097 */ 3098 #define ACACHE_SHIFT_BITS 9 3099 3100 static int 3101 acachehash(rnode_t *rp, cred_t *cr) 3102 { 3103 3104 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3105 acachemask); 3106 } 3107 3108 #ifdef DEBUG 3109 static long nfs_access_cache_hits = 0; 3110 static long nfs_access_cache_misses = 0; 3111 #endif 3112 3113 nfs_access_type_t 3114 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3115 { 3116 vnode_t *vp; 3117 acache_t *ap; 3118 acache_hash_t *hp; 3119 nfs_access_type_t all; 3120 3121 vp = RTOV(rp); 3122 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3123 return (NFS_ACCESS_UNKNOWN); 3124 3125 if (rp->r_acache != NULL) { 3126 hp = &acache[acachehash(rp, cr)]; 3127 rw_enter(&hp->lock, RW_READER); 3128 ap = hp->next; 3129 while (ap != (acache_t *)hp) { 3130 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3131 if ((ap->known & acc) == acc) { 3132 #ifdef DEBUG 3133 nfs_access_cache_hits++; 3134 #endif 3135 if ((ap->allowed & acc) == acc) 3136 all = NFS_ACCESS_ALLOWED; 3137 else 3138 all = NFS_ACCESS_DENIED; 3139 } else { 3140 #ifdef DEBUG 3141 nfs_access_cache_misses++; 3142 #endif 3143 all = NFS_ACCESS_UNKNOWN; 3144 } 3145 rw_exit(&hp->lock); 3146 return (all); 3147 } 3148 ap = ap->next; 3149 } 3150 rw_exit(&hp->lock); 3151 } 3152 3153 #ifdef DEBUG 3154 nfs_access_cache_misses++; 3155 #endif 3156 return (NFS_ACCESS_UNKNOWN); 3157 } 3158 3159 void 3160 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3161 { 3162 acache_t *ap; 3163 acache_t *nap; 3164 acache_hash_t *hp; 3165 3166 hp = &acache[acachehash(rp, cr)]; 3167 3168 /* 3169 * Allocate now assuming that mostly an allocation will be 3170 * required. This allows the allocation to happen without 3171 * holding the hash bucket locked. 3172 */ 3173 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3174 if (nap != NULL) { 3175 nap->known = acc; 3176 nap->allowed = resacc; 3177 nap->rnode = rp; 3178 crhold(cr); 3179 nap->cred = cr; 3180 nap->hashq = hp; 3181 } 3182 3183 rw_enter(&hp->lock, RW_WRITER); 3184 3185 if (rp->r_acache != NULL) { 3186 ap = hp->next; 3187 while (ap != (acache_t *)hp) { 3188 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3189 ap->known |= acc; 3190 ap->allowed &= ~acc; 3191 ap->allowed |= resacc; 3192 rw_exit(&hp->lock); 3193 if (nap != NULL) { 3194 crfree(nap->cred); 3195 kmem_cache_free(acache_cache, nap); 3196 } 3197 return; 3198 } 3199 ap = ap->next; 3200 } 3201 } 3202 3203 if (nap != NULL) { 3204 #ifdef DEBUG 3205 clstat_debug.access.value.ui64++; 3206 #endif 3207 nap->next = hp->next; 3208 hp->next = nap; 3209 nap->next->prev = nap; 3210 nap->prev = (acache_t *)hp; 3211 3212 mutex_enter(&rp->r_statelock); 3213 nap->list = rp->r_acache; 3214 rp->r_acache = nap; 3215 mutex_exit(&rp->r_statelock); 3216 } 3217 3218 rw_exit(&hp->lock); 3219 } 3220 3221 int 3222 nfs_access_purge_rp(rnode_t *rp) 3223 { 3224 acache_t *ap; 3225 acache_t *tmpap; 3226 acache_t *rplist; 3227 3228 /* 3229 * If there aren't any cached entries, then there is nothing 3230 * to free. 3231 */ 3232 if (rp->r_acache == NULL) 3233 return (0); 3234 3235 mutex_enter(&rp->r_statelock); 3236 rplist = rp->r_acache; 3237 rp->r_acache = NULL; 3238 mutex_exit(&rp->r_statelock); 3239 3240 /* 3241 * Loop through each entry in the list pointed to in the 3242 * rnode. Remove each of these entries from the hash 3243 * queue that it is on and remove it from the list in 3244 * the rnode. 
3245 */ 3246 for (ap = rplist; ap != NULL; ap = tmpap) { 3247 rw_enter(&ap->hashq->lock, RW_WRITER); 3248 ap->prev->next = ap->next; 3249 ap->next->prev = ap->prev; 3250 rw_exit(&ap->hashq->lock); 3251 3252 tmpap = ap->list; 3253 crfree(ap->cred); 3254 kmem_cache_free(acache_cache, ap); 3255 #ifdef DEBUG 3256 clstat_debug.access.value.ui64--; 3257 #endif 3258 } 3259 3260 return (1); 3261 } 3262 3263 static const char prefix[] = ".nfs"; 3264 3265 static kmutex_t newnum_lock; 3266 3267 int 3268 newnum(void) 3269 { 3270 static uint_t newnum = 0; 3271 uint_t id; 3272 3273 mutex_enter(&newnum_lock); 3274 if (newnum == 0) 3275 newnum = gethrestime_sec() & 0xffff; 3276 id = newnum++; 3277 mutex_exit(&newnum_lock); 3278 return (id); 3279 } 3280 3281 char * 3282 newname(void) 3283 { 3284 char *news; 3285 char *s; 3286 const char *p; 3287 uint_t id; 3288 3289 id = newnum(); 3290 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3291 s = news; 3292 p = prefix; 3293 while (*p != '\0') 3294 *s++ = *p++; 3295 while (id != 0) { 3296 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3297 id >>= 4; 3298 } 3299 *s = '\0'; 3300 return (news); 3301 } 3302 3303 /* 3304 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3305 * framework. 3306 */ 3307 static int 3308 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3309 { 3310 ksp->ks_snaptime = gethrtime(); 3311 if (rw == KSTAT_WRITE) { 3312 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3313 #ifdef DEBUG 3314 /* 3315 * Currently only the global zone can write to kstats, but we 3316 * add the check just for paranoia. 3317 */ 3318 if (INGLOBALZONE(curproc)) 3319 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3320 sizeof (clstat_debug)); 3321 #endif 3322 } else { 3323 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3324 #ifdef DEBUG 3325 /* 3326 * If we're displaying the "global" debug kstat values, we 3327 * display them as-is to all zones since in fact they apply to 3328 * the system as a whole. 
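 *
 * These counters are exported as the nfs:0:nfs_client kstat, so
 * they should be observable from userland with something like
 * "kstat -m nfs -i 0 -n nfs_client" (the invocation is shown only
 * as an illustration of kstat(1M) usage).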
3329 */ 3330 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3331 sizeof (clstat_debug)); 3332 #endif 3333 } 3334 return (0); 3335 } 3336 3337 static void * 3338 clinit_zone(zoneid_t zoneid) 3339 { 3340 kstat_t *nfs_client_kstat; 3341 struct nfs_clnt *nfscl; 3342 uint_t ndata; 3343 3344 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3345 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3346 nfscl->nfscl_chtable = NULL; 3347 nfscl->nfscl_zoneid = zoneid; 3348 3349 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3350 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3351 #ifdef DEBUG 3352 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3353 #endif 3354 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3355 "misc", KSTAT_TYPE_NAMED, ndata, 3356 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3357 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3358 nfs_client_kstat->ks_snapshot = cl_snapshot; 3359 kstat_install(nfs_client_kstat); 3360 } 3361 mutex_enter(&nfs_clnt_list_lock); 3362 list_insert_head(&nfs_clnt_list, nfscl); 3363 mutex_exit(&nfs_clnt_list_lock); 3364 return (nfscl); 3365 } 3366 3367 /*ARGSUSED*/ 3368 static void 3369 clfini_zone(zoneid_t zoneid, void *arg) 3370 { 3371 struct nfs_clnt *nfscl = arg; 3372 chhead_t *chp, *next; 3373 3374 if (nfscl == NULL) 3375 return; 3376 mutex_enter(&nfs_clnt_list_lock); 3377 list_remove(&nfs_clnt_list, nfscl); 3378 mutex_exit(&nfs_clnt_list_lock); 3379 clreclaim_zone(nfscl, 0); 3380 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3381 ASSERT(chp->ch_list == NULL); 3382 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3383 next = chp->ch_next; 3384 kmem_free(chp, sizeof (*chp)); 3385 } 3386 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3387 mutex_destroy(&nfscl->nfscl_chtable_lock); 3388 kmem_free(nfscl, sizeof (*nfscl)); 3389 } 3390 3391 /* 3392 * Called by endpnt_destructor to make sure the client handles are 3393 * cleaned up before the RPC endpoints. This becomes a no-op if 3394 * clfini_zone (above) is called first. This function is needed 3395 * (rather than relying on clfini_zone to clean up) because the ZSD 3396 * callbacks have no ordering mechanism, so we have no way to ensure 3397 * that clfini_zone is called before endpnt_destructor. 
3398 */ 3399 void 3400 clcleanup_zone(zoneid_t zoneid) 3401 { 3402 struct nfs_clnt *nfscl; 3403 3404 mutex_enter(&nfs_clnt_list_lock); 3405 nfscl = list_head(&nfs_clnt_list); 3406 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3407 if (nfscl->nfscl_zoneid == zoneid) { 3408 clreclaim_zone(nfscl, 0); 3409 break; 3410 } 3411 } 3412 mutex_exit(&nfs_clnt_list_lock); 3413 } 3414 3415 int 3416 nfs_subrinit(void) 3417 { 3418 int i; 3419 ulong_t nrnode_max; 3420 3421 /* 3422 * Allocate and initialize the rnode hash queues 3423 */ 3424 if (nrnode <= 0) 3425 nrnode = ncsize; 3426 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3427 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3428 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3429 "setting nrnode to max value of %ld", nrnode_max); 3430 nrnode = nrnode_max; 3431 } 3432 3433 rtablesize = 1 << highbit(nrnode / hashlen); 3434 rtablemask = rtablesize - 1; 3435 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3436 for (i = 0; i < rtablesize; i++) { 3437 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3438 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3439 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3440 } 3441 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3442 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3443 3444 /* 3445 * Allocate and initialize the access cache 3446 */ 3447 3448 /* 3449 * Initial guess is one access cache entry per rnode unless 3450 * nacache is set to a non-zero value and then it is used to 3451 * indicate a guess at the number of access cache entries. 3452 */ 3453 if (nacache > 0) 3454 acachesize = 1 << highbit(nacache / hashlen); 3455 else 3456 acachesize = rtablesize; 3457 acachemask = acachesize - 1; 3458 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3459 for (i = 0; i < acachesize; i++) { 3460 acache[i].next = (acache_t *)&acache[i]; 3461 acache[i].prev = (acache_t *)&acache[i]; 3462 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3463 } 3464 acache_cache = kmem_cache_create("nfs_access_cache", 3465 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3466 /* 3467 * Allocate and initialize the client handle cache 3468 */ 3469 chtab_cache = kmem_cache_create("client_handle_cache", 3470 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3471 /* 3472 * Initialize the list of per-zone client handles (and associated data). 3473 * This needs to be done before we call zone_key_create(). 3474 */ 3475 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3476 offsetof(struct nfs_clnt, nfscl_node)); 3477 /* 3478 * Initialize the zone_key for per-zone client handle lists. 
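 *
 * The three callbacks registered below are the per-zone create,
 * shutdown and destroy hooks, in that order; no shutdown-time
 * processing is needed here, hence the NULL.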
3479 */ 3480 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3481 /* 3482 * Initialize the various mutexes and reader/writer locks 3483 */ 3484 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3485 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3486 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3487 3488 /* 3489 * Assign unique major number for all nfs mounts 3490 */ 3491 if ((nfs_major = getudev()) == -1) { 3492 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3493 "nfs: init: can't get unique device number"); 3494 nfs_major = 0; 3495 } 3496 nfs_minor = 0; 3497 3498 if (nfs3_jukebox_delay == 0) 3499 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3500 3501 return (0); 3502 } 3503 3504 void 3505 nfs_subrfini(void) 3506 { 3507 int i; 3508 3509 /* 3510 * Deallocate the rnode hash queues 3511 */ 3512 kmem_cache_destroy(rnode_cache); 3513 3514 for (i = 0; i < rtablesize; i++) 3515 rw_destroy(&rtable[i].r_lock); 3516 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3517 3518 /* 3519 * Deallocated the access cache 3520 */ 3521 kmem_cache_destroy(acache_cache); 3522 3523 for (i = 0; i < acachesize; i++) 3524 rw_destroy(&acache[i].lock); 3525 kmem_free(acache, acachesize * sizeof (*acache)); 3526 3527 /* 3528 * Deallocate the client handle cache 3529 */ 3530 kmem_cache_destroy(chtab_cache); 3531 3532 /* 3533 * Destroy the various mutexes and reader/writer locks 3534 */ 3535 mutex_destroy(&rpfreelist_lock); 3536 mutex_destroy(&newnum_lock); 3537 mutex_destroy(&nfs_minor_lock); 3538 (void) zone_key_delete(nfsclnt_zone_key); 3539 } 3540 3541 enum nfsstat 3542 puterrno(int error) 3543 { 3544 3545 switch (error) { 3546 case EOPNOTSUPP: 3547 return (NFSERR_OPNOTSUPP); 3548 case ENAMETOOLONG: 3549 return (NFSERR_NAMETOOLONG); 3550 case ENOTEMPTY: 3551 return (NFSERR_NOTEMPTY); 3552 case EDQUOT: 3553 return (NFSERR_DQUOT); 3554 case ESTALE: 3555 return (NFSERR_STALE); 3556 case EREMOTE: 3557 return (NFSERR_REMOTE); 3558 case ENOSYS: 3559 return (NFSERR_OPNOTSUPP); 3560 case EOVERFLOW: 3561 return (NFSERR_INVAL); 3562 default: 3563 return ((enum nfsstat)error); 3564 } 3565 /* NOTREACHED */ 3566 } 3567 3568 int 3569 geterrno(enum nfsstat status) 3570 { 3571 3572 switch (status) { 3573 case NFSERR_OPNOTSUPP: 3574 return (EOPNOTSUPP); 3575 case NFSERR_NAMETOOLONG: 3576 return (ENAMETOOLONG); 3577 case NFSERR_NOTEMPTY: 3578 return (ENOTEMPTY); 3579 case NFSERR_DQUOT: 3580 return (EDQUOT); 3581 case NFSERR_STALE: 3582 return (ESTALE); 3583 case NFSERR_REMOTE: 3584 return (EREMOTE); 3585 case NFSERR_WFLUSH: 3586 return (EIO); 3587 default: 3588 return ((int)status); 3589 } 3590 /* NOTREACHED */ 3591 } 3592 3593 enum nfsstat3 3594 puterrno3(int error) 3595 { 3596 3597 #ifdef DEBUG 3598 switch (error) { 3599 case 0: 3600 return (NFS3_OK); 3601 case EPERM: 3602 return (NFS3ERR_PERM); 3603 case ENOENT: 3604 return (NFS3ERR_NOENT); 3605 case EIO: 3606 return (NFS3ERR_IO); 3607 case ENXIO: 3608 return (NFS3ERR_NXIO); 3609 case EACCES: 3610 return (NFS3ERR_ACCES); 3611 case EEXIST: 3612 return (NFS3ERR_EXIST); 3613 case EXDEV: 3614 return (NFS3ERR_XDEV); 3615 case ENODEV: 3616 return (NFS3ERR_NODEV); 3617 case ENOTDIR: 3618 return (NFS3ERR_NOTDIR); 3619 case EISDIR: 3620 return (NFS3ERR_ISDIR); 3621 case EINVAL: 3622 return (NFS3ERR_INVAL); 3623 case EFBIG: 3624 return (NFS3ERR_FBIG); 3625 case ENOSPC: 3626 return (NFS3ERR_NOSPC); 3627 case EROFS: 3628 return (NFS3ERR_ROFS); 3629 case EMLINK: 3630 return (NFS3ERR_MLINK); 3631 case ENAMETOOLONG: 3632 return (NFS3ERR_NAMETOOLONG); 3633 case 
ENOTEMPTY: 3634 return (NFS3ERR_NOTEMPTY); 3635 case EDQUOT: 3636 return (NFS3ERR_DQUOT); 3637 case ESTALE: 3638 return (NFS3ERR_STALE); 3639 case EREMOTE: 3640 return (NFS3ERR_REMOTE); 3641 case ENOSYS: 3642 case EOPNOTSUPP: 3643 return (NFS3ERR_NOTSUPP); 3644 case EOVERFLOW: 3645 return (NFS3ERR_INVAL); 3646 default: 3647 zcmn_err(getzoneid(), CE_WARN, 3648 "puterrno3: got error %d", error); 3649 return ((enum nfsstat3)error); 3650 } 3651 #else 3652 switch (error) { 3653 case ENAMETOOLONG: 3654 return (NFS3ERR_NAMETOOLONG); 3655 case ENOTEMPTY: 3656 return (NFS3ERR_NOTEMPTY); 3657 case EDQUOT: 3658 return (NFS3ERR_DQUOT); 3659 case ESTALE: 3660 return (NFS3ERR_STALE); 3661 case ENOSYS: 3662 case EOPNOTSUPP: 3663 return (NFS3ERR_NOTSUPP); 3664 case EREMOTE: 3665 return (NFS3ERR_REMOTE); 3666 case EOVERFLOW: 3667 return (NFS3ERR_INVAL); 3668 default: 3669 return ((enum nfsstat3)error); 3670 } 3671 #endif 3672 } 3673 3674 int 3675 geterrno3(enum nfsstat3 status) 3676 { 3677 3678 #ifdef DEBUG 3679 switch (status) { 3680 case NFS3_OK: 3681 return (0); 3682 case NFS3ERR_PERM: 3683 return (EPERM); 3684 case NFS3ERR_NOENT: 3685 return (ENOENT); 3686 case NFS3ERR_IO: 3687 return (EIO); 3688 case NFS3ERR_NXIO: 3689 return (ENXIO); 3690 case NFS3ERR_ACCES: 3691 return (EACCES); 3692 case NFS3ERR_EXIST: 3693 return (EEXIST); 3694 case NFS3ERR_XDEV: 3695 return (EXDEV); 3696 case NFS3ERR_NODEV: 3697 return (ENODEV); 3698 case NFS3ERR_NOTDIR: 3699 return (ENOTDIR); 3700 case NFS3ERR_ISDIR: 3701 return (EISDIR); 3702 case NFS3ERR_INVAL: 3703 return (EINVAL); 3704 case NFS3ERR_FBIG: 3705 return (EFBIG); 3706 case NFS3ERR_NOSPC: 3707 return (ENOSPC); 3708 case NFS3ERR_ROFS: 3709 return (EROFS); 3710 case NFS3ERR_MLINK: 3711 return (EMLINK); 3712 case NFS3ERR_NAMETOOLONG: 3713 return (ENAMETOOLONG); 3714 case NFS3ERR_NOTEMPTY: 3715 return (ENOTEMPTY); 3716 case NFS3ERR_DQUOT: 3717 return (EDQUOT); 3718 case NFS3ERR_STALE: 3719 return (ESTALE); 3720 case NFS3ERR_REMOTE: 3721 return (EREMOTE); 3722 case NFS3ERR_BADHANDLE: 3723 return (ESTALE); 3724 case NFS3ERR_NOT_SYNC: 3725 return (EINVAL); 3726 case NFS3ERR_BAD_COOKIE: 3727 return (ENOENT); 3728 case NFS3ERR_NOTSUPP: 3729 return (EOPNOTSUPP); 3730 case NFS3ERR_TOOSMALL: 3731 return (EINVAL); 3732 case NFS3ERR_SERVERFAULT: 3733 return (EIO); 3734 case NFS3ERR_BADTYPE: 3735 return (EINVAL); 3736 case NFS3ERR_JUKEBOX: 3737 return (ENXIO); 3738 default: 3739 zcmn_err(getzoneid(), CE_WARN, 3740 "geterrno3: got status %d", status); 3741 return ((int)status); 3742 } 3743 #else 3744 switch (status) { 3745 case NFS3ERR_NAMETOOLONG: 3746 return (ENAMETOOLONG); 3747 case NFS3ERR_NOTEMPTY: 3748 return (ENOTEMPTY); 3749 case NFS3ERR_DQUOT: 3750 return (EDQUOT); 3751 case NFS3ERR_STALE: 3752 case NFS3ERR_BADHANDLE: 3753 return (ESTALE); 3754 case NFS3ERR_NOTSUPP: 3755 return (EOPNOTSUPP); 3756 case NFS3ERR_REMOTE: 3757 return (EREMOTE); 3758 case NFS3ERR_NOT_SYNC: 3759 case NFS3ERR_TOOSMALL: 3760 case NFS3ERR_BADTYPE: 3761 return (EINVAL); 3762 case NFS3ERR_BAD_COOKIE: 3763 return (ENOENT); 3764 case NFS3ERR_SERVERFAULT: 3765 return (EIO); 3766 case NFS3ERR_JUKEBOX: 3767 return (ENXIO); 3768 default: 3769 return ((int)status); 3770 } 3771 #endif 3772 } 3773 3774 rddir_cache * 3775 rddir_cache_alloc(int flags) 3776 { 3777 rddir_cache *rc; 3778 3779 rc = kmem_alloc(sizeof (*rc), flags); 3780 if (rc != NULL) { 3781 rc->entries = NULL; 3782 rc->flags = RDDIR; 3783 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3784 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3785 
rc->count = 1; 3786 #ifdef DEBUG 3787 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3788 #endif 3789 } 3790 return (rc); 3791 } 3792 3793 static void 3794 rddir_cache_free(rddir_cache *rc) 3795 { 3796 3797 #ifdef DEBUG 3798 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3799 #endif 3800 if (rc->entries != NULL) { 3801 #ifdef DEBUG 3802 rddir_cache_buf_free(rc->entries, rc->buflen); 3803 #else 3804 kmem_free(rc->entries, rc->buflen); 3805 #endif 3806 } 3807 cv_destroy(&rc->cv); 3808 mutex_destroy(&rc->lock); 3809 kmem_free(rc, sizeof (*rc)); 3810 } 3811 3812 void 3813 rddir_cache_hold(rddir_cache *rc) 3814 { 3815 3816 mutex_enter(&rc->lock); 3817 rc->count++; 3818 mutex_exit(&rc->lock); 3819 } 3820 3821 void 3822 rddir_cache_rele(rddir_cache *rc) 3823 { 3824 3825 mutex_enter(&rc->lock); 3826 ASSERT(rc->count > 0); 3827 if (--rc->count == 0) { 3828 mutex_exit(&rc->lock); 3829 rddir_cache_free(rc); 3830 } else 3831 mutex_exit(&rc->lock); 3832 } 3833 3834 #ifdef DEBUG 3835 char * 3836 rddir_cache_buf_alloc(size_t size, int flags) 3837 { 3838 char *rc; 3839 3840 rc = kmem_alloc(size, flags); 3841 if (rc != NULL) 3842 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3843 return (rc); 3844 } 3845 3846 void 3847 rddir_cache_buf_free(void *addr, size_t size) 3848 { 3849 3850 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3851 kmem_free(addr, size); 3852 } 3853 #endif 3854 3855 static int 3856 nfs_free_data_reclaim(rnode_t *rp) 3857 { 3858 char *contents; 3859 int size; 3860 vsecattr_t *vsp; 3861 nfs3_pathconf_info *info; 3862 int freed; 3863 cred_t *cred; 3864 3865 /* 3866 * Free any held credentials and caches which 3867 * may be associated with this rnode. 3868 */ 3869 mutex_enter(&rp->r_statelock); 3870 cred = rp->r_cred; 3871 rp->r_cred = NULL; 3872 contents = rp->r_symlink.contents; 3873 size = rp->r_symlink.size; 3874 rp->r_symlink.contents = NULL; 3875 vsp = rp->r_secattr; 3876 rp->r_secattr = NULL; 3877 info = rp->r_pathconf; 3878 rp->r_pathconf = NULL; 3879 mutex_exit(&rp->r_statelock); 3880 3881 if (cred != NULL) 3882 crfree(cred); 3883 3884 /* 3885 * Free the access cache entries. 3886 */ 3887 freed = nfs_access_purge_rp(rp); 3888 3889 if (!HAVE_RDDIR_CACHE(rp) && 3890 contents == NULL && 3891 vsp == NULL && 3892 info == NULL) 3893 return (freed); 3894 3895 /* 3896 * Free the readdir cache entries 3897 */ 3898 if (HAVE_RDDIR_CACHE(rp)) 3899 nfs_purge_rddir_cache(RTOV(rp)); 3900 3901 /* 3902 * Free the symbolic link cache. 3903 */ 3904 if (contents != NULL) { 3905 3906 kmem_free((void *)contents, size); 3907 } 3908 3909 /* 3910 * Free any cached ACL. 3911 */ 3912 if (vsp != NULL) 3913 nfs_acl_free(vsp); 3914 3915 /* 3916 * Free any cached pathconf information. 3917 */ 3918 if (info != NULL) 3919 kmem_free(info, sizeof (*info)); 3920 3921 return (1); 3922 } 3923 3924 static int 3925 nfs_active_data_reclaim(rnode_t *rp) 3926 { 3927 char *contents; 3928 int size; 3929 vsecattr_t *vsp; 3930 nfs3_pathconf_info *info; 3931 int freed; 3932 3933 /* 3934 * Free any held credentials and caches which 3935 * may be associated with this rnode. 3936 */ 3937 if (!mutex_tryenter(&rp->r_statelock)) 3938 return (0); 3939 contents = rp->r_symlink.contents; 3940 size = rp->r_symlink.size; 3941 rp->r_symlink.contents = NULL; 3942 vsp = rp->r_secattr; 3943 rp->r_secattr = NULL; 3944 info = rp->r_pathconf; 3945 rp->r_pathconf = NULL; 3946 mutex_exit(&rp->r_statelock); 3947 3948 /* 3949 * Free the access cache entries. 
3950 */ 3951 freed = nfs_access_purge_rp(rp); 3952 3953 if (!HAVE_RDDIR_CACHE(rp) && 3954 contents == NULL && 3955 vsp == NULL && 3956 info == NULL) 3957 return (freed); 3958 3959 /* 3960 * Free the readdir cache entries 3961 */ 3962 if (HAVE_RDDIR_CACHE(rp)) 3963 nfs_purge_rddir_cache(RTOV(rp)); 3964 3965 /* 3966 * Free the symbolic link cache. 3967 */ 3968 if (contents != NULL) { 3969 3970 kmem_free((void *)contents, size); 3971 } 3972 3973 /* 3974 * Free any cached ACL. 3975 */ 3976 if (vsp != NULL) 3977 nfs_acl_free(vsp); 3978 3979 /* 3980 * Free any cached pathconf information. 3981 */ 3982 if (info != NULL) 3983 kmem_free(info, sizeof (*info)); 3984 3985 return (1); 3986 } 3987 3988 static int 3989 nfs_free_reclaim(void) 3990 { 3991 int freed; 3992 rnode_t *rp; 3993 3994 #ifdef DEBUG 3995 clstat_debug.f_reclaim.value.ui64++; 3996 #endif 3997 freed = 0; 3998 mutex_enter(&rpfreelist_lock); 3999 rp = rpfreelist; 4000 if (rp != NULL) { 4001 do { 4002 if (nfs_free_data_reclaim(rp)) 4003 freed = 1; 4004 } while ((rp = rp->r_freef) != rpfreelist); 4005 } 4006 mutex_exit(&rpfreelist_lock); 4007 return (freed); 4008 } 4009 4010 static int 4011 nfs_active_reclaim(void) 4012 { 4013 int freed; 4014 int index; 4015 rnode_t *rp; 4016 4017 #ifdef DEBUG 4018 clstat_debug.a_reclaim.value.ui64++; 4019 #endif 4020 freed = 0; 4021 for (index = 0; index < rtablesize; index++) { 4022 rw_enter(&rtable[index].r_lock, RW_READER); 4023 for (rp = rtable[index].r_hashf; 4024 rp != (rnode_t *)(&rtable[index]); 4025 rp = rp->r_hashf) { 4026 if (nfs_active_data_reclaim(rp)) 4027 freed = 1; 4028 } 4029 rw_exit(&rtable[index].r_lock); 4030 } 4031 return (freed); 4032 } 4033 4034 static int 4035 nfs_rnode_reclaim(void) 4036 { 4037 int freed; 4038 rnode_t *rp; 4039 vnode_t *vp; 4040 4041 #ifdef DEBUG 4042 clstat_debug.r_reclaim.value.ui64++; 4043 #endif 4044 freed = 0; 4045 mutex_enter(&rpfreelist_lock); 4046 while ((rp = rpfreelist) != NULL) { 4047 rp_rmfree(rp); 4048 mutex_exit(&rpfreelist_lock); 4049 if (rp->r_flags & RHASHED) { 4050 vp = RTOV(rp); 4051 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4052 mutex_enter(&vp->v_lock); 4053 if (vp->v_count > 1) { 4054 vp->v_count--; 4055 mutex_exit(&vp->v_lock); 4056 rw_exit(&rp->r_hashq->r_lock); 4057 mutex_enter(&rpfreelist_lock); 4058 continue; 4059 } 4060 mutex_exit(&vp->v_lock); 4061 rp_rmhash_locked(rp); 4062 rw_exit(&rp->r_hashq->r_lock); 4063 } 4064 /* 4065 * This call to rp_addfree will end up destroying the 4066 * rnode, but in a safe way with the appropriate set 4067 * of checks done. 
4068 */ 4069 rp_addfree(rp, CRED()); 4070 mutex_enter(&rpfreelist_lock); 4071 } 4072 mutex_exit(&rpfreelist_lock); 4073 return (freed); 4074 } 4075 4076 /*ARGSUSED*/ 4077 static void 4078 nfs_reclaim(void *cdrarg) 4079 { 4080 4081 #ifdef DEBUG 4082 clstat_debug.reclaim.value.ui64++; 4083 #endif 4084 if (nfs_free_reclaim()) 4085 return; 4086 4087 if (nfs_active_reclaim()) 4088 return; 4089 4090 (void) nfs_rnode_reclaim(); 4091 } 4092 4093 /* 4094 * NFS client failover support 4095 * 4096 * Routines to copy filehandles 4097 */ 4098 void 4099 nfscopyfh(caddr_t fhp, vnode_t *vp) 4100 { 4101 fhandle_t *dest = (fhandle_t *)fhp; 4102 4103 if (dest != NULL) 4104 *dest = *VTOFH(vp); 4105 } 4106 4107 void 4108 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4109 { 4110 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4111 4112 if (dest != NULL) 4113 *dest = *VTOFH3(vp); 4114 } 4115 4116 /* 4117 * NFS client failover support 4118 * 4119 * failover_safe() will test various conditions to ensure that 4120 * failover is permitted for this vnode. It will be denied 4121 * if: 4122 * 1) the operation in progress does not support failover (NULL fi) 4123 * 2) there are no available replicas (NULL mi_servers->sv_next) 4124 * 3) any locks are outstanding on this file 4125 */ 4126 static int 4127 failover_safe(failinfo_t *fi) 4128 { 4129 4130 /* 4131 * Does this op permit failover? 4132 */ 4133 if (fi == NULL || fi->vp == NULL) 4134 return (0); 4135 4136 /* 4137 * Are there any alternates to failover to? 4138 */ 4139 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4140 return (0); 4141 4142 /* 4143 * Disable check; we've forced local locking 4144 * 4145 * if (flk_has_remote_locks(fi->vp)) 4146 * return (0); 4147 */ 4148 4149 /* 4150 * If we have no partial path, we can't do anything 4151 */ 4152 if (VTOR(fi->vp)->r_path == NULL) 4153 return (0); 4154 4155 return (1); 4156 } 4157 4158 #include <sys/thread.h> 4159 4160 /* 4161 * NFS client failover support 4162 * 4163 * failover_newserver() will start a search for a new server, 4164 * preferably by starting an async thread to do the work. If 4165 * someone is already doing this (recognizable by MI_BINDINPROG 4166 * being set), it will simply return and the calling thread 4167 * will queue on the mi_failover_cv condition variable. 4168 */ 4169 static void 4170 failover_newserver(mntinfo_t *mi) 4171 { 4172 /* 4173 * Check if someone else is doing this already 4174 */ 4175 mutex_enter(&mi->mi_lock); 4176 if (mi->mi_flags & MI_BINDINPROG) { 4177 mutex_exit(&mi->mi_lock); 4178 return; 4179 } 4180 mi->mi_flags |= MI_BINDINPROG; 4181 4182 /* 4183 * Need to hold the vfs struct so that it can't be released 4184 * while the failover thread is selecting a new server. 4185 */ 4186 VFS_HOLD(mi->mi_vfsp); 4187 4188 /* 4189 * Start a thread to do the real searching. 4190 */ 4191 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4192 4193 mutex_exit(&mi->mi_lock); 4194 } 4195 4196 /* 4197 * NFS client failover support 4198 * 4199 * failover_thread() will find a new server to replace the one 4200 * currently in use, wake up other threads waiting on this mount 4201 * point, and die. It will start at the head of the server list 4202 * and poll servers until it finds one with an NFS server which is 4203 * registered and responds to a NULL procedure ping. 4204 * 4205 * XXX failover_thread is unsafe within the scope of the 4206 * present model defined for cpr to suspend the system. 4207 * Specifically, over-the-wire calls made by the thread 4208 * are unsafe. 
The thread needs to be reevaluated in case of 4209 * future updates to the cpr suspend model. 4210 */ 4211 static void 4212 failover_thread(mntinfo_t *mi) 4213 { 4214 servinfo_t *svp = NULL; 4215 CLIENT *cl; 4216 enum clnt_stat status; 4217 struct timeval tv; 4218 int error; 4219 int oncethru = 0; 4220 callb_cpr_t cprinfo; 4221 rnode_t *rp; 4222 int index; 4223 char *srvnames; 4224 size_t srvnames_len; 4225 struct nfs_clnt *nfscl = NULL; 4226 zoneid_t zoneid = getzoneid(); 4227 4228 #ifdef DEBUG 4229 /* 4230 * This is currently only needed to access counters which exist on 4231 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4232 * on non-DEBUG kernels. 4233 */ 4234 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4235 ASSERT(nfscl != NULL); 4236 #endif 4237 4238 /* 4239 * Its safe to piggyback on the mi_lock since failover_newserver() 4240 * code guarantees that there will be only one failover thread 4241 * per mountinfo at any instance. 4242 */ 4243 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4244 "failover_thread"); 4245 4246 mutex_enter(&mi->mi_lock); 4247 while (mi->mi_readers) { 4248 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4249 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4250 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4251 } 4252 mutex_exit(&mi->mi_lock); 4253 4254 tv.tv_sec = 2; 4255 tv.tv_usec = 0; 4256 4257 /* 4258 * Ping the null NFS procedure of every server in 4259 * the list until one responds. We always start 4260 * at the head of the list and always skip the one 4261 * that is current, since it's caused us a problem. 4262 */ 4263 while (svp == NULL) { 4264 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4265 if (!oncethru && svp == mi->mi_curr_serv) 4266 continue; 4267 4268 /* 4269 * If the file system was forcibly umounted 4270 * while trying to do a failover, then just 4271 * give up on the failover. It won't matter 4272 * what the server is. 
4273 */ 4274 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4275 svp = NULL; 4276 goto done; 4277 } 4278 4279 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4280 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4281 if (error) 4282 continue; 4283 4284 if (!(mi->mi_flags & MI_INT)) 4285 cl->cl_nosignal = TRUE; 4286 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4287 xdr_void, NULL, tv); 4288 if (!(mi->mi_flags & MI_INT)) 4289 cl->cl_nosignal = FALSE; 4290 AUTH_DESTROY(cl->cl_auth); 4291 CLNT_DESTROY(cl); 4292 if (status == RPC_SUCCESS) { 4293 if (svp == mi->mi_curr_serv) { 4294 #ifdef DEBUG 4295 zcmn_err(zoneid, CE_NOTE, 4296 "NFS%d: failing over: selecting original server %s", 4297 mi->mi_vers, svp->sv_hostname); 4298 #else 4299 zcmn_err(zoneid, CE_NOTE, 4300 "NFS: failing over: selecting original server %s", 4301 svp->sv_hostname); 4302 #endif 4303 } else { 4304 #ifdef DEBUG 4305 zcmn_err(zoneid, CE_NOTE, 4306 "NFS%d: failing over from %s to %s", 4307 mi->mi_vers, 4308 mi->mi_curr_serv->sv_hostname, 4309 svp->sv_hostname); 4310 #else 4311 zcmn_err(zoneid, CE_NOTE, 4312 "NFS: failing over from %s to %s", 4313 mi->mi_curr_serv->sv_hostname, 4314 svp->sv_hostname); 4315 #endif 4316 } 4317 break; 4318 } 4319 } 4320 4321 if (svp == NULL) { 4322 if (!oncethru) { 4323 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4324 #ifdef DEBUG 4325 zprintf(zoneid, 4326 "NFS%d servers %s not responding " 4327 "still trying\n", mi->mi_vers, srvnames); 4328 #else 4329 zprintf(zoneid, "NFS servers %s not responding " 4330 "still trying\n", srvnames); 4331 #endif 4332 oncethru = 1; 4333 } 4334 mutex_enter(&mi->mi_lock); 4335 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4336 mutex_exit(&mi->mi_lock); 4337 delay(hz); 4338 mutex_enter(&mi->mi_lock); 4339 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4340 mutex_exit(&mi->mi_lock); 4341 } 4342 } 4343 4344 if (oncethru) { 4345 #ifdef DEBUG 4346 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4347 #else 4348 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4349 #endif 4350 } 4351 4352 if (svp != mi->mi_curr_serv) { 4353 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4354 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4355 rw_enter(&rtable[index].r_lock, RW_WRITER); 4356 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4357 mi->mi_vfsp); 4358 if (rp != NULL) { 4359 if (rp->r_flags & RHASHED) 4360 rp_rmhash_locked(rp); 4361 rw_exit(&rtable[index].r_lock); 4362 rp->r_server = svp; 4363 rp->r_fh = svp->sv_fhandle; 4364 (void) nfs_free_data_reclaim(rp); 4365 index = rtablehash(&rp->r_fh); 4366 rp->r_hashq = &rtable[index]; 4367 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4368 vn_exists(RTOV(rp)); 4369 rp_addhash(rp); 4370 rw_exit(&rp->r_hashq->r_lock); 4371 VN_RELE(RTOV(rp)); 4372 } else 4373 rw_exit(&rtable[index].r_lock); 4374 } 4375 4376 done: 4377 if (oncethru) 4378 kmem_free(srvnames, srvnames_len); 4379 mutex_enter(&mi->mi_lock); 4380 mi->mi_flags &= ~MI_BINDINPROG; 4381 if (svp != NULL) { 4382 mi->mi_curr_serv = svp; 4383 mi->mi_failover++; 4384 #ifdef DEBUG 4385 nfscl->nfscl_stat.failover.value.ui64++; 4386 #endif 4387 } 4388 cv_broadcast(&mi->mi_failover_cv); 4389 CALLB_CPR_EXIT(&cprinfo); 4390 VFS_RELE(mi->mi_vfsp); 4391 zthread_exit(); 4392 /* NOTREACHED */ 4393 } 4394 4395 /* 4396 * NFS client failover support 4397 * 4398 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4399 * is cleared, meaning that failover is complete. Called with 4400 * mi_lock mutex held. 
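 *
 * If the sleep is interrupted by a signal, cv_wait_sig() returns
 * zero and failover_wait() returns EINTR; SIGINT is only allowed to
 * interrupt the wait when the mount permits it (MI_INT), which is
 * what the sigintr()/sigunintr() pair below arranges.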
/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t smask;

	/*
	 * If someone else is hunting for a living server,
	 * sleep until it's done.  After our sleep, we may
	 * be bound to the right server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM (preserving the existing masks).
		 * Mask out SIGINT if the mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
			/*
			 * restore original signal mask
			 */
			sigunintr(&smask);
			return (EINTR);
		}
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);
	}
	return (0);
}

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *   pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * The failed server and the new server may be the
			 * same machine physically, or may share the same
			 * disk subsystem.  In that case the file handle for
			 * a given file path does not change, so the same
			 * filehandle lookup will always locate the same
			 * rnode as the existing one.  All we may need to do
			 * is update r_server with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary vnode used to
		 * hold the new file handle until that file handle can
		 * be copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and already
		 * done the remap for this particular rnode.  Check
		 * rp->r_server against mi->mi_curr_serv and return if
		 * they are the same.
		 */
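		/*
		 * Editorial note (not in the original source): VALID_FH()
		 * is assumed to encode the check described above, roughly
		 * equivalent to
		 *
		 *	VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv
		 *
		 * so the losing thread simply drops the temporary vnode.
		 */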
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match what we
		 * remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * Snarf the filehandle from the new rnode, then release
		 * it, again while updating the hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}

/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created as
 * rnodes were made, so we know we have only to deal with paths
 * that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, or ENOTDIR is a
 * hard error, because it means something in this filesystem is
 * different from the one we came from, or has changed under us
 * in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
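/*
 * Editorial example (not in the original source): for an r_path of
 * "dir1/dir2/file", the loop below looks up "dir1" relative to the root
 * vnode, then "dir2" relative to that result, then "file", holding only
 * the current directory vnode at each step.  A component equal to
 * XATTR_RPATH is looked up via xattrdirproc instead of lookupproc.
 */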
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
	vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
	vnode_t *dvp, *nvp;
	int error = EINVAL;
	char *s, *p, *tmppath;
	size_t len;
	mntinfo_t *mi;
	bool_t xattr;

	/* Make local copy of path */
	len = strlen(path) + 1;
	tmppath = kmem_alloc(len, KM_SLEEP);
	(void) strcpy(tmppath, path);
	s = tmppath;

	dvp = root;
	VN_HOLD(dvp);
	mi = VTOMI(root);
	xattr = mi->mi_flags & MI_EXTATTR;

	do {
		p = strchr(s, '/');
		if (p != NULL)
			*p = '\0';
		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
			    RFSCALL_SOFT);
		} else {
			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
			    CRED(), RFSCALL_SOFT);
		}
		if (p != NULL)
			*p++ = '/';
		if (error) {
			VN_RELE(dvp);
			kmem_free(tmppath, len);
			return (error);
		}
		s = p;
		VN_RELE(dvp);
		dvp = nvp;
	} while (p != NULL);

	if (nvp != NULL && new != NULL)
		*new = nvp;
	kmem_free(tmppath, len);
	return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
	servinfo_t *next;
	struct knetconfig *knconf;

	while (svp != NULL) {
		next = svp->sv_next;
		if (svp->sv_secdata)
			sec_clnt_freeinfo(svp->sv_secdata);
		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
		knconf = svp->sv_knconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		knconf = svp->sv_origknconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
		mutex_destroy(&svp->sv_lock);
		kmem_free(svp, sizeof (*svp));
		svp = next;
	}
}
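/*
 * Illustrative sketch (not in the original source): sv_free() frees an
 * entire server list in one call, so a typical teardown path only needs
 * something along the lines of
 *
 *	sv_free(mi->mi_servers);
 *	mi->mi_servers = mi->mi_curr_serv = NULL;
 *
 * rather than walking the sv_next chain itself.
 */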
/*
 * Can only return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as there were enters.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * While there is an active writer or there are waiting
		 * writers, wait for them to finish and move on.  Then
		 * increment the count to indicate that a reader is
		 * active.
		 */
		while (l->count < 0 || l->waiters > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&l->cv, &l->lock)) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
		}
		ASSERT(l->count < INT_MAX);
#ifdef DEBUG
		if ((l->count % 10000) == 9999)
			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
			    "rwlock @ %p\n", l->count, (void *)&l);
#endif
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * While there are active readers or an active writer,
		 * wait for all of the readers to finish or for the
		 * writer to finish.  Then set the owner field to
		 * curthread and decrement count to indicate that a
		 * writer is active.
		 */
		while (l->count > 0 || l->owner != NULL) {
			l->waiters++;
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&l->cv, &l->lock)) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					l->waiters--;
					cv_broadcast(&l->cv);
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
			l->waiters--;
		}
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as there were enters.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * If there is a writer active or writers waiting, deny the
		 * lock.  Otherwise, bump the count of readers.
		 */
		if (l->count < 0 || l->waiters > 0) {
			mutex_exit(&l->lock);
			return (0);
		}
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * If there are readers active or a writer active, deny the
		 * lock.  Otherwise, set the owner field to curthread and
		 * decrement count to indicate that a writer is active.
		 */
		if (l->count > 0 || l->owner != NULL) {
			mutex_exit(&l->lock);
			return (0);
		}
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (1);
}
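/*
 * Illustrative sketch (not in the original source): typical use of this
 * rwlock follows the usual enter/exit pairing, with the interruptible
 * variant returning EINTR when a signal is taken.  The field and macro
 * names below (r_rwlock, INTR()) are assumptions used only to show the
 * pattern:
 *
 *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp)))
 *		return (EINTR);
 *	... read-side work ...
 *	nfs_rw_exit(&rp->r_rwlock);
 *
 * nfs_rw_tryenter() is the non-blocking form and simply reports whether
 * the lock was obtained.
 */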
void
nfs_rw_exit(nfs_rwlock_t *l)
{

	mutex_enter(&l->lock);
	/*
	 * If this is releasing a writer lock, then increment count to
	 * indicate that there is one less writer active.  If this was
	 * the last of possibly nested writer locks, then clear the owner
	 * field as well to indicate that there is no writer active
	 * and wake up any possible waiting writers or readers.
	 *
	 * If releasing a reader lock, then just decrement count to
	 * indicate that there is one less reader active.  If this was
	 * the last active reader and there are writer(s) waiting,
	 * then wake up the first.
	 */
	if (l->owner != NULL) {
		ASSERT(l->owner == curthread);
		l->count++;
		if (l->count == 0) {
			l->owner = NULL;
			cv_broadcast(&l->cv);
		}
	} else {
		ASSERT(l->count > 0);
		l->count--;
		if (l->count == 0 && l->waiters > 0)
			cv_broadcast(&l->cv);
	}
	mutex_exit(&l->lock);
}

int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{

	if (rw == RW_READER)
		return (l->count > 0);
	ASSERT(rw == RW_WRITER);
	return (l->count < 0);
}

/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{

	l->count = 0;
	l->waiters = 0;
	l->owner = NULL;
	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
}

void
nfs_rw_destroy(nfs_rwlock_t *l)
{

	mutex_destroy(&l->lock);
	cv_destroy(&l->cv);
}

int
nfs3_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs3_cookie == b->nfs3_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs3_cookie < b->nfs3_cookie)
		return (-1);

	return (1);
}

int
nfs_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs_cookie == b->nfs_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs_cookie < b->nfs_cookie)
		return (-1);

	return (1);
}

static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
	servinfo_t *s;
	char *srvnames;
	char *namep;
	size_t length;

	/*
	 * Calculate the length of the string required to hold all
	 * of the server names plus either a comma or a null
	 * character following each individual one.
	 */
	length = 0;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
		length += s->sv_hostnamelen;

	srvnames = kmem_alloc(length, KM_SLEEP);

	namep = srvnames;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
		(void) strcpy(namep, s->sv_hostname);
		namep += s->sv_hostnamelen - 1;
		*namep++ = ',';
	}
	*--namep = '\0';

	*len = length;

	return (srvnames);
}
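/*
 * Editorial example (not in the original source): with two servers whose
 * sv_hostname values are "alpha" and "beta" (sv_hostnamelen counting the
 * terminating NUL, i.e. 6 and 5), the loop above produces the string
 * "alpha,beta" in an 11-byte buffer, with the final comma overwritten by
 * the NUL terminator.
 */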
/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}

zoneid_t
nfs_zoneid(void)
{
	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}

/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) the label of the remote server
 *	against the label of the zone being mounted into.
 *
 *	Returns:
 *		 0 :	access allowed
 *		-1 :	read-only access allowed (i.e., read-down)
 *		>0 :	error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int addr_type;
	void *ipaddr;
	bslabel_t *server_sl, *mntlabel;
	zone_t *mntzone = NULL;
	ts_label_t *zlabel;
	tsol_tpc_t *tp;
	ts_label_t *tsl = NULL;
	int retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	label_hold(zlabel);

	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		retv = 0;
		goto out;
	}

	retv = EACCES;				/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.  If the labels
	 * are equal or if the requestor is in the global zone and has
	 * NET_MAC_AWARE, then allow read-write access.  (Except for
	 * mounts into the global zone itself; restrict these to
	 * read-only.)
	 *
	 * If the requestor is in some other zone, but its label
	 * dominates the server's label, then allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;		/* read-only */
		else
			retv = 0;		/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;			/* read-only */
	} else {
		retv = EACCES;
	}

	if (tsl != NULL)
		label_rele(tsl);

rel_tpc:
	TPC_RELE(tp);
out:
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}

boolean_t
nfs_has_ctty(void)
{
	boolean_t rv;
	mutex_enter(&curproc->p_splock);
	rv = (curproc->p_sessp->s_vp != NULL);
	mutex_exit(&curproc->p_splock);
	return (rv);
}
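/*
 * Illustrative sketch (not in the original source): a labeled-mount path
 * would typically act on nfs_mount_label_policy() along these lines,
 * forcing the mount read-only on a read-down result:
 *
 *	error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *	    svp->sv_knconf, cr);
 *	if (error > 0)
 *		return (error);
 *	if (error == -1)
 *		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 *
 * The vfs_setmntopt() usage here is schematic; the real caller may record
 * the read-only restriction differently.
 */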
/*
 * TX NFS routine used by NFSv3 and NFSv4 to do a label check on the
 * client label and the server's file object label.
 */
boolean_t
do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
{
	bslabel_t *slabel;
	ts_label_t *tslabel;
	boolean_t result;

	if ((tslabel = nfs_getflabel(vp)) == NULL) {
		return (B_FALSE);
	}
	slabel = label2bslabel(tslabel);
	DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
	    "comparing server's file label(1) with client label(2) (vp(3))",
	    bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);

	if (flag == EQUALITY_CHECK)
		result = blequal(clabel, slabel);
	else
		result = bldominates(clabel, slabel);
	label_rele(tslabel);
	return (result);
}

/*
 * Look in the xattr directory to see if it has any generic user attributes.
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	struct uio uio;
	struct iovec iov;
	char *dbuf;
	struct dirent64 *dp;
	size_t dlen = 8 * 1024;
	size_t dbuflen;
	int eof = 0;
	int error;

	*valp = 0;
	dbuf = kmem_alloc(dlen, KM_SLEEP);
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_resid = dlen;
	iov.iov_base = dbuf;
	iov.iov_len = dlen;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

	dbuflen = dlen - uio.uio_resid;

	if (error || dbuflen == 0) {
		kmem_free(dbuf, dlen);
		return (error);
	}

	dp = (dirent64_t *)dbuf;

	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
		if (strcmp(dp->d_name, ".") == 0 ||
		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
		    VIEW_READONLY) == 0) {
			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
			continue;
		}

		*valp = 1;
		break;
	}
	kmem_free(dbuf, dlen);
	return (0);
}
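/*
 * Illustrative sketch (not in the original source): this helper is suited
 * to answering a pathconf-style "do any real extended attributes exist?"
 * query once the caller has the xattr directory vnode in hand, e.g.
 *
 *	ulong_t exists;
 *
 *	error = do_xattr_exists_check(xattr_dvp, &exists, cr);
 *	if (!error)
 *		*valp = exists;
 *
 * where xattr_dvp is assumed to have been obtained via the fs-specific
 * xattr directory lookup.
 */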