1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 27 * All rights reserved. 
28 */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/cred.h> 36 #include <sys/proc.h> 37 #include <sys/user.h> 38 #include <sys/time.h> 39 #include <sys/buf.h> 40 #include <sys/vfs.h> 41 #include <sys/vnode.h> 42 #include <sys/socket.h> 43 #include <sys/uio.h> 44 #include <sys/tiuser.h> 45 #include <sys/swap.h> 46 #include <sys/errno.h> 47 #include <sys/debug.h> 48 #include <sys/kmem.h> 49 #include <sys/kstat.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vtrace.h> 52 #include <sys/session.h> 53 #include <sys/dnlc.h> 54 #include <sys/bitmap.h> 55 #include <sys/acl.h> 56 #include <sys/ddi.h> 57 #include <sys/pathname.h> 58 #include <sys/flock.h> 59 #include <sys/dirent.h> 60 #include <sys/flock.h> 61 #include <sys/callb.h> 62 #include <sys/atomic.h> 63 #include <sys/list.h> 64 65 #include <rpc/types.h> 66 #include <rpc/xdr.h> 67 #include <rpc/auth.h> 68 #include <rpc/clnt.h> 69 70 #include <nfs/nfs.h> 71 #include <nfs/nfs4.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/rnode.h> 74 #include <nfs/nfs_acl.h> 75 76 /* 77 * The hash queues for the access to active and cached rnodes 78 * are organized as doubly linked lists. A reader/writer lock 79 * for each hash bucket is used to control access and to synchronize 80 * lookups, additions, and deletions from the hash queue. 81 * 82 * The rnode freelist is organized as a doubly linked list with 83 * a head pointer. Additions and deletions are synchronized via 84 * a single mutex. 85 * 86 * In order to add an rnode to the free list, it must be hashed into 87 * a hash queue and the exclusive lock to the hash queue be held. 88 * If an rnode is not hashed into a hash queue, then it is destroyed 89 * because it represents no valuable information that can be reused 90 * about the file. 
The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

/* NOTE(review): how hashlen is consumed is not visible in this chunk. */
static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow pre-epoch (negative) time values over the wire (otw)? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * Client side statistics.  This is a const template of named kstat
 * entries; the last four entries exist only in DEBUG builds.
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and do not correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;			/* number of allocated rnodes */
	kstat_named_t	access;			/* size of access cache */
	kstat_named_t	dirent;			/* size of readdir cache */
	kstat_named_t	dirents;		/* size of readdir buf cache */
	kstat_named_t	reclaim;		/* number of reclaims */
	kstat_named_t	clreclaim;		/* number of cl reclaims */
	kstat_named_t	f_reclaim;		/* number of free reclaims */
	kstat_named_t	a_reclaim;		/* number of active reclaims */
	kstat_named_t	r_reclaim;		/* number of rnode reclaims */
	kstat_named_t	rpath;			/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	/* Note: the exported name is "r_path" while the field is rpath. */
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;

/*
 * Forward declarations.  clget() and clfree() are the only non-static
 * functions declared here.
 */
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int
failover_remap(failinfo_t *); 265 static int failover_lookup(char *, vnode_t *, 266 int (*)(vnode_t *, char *, vnode_t **, 267 struct pathname *, int, vnode_t *, cred_t *, int), 268 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 269 vnode_t **); 270 static void nfs_free_r_path(rnode_t *); 271 static void nfs_set_vroot(vnode_t *); 272 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 273 274 /* 275 * from rpcsec module (common/rpcsec) 276 */ 277 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 278 extern void sec_clnt_freeh(AUTH *); 279 extern void sec_clnt_freeinfo(struct sec_data *); 280 281 /* 282 * EIO or EINTR are not recoverable errors. 283 */ 284 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 285 286 /* 287 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 288 */ 289 static int 290 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 291 struct chtab **chp, struct nfs_clnt *nfscl) 292 { 293 struct chhead *ch, *newch; 294 struct chhead **plistp; 295 struct chtab *cp; 296 int error; 297 k_sigset_t smask; 298 299 if (newcl == NULL || chp == NULL || ci == NULL) 300 return (EINVAL); 301 302 *newcl = NULL; 303 *chp = NULL; 304 305 /* 306 * Find an unused handle or create one 307 */ 308 newch = NULL; 309 nfscl->nfscl_stat.clgets.value.ui64++; 310 top: 311 /* 312 * Find the correct entry in the cache to check for free 313 * client handles. The search is based on the RPC program 314 * number, program version number, dev_t for the transport 315 * device, and the protocol family. 
316 */ 317 mutex_enter(&nfscl->nfscl_chtable_lock); 318 plistp = &nfscl->nfscl_chtable; 319 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 320 if (ch->ch_prog == ci->cl_prog && 321 ch->ch_vers == ci->cl_vers && 322 ch->ch_dev == svp->sv_knconf->knc_rdev && 323 (strcmp(ch->ch_protofmly, 324 svp->sv_knconf->knc_protofmly) == 0)) 325 break; 326 plistp = &ch->ch_next; 327 } 328 329 /* 330 * If we didn't find a cache entry for this quadruple, then 331 * create one. If we don't have one already preallocated, 332 * then drop the cache lock, create one, and then start over. 333 * If we did have a preallocated entry, then just add it to 334 * the front of the list. 335 */ 336 if (ch == NULL) { 337 if (newch == NULL) { 338 mutex_exit(&nfscl->nfscl_chtable_lock); 339 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 340 newch->ch_timesused = 0; 341 newch->ch_prog = ci->cl_prog; 342 newch->ch_vers = ci->cl_vers; 343 newch->ch_dev = svp->sv_knconf->knc_rdev; 344 newch->ch_protofmly = kmem_alloc( 345 strlen(svp->sv_knconf->knc_protofmly) + 1, 346 KM_SLEEP); 347 (void) strcpy(newch->ch_protofmly, 348 svp->sv_knconf->knc_protofmly); 349 newch->ch_list = NULL; 350 goto top; 351 } 352 ch = newch; 353 newch = NULL; 354 ch->ch_next = nfscl->nfscl_chtable; 355 nfscl->nfscl_chtable = ch; 356 /* 357 * We found a cache entry, but if it isn't on the front of the 358 * list, then move it to the front of the list to try to take 359 * advantage of locality of operations. 360 */ 361 } else if (ch != nfscl->nfscl_chtable) { 362 *plistp = ch->ch_next; 363 ch->ch_next = nfscl->nfscl_chtable; 364 nfscl->nfscl_chtable = ch; 365 } 366 367 /* 368 * If there was a free client handle cached, then remove it 369 * from the list, init it, and use it. 
370 */ 371 if (ch->ch_list != NULL) { 372 cp = ch->ch_list; 373 ch->ch_list = cp->ch_list; 374 mutex_exit(&nfscl->nfscl_chtable_lock); 375 if (newch != NULL) { 376 kmem_free(newch->ch_protofmly, 377 strlen(newch->ch_protofmly) + 1); 378 kmem_free(newch, sizeof (*newch)); 379 } 380 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 381 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 382 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 383 &cp->ch_client->cl_auth); 384 if (error || cp->ch_client->cl_auth == NULL) { 385 CLNT_DESTROY(cp->ch_client); 386 kmem_cache_free(chtab_cache, cp); 387 return ((error != 0) ? error : EINTR); 388 } 389 ch->ch_timesused++; 390 *newcl = cp->ch_client; 391 *chp = cp; 392 return (0); 393 } 394 395 /* 396 * There weren't any free client handles which fit, so allocate 397 * a new one and use that. 398 */ 399 #ifdef DEBUG 400 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 401 #endif 402 mutex_exit(&nfscl->nfscl_chtable_lock); 403 404 nfscl->nfscl_stat.cltoomany.value.ui64++; 405 if (newch != NULL) { 406 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 407 kmem_free(newch, sizeof (*newch)); 408 } 409 410 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 411 cp->ch_head = ch; 412 413 sigintr(&smask, (int)ci->cl_flags & MI_INT); 414 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 415 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 416 sigunintr(&smask); 417 418 if (error != 0) { 419 kmem_cache_free(chtab_cache, cp); 420 #ifdef DEBUG 421 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 422 #endif 423 /* 424 * Warning is unnecessary if error is EINTR. 
425 */ 426 if (error != EINTR) { 427 nfs_cmn_err(error, CE_WARN, 428 "clget: couldn't create handle: %m\n"); 429 } 430 return (error); 431 } 432 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 433 auth_destroy(cp->ch_client->cl_auth); 434 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 435 &cp->ch_client->cl_auth); 436 if (error || cp->ch_client->cl_auth == NULL) { 437 CLNT_DESTROY(cp->ch_client); 438 kmem_cache_free(chtab_cache, cp); 439 #ifdef DEBUG 440 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 441 #endif 442 return ((error != 0) ? error : EINTR); 443 } 444 ch->ch_timesused++; 445 *newcl = cp->ch_client; 446 ASSERT(cp->ch_client->cl_nosignal == FALSE); 447 *chp = cp; 448 return (0); 449 } 450 451 int 452 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 453 struct chtab **chp) 454 { 455 struct nfs_clnt *nfscl; 456 457 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 458 ASSERT(nfscl != NULL); 459 460 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 461 } 462 463 static int 464 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 465 struct chtab **chp, struct nfs_clnt *nfscl) 466 { 467 clinfo_t ci; 468 int error; 469 470 /* 471 * Set read buffer size to rsize 472 * and add room for RPC headers. 473 */ 474 ci.cl_readsize = mi->mi_tsize; 475 if (ci.cl_readsize != 0) 476 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 477 478 /* 479 * If soft mount and server is down just try once. 480 * meaning: do not retransmit. 481 */ 482 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 483 ci.cl_retrans = 0; 484 else 485 ci.cl_retrans = mi->mi_retrans; 486 487 ci.cl_prog = NFS_ACL_PROGRAM; 488 ci.cl_vers = mi->mi_vers; 489 ci.cl_flags = mi->mi_flags; 490 491 /* 492 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 493 * security flavor, the client tries to establish a security context 494 * by contacting the server. 
If the connection is timed out or reset, 495 * e.g. server reboot, we will try again. 496 */ 497 do { 498 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 499 500 if (error == 0) 501 break; 502 503 /* 504 * For forced unmount or zone shutdown, bail out, no retry. 505 */ 506 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 507 error = EIO; 508 break; 509 } 510 511 /* do not retry for softmount */ 512 if (!(mi->mi_flags & MI_HARD)) 513 break; 514 515 /* let the caller deal with the failover case */ 516 if (FAILOVER_MOUNT(mi)) 517 break; 518 519 } while (error == ETIMEDOUT || error == ECONNRESET); 520 521 return (error); 522 } 523 524 static int 525 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 526 struct chtab **chp, struct nfs_clnt *nfscl) 527 { 528 clinfo_t ci; 529 int error; 530 531 /* 532 * Set read buffer size to rsize 533 * and add room for RPC headers. 534 */ 535 ci.cl_readsize = mi->mi_tsize; 536 if (ci.cl_readsize != 0) 537 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 538 539 /* 540 * If soft mount and server is down just try once. 541 * meaning: do not retransmit. 542 */ 543 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 544 ci.cl_retrans = 0; 545 else 546 ci.cl_retrans = mi->mi_retrans; 547 548 ci.cl_prog = mi->mi_prog; 549 ci.cl_vers = mi->mi_vers; 550 ci.cl_flags = mi->mi_flags; 551 552 /* 553 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 554 * security flavor, the client tries to establish a security context 555 * by contacting the server. If the connection is timed out or reset, 556 * e.g. server reboot, we will try again. 557 */ 558 do { 559 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 560 561 if (error == 0) 562 break; 563 564 /* 565 * For forced unmount or zone shutdown, bail out, no retry. 
566 */ 567 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 568 error = EIO; 569 break; 570 } 571 572 /* do not retry for softmount */ 573 if (!(mi->mi_flags & MI_HARD)) 574 break; 575 576 /* let the caller deal with the failover case */ 577 if (FAILOVER_MOUNT(mi)) 578 break; 579 580 } while (error == ETIMEDOUT || error == ECONNRESET); 581 582 return (error); 583 } 584 585 static void 586 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 587 { 588 if (cl->cl_auth != NULL) { 589 sec_clnt_freeh(cl->cl_auth); 590 cl->cl_auth = NULL; 591 } 592 593 /* 594 * Timestamp this cache entry so that we know when it was last 595 * used. 596 */ 597 cp->ch_freed = gethrestime_sec(); 598 599 /* 600 * Add the free client handle to the front of the list. 601 * This way, the list will be sorted in youngest to oldest 602 * order. 603 */ 604 mutex_enter(&nfscl->nfscl_chtable_lock); 605 cp->ch_list = cp->ch_head->ch_list; 606 cp->ch_head->ch_list = cp; 607 mutex_exit(&nfscl->nfscl_chtable_lock); 608 } 609 610 void 611 clfree(CLIENT *cl, struct chtab *cp) 612 { 613 struct nfs_clnt *nfscl; 614 615 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 616 ASSERT(nfscl != NULL); 617 618 clfree_impl(cl, cp, nfscl); 619 } 620 621 #define CL_HOLDTIME 60 /* time to hold client handles */ 622 623 static void 624 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 625 { 626 struct chhead *ch; 627 struct chtab *cp; /* list of objects that can be reclaimed */ 628 struct chtab *cpe; 629 struct chtab *cpl; 630 struct chtab **cpp; 631 #ifdef DEBUG 632 int n = 0; 633 #endif 634 635 /* 636 * Need to reclaim some memory, so step through the cache 637 * looking through the lists for entries which can be freed. 638 */ 639 cp = NULL; 640 641 mutex_enter(&nfscl->nfscl_chtable_lock); 642 643 /* 644 * Here we step through each non-NULL quadruple and start to 645 * construct the reclaim list pointed to by cp. Note that 646 * cp will contain all eligible chtab entries. 
When this traversal 647 * completes, chtab entries from the last quadruple will be at the 648 * front of cp and entries from previously inspected quadruples have 649 * been appended to the rear of cp. 650 */ 651 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 652 if (ch->ch_list == NULL) 653 continue; 654 /* 655 * Search each list for entries older then 656 * cl_holdtime seconds. The lists are maintained 657 * in youngest to oldest order so that when the 658 * first entry is found which is old enough, then 659 * all of the rest of the entries on the list will 660 * be old enough as well. 661 */ 662 cpl = ch->ch_list; 663 cpp = &ch->ch_list; 664 while (cpl != NULL && 665 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 666 cpp = &cpl->ch_list; 667 cpl = cpl->ch_list; 668 } 669 if (cpl != NULL) { 670 *cpp = NULL; 671 if (cp != NULL) { 672 cpe = cpl; 673 while (cpe->ch_list != NULL) 674 cpe = cpe->ch_list; 675 cpe->ch_list = cp; 676 } 677 cp = cpl; 678 } 679 } 680 681 mutex_exit(&nfscl->nfscl_chtable_lock); 682 683 /* 684 * If cp is empty, then there is nothing to reclaim here. 685 */ 686 if (cp == NULL) 687 return; 688 689 /* 690 * Step through the list of entries to free, destroying each client 691 * handle and kmem_free'ing the memory for each entry. 692 */ 693 while (cp != NULL) { 694 #ifdef DEBUG 695 n++; 696 #endif 697 CLNT_DESTROY(cp->ch_client); 698 cpl = cp->ch_list; 699 kmem_cache_free(chtab_cache, cp); 700 cp = cpl; 701 } 702 703 #ifdef DEBUG 704 /* 705 * Update clalloc so that nfsstat shows the current number 706 * of allocated client handles. 707 */ 708 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 709 #endif 710 } 711 712 /* ARGSUSED */ 713 static void 714 clreclaim(void *all) 715 { 716 struct nfs_clnt *nfscl; 717 718 #ifdef DEBUG 719 clstat_debug.clreclaim.value.ui64++; 720 #endif 721 /* 722 * The system is low on memory; go through and try to reclaim some from 723 * every zone on the system. 
724 */ 725 mutex_enter(&nfs_clnt_list_lock); 726 nfscl = list_head(&nfs_clnt_list); 727 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 728 clreclaim_zone(nfscl, CL_HOLDTIME); 729 mutex_exit(&nfs_clnt_list_lock); 730 } 731 732 /* 733 * Minimum time-out values indexed by call type 734 * These units are in "eights" of a second to avoid multiplies 735 */ 736 static unsigned int minimum_timeo[] = { 737 6, 7, 10 738 }; 739 740 /* 741 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 742 */ 743 #define MAXTIMO (20*hz) 744 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 745 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 746 747 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 748 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 749 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 750 751 /* 752 * Function called when rfscall notices that we have been 753 * re-transmitting, or when we get a response without retransmissions. 754 * Return 1 if the transfer size was adjusted down - 0 if no change. 
755 */ 756 static int 757 nfs_feedback(int flag, int which, mntinfo_t *mi) 758 { 759 int kind; 760 int r = 0; 761 762 mutex_enter(&mi->mi_lock); 763 if (flag == FEEDBACK_REXMIT1) { 764 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 765 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 766 goto done; 767 if (mi->mi_curread > MIN_NFS_TSIZE) { 768 mi->mi_curread /= 2; 769 if (mi->mi_curread < MIN_NFS_TSIZE) 770 mi->mi_curread = MIN_NFS_TSIZE; 771 r = 1; 772 } 773 774 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 775 mi->mi_curwrite /= 2; 776 if (mi->mi_curwrite < MIN_NFS_TSIZE) 777 mi->mi_curwrite = MIN_NFS_TSIZE; 778 r = 1; 779 } 780 } else if (flag == FEEDBACK_OK) { 781 kind = mi->mi_timer_type[which]; 782 if (kind == 0 || 783 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 784 goto done; 785 if (kind == 1) { 786 if (mi->mi_curread >= mi->mi_tsize) 787 goto done; 788 mi->mi_curread += MIN_NFS_TSIZE; 789 if (mi->mi_curread > mi->mi_tsize/2) 790 mi->mi_curread = mi->mi_tsize; 791 } else if (kind == 2) { 792 if (mi->mi_curwrite >= mi->mi_stsize) 793 goto done; 794 mi->mi_curwrite += MIN_NFS_TSIZE; 795 if (mi->mi_curwrite > mi->mi_stsize/2) 796 mi->mi_curwrite = mi->mi_stsize; 797 } 798 } 799 done: 800 mutex_exit(&mi->mi_lock); 801 return (r); 802 } 803 804 #ifdef DEBUG 805 static int rfs2call_hits = 0; 806 static int rfs2call_misses = 0; 807 #endif 808 809 int 810 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 811 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 812 enum nfsstat *statusp, int flags, failinfo_t *fi) 813 { 814 int rpcerror; 815 enum clnt_stat rpc_status; 816 817 ASSERT(statusp != NULL); 818 819 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 820 cr, douprintf, &rpc_status, flags, fi); 821 if (!rpcerror) { 822 /* 823 * See crnetadjust() for comments. 
824 */ 825 if (*statusp == NFSERR_ACCES && 826 (cr = crnetadjust(cr)) != NULL) { 827 #ifdef DEBUG 828 rfs2call_hits++; 829 #endif 830 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 831 resp, cr, douprintf, NULL, flags, fi); 832 crfree(cr); 833 #ifdef DEBUG 834 if (*statusp == NFSERR_ACCES) 835 rfs2call_misses++; 836 #endif 837 } 838 } else if (rpc_status == RPC_PROCUNAVAIL) { 839 *statusp = NFSERR_OPNOTSUPP; 840 rpcerror = 0; 841 } 842 843 return (rpcerror); 844 } 845 846 #define NFS3_JUKEBOX_DELAY 10 * hz 847 848 static clock_t nfs3_jukebox_delay = 0; 849 850 #ifdef DEBUG 851 static int rfs3call_hits = 0; 852 static int rfs3call_misses = 0; 853 #endif 854 855 int 856 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 857 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 858 nfsstat3 *statusp, int flags, failinfo_t *fi) 859 { 860 int rpcerror; 861 int user_informed; 862 863 user_informed = 0; 864 do { 865 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 866 cr, douprintf, NULL, flags, fi); 867 if (!rpcerror) { 868 cred_t *crr; 869 if (*statusp == NFS3ERR_JUKEBOX) { 870 if (ttoproc(curthread) == &p0) { 871 rpcerror = EAGAIN; 872 break; 873 } 874 if (!user_informed) { 875 user_informed = 1; 876 uprintf( 877 "file temporarily unavailable on the server, retrying...\n"); 878 } 879 delay(nfs3_jukebox_delay); 880 } 881 /* 882 * See crnetadjust() for comments. 
883 */ 884 else if (*statusp == NFS3ERR_ACCES && 885 (crr = crnetadjust(cr)) != NULL) { 886 #ifdef DEBUG 887 rfs3call_hits++; 888 #endif 889 rpcerror = rfscall(mi, which, xdrargs, argsp, 890 xdrres, resp, crr, douprintf, 891 NULL, flags, fi); 892 893 crfree(crr); 894 #ifdef DEBUG 895 if (*statusp == NFS3ERR_ACCES) 896 rfs3call_misses++; 897 #endif 898 } 899 } 900 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 901 902 return (rpcerror); 903 } 904 905 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 906 #define INC_READERS(mi) { \ 907 mi->mi_readers++; \ 908 } 909 #define DEC_READERS(mi) { \ 910 mi->mi_readers--; \ 911 if (mi->mi_readers == 0) \ 912 cv_broadcast(&mi->mi_failover_cv); \ 913 } 914 915 static int 916 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 917 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 918 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 919 { 920 CLIENT *client; 921 struct chtab *ch; 922 enum clnt_stat status; 923 struct rpc_err rpcerr; 924 struct timeval wait; 925 int timeo; /* in units of hz */ 926 int my_rsize, my_wsize; 927 bool_t tryagain; 928 k_sigset_t smask; 929 servinfo_t *svp; 930 struct nfs_clnt *nfscl; 931 zoneid_t zoneid = getzoneid(); 932 #ifdef DEBUG 933 char *bufp; 934 #endif 935 936 937 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 938 "rfscall_start:which %d mi %p", which, mi); 939 940 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 941 ASSERT(nfscl != NULL); 942 943 nfscl->nfscl_stat.calls.value.ui64++; 944 mi->mi_reqs[which].value.ui64++; 945 946 rpcerr.re_status = RPC_SUCCESS; 947 948 /* 949 * In case of forced unmount or zone shutdown, return EIO. 950 */ 951 952 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 953 rpcerr.re_status = RPC_FAILED; 954 rpcerr.re_errno = EIO; 955 return (rpcerr.re_errno); 956 } 957 958 /* 959 * Remember the transfer sizes in case 960 * nfs_feedback changes them underneath us. 
961 */ 962 my_rsize = mi->mi_curread; 963 my_wsize = mi->mi_curwrite; 964 965 /* 966 * NFS client failover support 967 * 968 * If this rnode is not in sync with the current server (VALID_FH), 969 * we'd like to do a remap to get in sync. We can be interrupted 970 * in failover_remap(), and if so we'll bail. Otherwise, we'll 971 * use the best info we have to try the RPC. Part of that is 972 * unconditionally updating the filehandle copy kept for V3. 973 * 974 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 975 * rw_enter(); we're trying to keep the current server from being 976 * changed on us until we're done with the remapping and have a 977 * matching client handle. We don't want to sending a filehandle 978 * to the wrong host. 979 */ 980 failoverretry: 981 if (FAILOVER_MOUNT(mi)) { 982 mutex_enter(&mi->mi_lock); 983 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 984 if (failover_wait(mi)) { 985 mutex_exit(&mi->mi_lock); 986 return (EINTR); 987 } 988 } 989 INC_READERS(mi); 990 mutex_exit(&mi->mi_lock); 991 if (fi) { 992 if (!VALID_FH(fi) && 993 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 994 int remaperr; 995 996 svp = mi->mi_curr_serv; 997 remaperr = failover_remap(fi); 998 if (remaperr != 0) { 999 #ifdef DEBUG 1000 if (remaperr != EINTR) 1001 nfs_cmn_err(remaperr, CE_WARN, 1002 "rfscall couldn't failover: %m"); 1003 #endif 1004 mutex_enter(&mi->mi_lock); 1005 DEC_READERS(mi); 1006 mutex_exit(&mi->mi_lock); 1007 /* 1008 * If failover_remap returns ETIMEDOUT 1009 * and the filesystem is hard mounted 1010 * we have to retry the call with a new 1011 * server. 
1012 */ 1013 if ((mi->mi_flags & MI_HARD) && 1014 IS_RECOVERABLE_ERROR(remaperr)) { 1015 if (svp == mi->mi_curr_serv) 1016 failover_newserver(mi); 1017 rpcerr.re_status = RPC_SUCCESS; 1018 goto failoverretry; 1019 } 1020 rpcerr.re_errno = remaperr; 1021 return (remaperr); 1022 } 1023 } 1024 if (fi->fhp && fi->copyproc) 1025 (*fi->copyproc)(fi->fhp, fi->vp); 1026 } 1027 } 1028 1029 /* 1030 * clget() calls clnt_tli_kinit() which clears the xid, so we 1031 * are guaranteed to reprocess the retry as a new request. 1032 */ 1033 svp = mi->mi_curr_serv; 1034 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1035 1036 if (FAILOVER_MOUNT(mi)) { 1037 mutex_enter(&mi->mi_lock); 1038 DEC_READERS(mi); 1039 mutex_exit(&mi->mi_lock); 1040 1041 if ((rpcerr.re_errno == ETIMEDOUT || 1042 rpcerr.re_errno == ECONNRESET) && 1043 failover_safe(fi)) { 1044 if (svp == mi->mi_curr_serv) 1045 failover_newserver(mi); 1046 goto failoverretry; 1047 } 1048 } 1049 if (rpcerr.re_errno != 0) 1050 return (rpcerr.re_errno); 1051 1052 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1053 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1054 timeo = (mi->mi_timeo * hz) / 10; 1055 } else { 1056 mutex_enter(&mi->mi_lock); 1057 timeo = CLNT_SETTIMERS(client, 1058 &(mi->mi_timers[mi->mi_timer_type[which]]), 1059 &(mi->mi_timers[NFS_CALLTYPES]), 1060 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1061 (void (*)())NULL, (caddr_t)mi, 0); 1062 mutex_exit(&mi->mi_lock); 1063 } 1064 1065 /* 1066 * If hard mounted fs, retry call forever unless hard error occurs. 1067 */ 1068 do { 1069 tryagain = FALSE; 1070 1071 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1072 status = RPC_FAILED; 1073 rpcerr.re_status = RPC_FAILED; 1074 rpcerr.re_errno = EIO; 1075 break; 1076 } 1077 1078 TICK_TO_TIMEVAL(timeo, &wait); 1079 1080 /* 1081 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1082 * and SIGTERM. (Preserving the existing masks). 1083 * Mask out SIGINT if mount option nointr is specified. 
1084 */ 1085 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1086 if (!(mi->mi_flags & MI_INT)) 1087 client->cl_nosignal = TRUE; 1088 1089 /* 1090 * If there is a current signal, then don't bother 1091 * even trying to send out the request because we 1092 * won't be able to block waiting for the response. 1093 * Simply assume RPC_INTR and get on with it. 1094 */ 1095 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1096 status = RPC_INTR; 1097 else { 1098 status = CLNT_CALL(client, which, xdrargs, argsp, 1099 xdrres, resp, wait); 1100 } 1101 1102 if (!(mi->mi_flags & MI_INT)) 1103 client->cl_nosignal = FALSE; 1104 /* 1105 * restore original signal mask 1106 */ 1107 sigunintr(&smask); 1108 1109 switch (status) { 1110 case RPC_SUCCESS: 1111 if ((mi->mi_flags & MI_DYNAMIC) && 1112 mi->mi_timer_type[which] != 0 && 1113 (mi->mi_curread != my_rsize || 1114 mi->mi_curwrite != my_wsize)) 1115 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1116 break; 1117 1118 case RPC_INTR: 1119 /* 1120 * There is no way to recover from this error, 1121 * even if mount option nointr is specified. 1122 * SIGKILL, for example, cannot be blocked. 1123 */ 1124 rpcerr.re_status = RPC_INTR; 1125 rpcerr.re_errno = EINTR; 1126 break; 1127 1128 case RPC_UDERROR: 1129 /* 1130 * If the NFS server is local (vold) and 1131 * it goes away then we get RPC_UDERROR. 1132 * This is a retryable error, so we would 1133 * loop, so check to see if the specific 1134 * error was ECONNRESET, indicating that 1135 * target did not exist at all. If so, 1136 * return with RPC_PROGUNAVAIL and 1137 * ECONNRESET to indicate why. 
1138 */ 1139 CLNT_GETERR(client, &rpcerr); 1140 if (rpcerr.re_errno == ECONNRESET) { 1141 rpcerr.re_status = RPC_PROGUNAVAIL; 1142 rpcerr.re_errno = ECONNRESET; 1143 break; 1144 } 1145 /*FALLTHROUGH*/ 1146 1147 default: /* probably RPC_TIMEDOUT */ 1148 if (IS_UNRECOVERABLE_RPC(status)) 1149 break; 1150 1151 /* 1152 * increment server not responding count 1153 */ 1154 mutex_enter(&mi->mi_lock); 1155 mi->mi_noresponse++; 1156 mutex_exit(&mi->mi_lock); 1157 #ifdef DEBUG 1158 nfscl->nfscl_stat.noresponse.value.ui64++; 1159 #endif 1160 1161 if (!(mi->mi_flags & MI_HARD)) { 1162 if (!(mi->mi_flags & MI_SEMISOFT) || 1163 (mi->mi_ss_call_type[which] == 0)) 1164 break; 1165 } 1166 1167 /* 1168 * The call is in progress (over COTS). 1169 * Try the CLNT_CALL again, but don't 1170 * print a noisy error message. 1171 */ 1172 if (status == RPC_INPROGRESS) { 1173 tryagain = TRUE; 1174 break; 1175 } 1176 1177 if (flags & RFSCALL_SOFT) 1178 break; 1179 1180 /* 1181 * On zone shutdown, just move on. 1182 */ 1183 if (zone_status_get(curproc->p_zone) >= 1184 ZONE_IS_SHUTTING_DOWN) { 1185 rpcerr.re_status = RPC_FAILED; 1186 rpcerr.re_errno = EIO; 1187 break; 1188 } 1189 1190 /* 1191 * NFS client failover support 1192 * 1193 * If the current server just failed us, we'll 1194 * start the process of finding a new server. 1195 * After that, we can just retry. 
1196 */ 1197 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1198 if (svp == mi->mi_curr_serv) 1199 failover_newserver(mi); 1200 clfree_impl(client, ch, nfscl); 1201 goto failoverretry; 1202 } 1203 1204 tryagain = TRUE; 1205 timeo = backoff(timeo); 1206 mutex_enter(&mi->mi_lock); 1207 if (!(mi->mi_flags & MI_PRINTED)) { 1208 mi->mi_flags |= MI_PRINTED; 1209 mutex_exit(&mi->mi_lock); 1210 #ifdef DEBUG 1211 zprintf(zoneid, 1212 "NFS%d server %s not responding still trying\n", 1213 mi->mi_vers, svp->sv_hostname); 1214 #else 1215 zprintf(zoneid, 1216 "NFS server %s not responding still trying\n", 1217 svp->sv_hostname); 1218 #endif 1219 } else 1220 mutex_exit(&mi->mi_lock); 1221 if (*douprintf && curproc->p_sessp->s_vp != NULL) { 1222 *douprintf = 0; 1223 if (!(mi->mi_flags & MI_NOPRINT)) 1224 #ifdef DEBUG 1225 uprintf( 1226 "NFS%d server %s not responding still trying\n", 1227 mi->mi_vers, svp->sv_hostname); 1228 #else 1229 uprintf( 1230 "NFS server %s not responding still trying\n", 1231 svp->sv_hostname); 1232 #endif 1233 } 1234 1235 /* 1236 * If doing dynamic adjustment of transfer 1237 * size and if it's a read or write call 1238 * and if the transfer size changed while 1239 * retransmitting or if the feedback routine 1240 * changed the transfer size, 1241 * then exit rfscall so that the transfer 1242 * size can be adjusted at the vnops level. 1243 */ 1244 if ((mi->mi_flags & MI_DYNAMIC) && 1245 mi->mi_timer_type[which] != 0 && 1246 (mi->mi_curread != my_rsize || 1247 mi->mi_curwrite != my_wsize || 1248 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1249 /* 1250 * On read or write calls, return 1251 * back to the vnode ops level if 1252 * the transfer size changed. 1253 */ 1254 clfree_impl(client, ch, nfscl); 1255 return (ENFS_TRYAGAIN); 1256 } 1257 } 1258 } while (tryagain); 1259 1260 if (status != RPC_SUCCESS) { 1261 /* 1262 * Let soft mounts use the timed out message. 
1263 */ 1264 if (status == RPC_INPROGRESS) 1265 status = RPC_TIMEDOUT; 1266 nfscl->nfscl_stat.badcalls.value.ui64++; 1267 if (status != RPC_INTR) { 1268 mutex_enter(&mi->mi_lock); 1269 mi->mi_flags |= MI_DOWN; 1270 mutex_exit(&mi->mi_lock); 1271 CLNT_GETERR(client, &rpcerr); 1272 #ifdef DEBUG 1273 bufp = clnt_sperror(client, svp->sv_hostname); 1274 zprintf(zoneid, "NFS%d %s failed for %s\n", 1275 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1276 if (curproc->p_sessp->s_vp != NULL) { 1277 if (!(mi->mi_flags & MI_NOPRINT)) { 1278 uprintf("NFS%d %s failed for %s\n", 1279 mi->mi_vers, mi->mi_rfsnames[which], 1280 bufp); 1281 } 1282 } 1283 kmem_free(bufp, MAXPATHLEN); 1284 #else 1285 zprintf(zoneid, 1286 "NFS %s failed for server %s: error %d (%s)\n", 1287 mi->mi_rfsnames[which], svp->sv_hostname, 1288 status, clnt_sperrno(status)); 1289 if (curproc->p_sessp->s_vp != NULL) { 1290 if (!(mi->mi_flags & MI_NOPRINT)) { 1291 uprintf( 1292 "NFS %s failed for server %s: error %d (%s)\n", 1293 mi->mi_rfsnames[which], 1294 svp->sv_hostname, status, 1295 clnt_sperrno(status)); 1296 } 1297 } 1298 #endif 1299 /* 1300 * when CLNT_CALL() fails with RPC_AUTHERROR, 1301 * re_errno is set appropriately depending on 1302 * the authentication error 1303 */ 1304 if (status == RPC_VERSMISMATCH || 1305 status == RPC_PROGVERSMISMATCH) 1306 rpcerr.re_errno = EIO; 1307 } 1308 } else { 1309 /* 1310 * Test the value of mi_down and mi_printed without 1311 * holding the mi_lock mutex. If they are both zero, 1312 * then it is okay to skip the down and printed 1313 * processing. This saves on a mutex_enter and 1314 * mutex_exit pair for a normal, successful RPC. 1315 * This was just complete overhead. 
1316 */ 1317 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1318 mutex_enter(&mi->mi_lock); 1319 mi->mi_flags &= ~MI_DOWN; 1320 if (mi->mi_flags & MI_PRINTED) { 1321 mi->mi_flags &= ~MI_PRINTED; 1322 mutex_exit(&mi->mi_lock); 1323 #ifdef DEBUG 1324 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1325 zprintf(zoneid, "NFS%d server %s ok\n", 1326 mi->mi_vers, svp->sv_hostname); 1327 #else 1328 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1329 zprintf(zoneid, "NFS server %s ok\n", 1330 svp->sv_hostname); 1331 #endif 1332 } else 1333 mutex_exit(&mi->mi_lock); 1334 } 1335 1336 if (*douprintf == 0) { 1337 if (!(mi->mi_flags & MI_NOPRINT)) 1338 #ifdef DEBUG 1339 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1340 uprintf("NFS%d server %s ok\n", 1341 mi->mi_vers, svp->sv_hostname); 1342 #else 1343 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1344 uprintf("NFS server %s ok\n", svp->sv_hostname); 1345 #endif 1346 *douprintf = 1; 1347 } 1348 } 1349 1350 clfree_impl(client, ch, nfscl); 1351 1352 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1353 1354 if (rpc_status != NULL) 1355 *rpc_status = rpcerr.re_status; 1356 1357 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1358 rpcerr.re_errno); 1359 1360 return (rpcerr.re_errno); 1361 } 1362 1363 #ifdef DEBUG 1364 static int acl2call_hits = 0; 1365 static int acl2call_misses = 0; 1366 #endif 1367 1368 int 1369 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1370 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1371 enum nfsstat *statusp, int flags, failinfo_t *fi) 1372 { 1373 int rpcerror; 1374 1375 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1376 cr, douprintf, flags, fi); 1377 if (!rpcerror) { 1378 /* 1379 * See comments with crnetadjust(). 
1380 */ 1381 if (*statusp == NFSERR_ACCES && 1382 (cr = crnetadjust(cr)) != NULL) { 1383 #ifdef DEBUG 1384 acl2call_hits++; 1385 #endif 1386 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1387 resp, cr, douprintf, flags, fi); 1388 crfree(cr); 1389 #ifdef DEBUG 1390 if (*statusp == NFSERR_ACCES) 1391 acl2call_misses++; 1392 #endif 1393 } 1394 } 1395 1396 return (rpcerror); 1397 } 1398 1399 #ifdef DEBUG 1400 static int acl3call_hits = 0; 1401 static int acl3call_misses = 0; 1402 #endif 1403 1404 int 1405 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1406 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1407 nfsstat3 *statusp, int flags, failinfo_t *fi) 1408 { 1409 int rpcerror; 1410 int user_informed; 1411 1412 user_informed = 0; 1413 1414 do { 1415 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1416 cr, douprintf, flags, fi); 1417 if (!rpcerror) { 1418 cred_t *crr; 1419 if (*statusp == NFS3ERR_JUKEBOX) { 1420 if (!user_informed) { 1421 user_informed = 1; 1422 uprintf( 1423 "file temporarily unavailable on the server, retrying...\n"); 1424 } 1425 delay(nfs3_jukebox_delay); 1426 } 1427 /* 1428 * See crnetadjust() for comments. 
1429 */ 1430 else if (*statusp == NFS3ERR_ACCES && 1431 (crr = crnetadjust(cr)) != NULL) { 1432 #ifdef DEBUG 1433 acl3call_hits++; 1434 #endif 1435 rpcerror = aclcall(mi, which, xdrargs, argsp, 1436 xdrres, resp, crr, douprintf, flags, fi); 1437 1438 crfree(crr); 1439 #ifdef DEBUG 1440 if (*statusp == NFS3ERR_ACCES) 1441 acl3call_misses++; 1442 #endif 1443 } 1444 } 1445 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1446 1447 return (rpcerror); 1448 } 1449 1450 static int 1451 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1452 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1453 int flags, failinfo_t *fi) 1454 { 1455 CLIENT *client; 1456 struct chtab *ch; 1457 enum clnt_stat status; 1458 struct rpc_err rpcerr; 1459 struct timeval wait; 1460 int timeo; /* in units of hz */ 1461 #if 0 /* notyet */ 1462 int my_rsize, my_wsize; 1463 #endif 1464 bool_t tryagain; 1465 k_sigset_t smask; 1466 servinfo_t *svp; 1467 struct nfs_clnt *nfscl; 1468 zoneid_t zoneid = getzoneid(); 1469 #ifdef DEBUG 1470 char *bufp; 1471 #endif 1472 1473 #if 0 /* notyet */ 1474 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1475 "rfscall_start:which %d mi %p", which, mi); 1476 #endif 1477 1478 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 1479 ASSERT(nfscl != NULL); 1480 1481 nfscl->nfscl_stat.calls.value.ui64++; 1482 mi->mi_aclreqs[which].value.ui64++; 1483 1484 rpcerr.re_status = RPC_SUCCESS; 1485 1486 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1487 rpcerr.re_status = RPC_FAILED; 1488 rpcerr.re_errno = EIO; 1489 return (rpcerr.re_errno); 1490 } 1491 1492 #if 0 /* notyet */ 1493 /* 1494 * Remember the transfer sizes in case 1495 * nfs_feedback changes them underneath us. 1496 */ 1497 my_rsize = mi->mi_curread; 1498 my_wsize = mi->mi_curwrite; 1499 #endif 1500 1501 /* 1502 * NFS client failover support 1503 * 1504 * If this rnode is not in sync with the current server (VALID_FH), 1505 * we'd like to do a remap to get in sync. 
We can be interrupted 1506 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1507 * use the best info we have to try the RPC. Part of that is 1508 * unconditionally updating the filehandle copy kept for V3. 1509 * 1510 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1511 * rw_enter(); we're trying to keep the current server from being 1512 * changed on us until we're done with the remapping and have a 1513 * matching client handle. We don't want to sending a filehandle 1514 * to the wrong host. 1515 */ 1516 failoverretry: 1517 if (FAILOVER_MOUNT(mi)) { 1518 mutex_enter(&mi->mi_lock); 1519 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1520 if (failover_wait(mi)) { 1521 mutex_exit(&mi->mi_lock); 1522 return (EINTR); 1523 } 1524 } 1525 INC_READERS(mi); 1526 mutex_exit(&mi->mi_lock); 1527 if (fi) { 1528 if (!VALID_FH(fi) && 1529 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1530 int remaperr; 1531 1532 svp = mi->mi_curr_serv; 1533 remaperr = failover_remap(fi); 1534 if (remaperr != 0) { 1535 #ifdef DEBUG 1536 if (remaperr != EINTR) 1537 nfs_cmn_err(remaperr, CE_WARN, 1538 "aclcall couldn't failover: %m"); 1539 #endif 1540 mutex_enter(&mi->mi_lock); 1541 DEC_READERS(mi); 1542 mutex_exit(&mi->mi_lock); 1543 1544 /* 1545 * If failover_remap returns ETIMEDOUT 1546 * and the filesystem is hard mounted 1547 * we have to retry the call with a new 1548 * server. 1549 */ 1550 if ((mi->mi_flags & MI_HARD) && 1551 IS_RECOVERABLE_ERROR(remaperr)) { 1552 if (svp == mi->mi_curr_serv) 1553 failover_newserver(mi); 1554 rpcerr.re_status = RPC_SUCCESS; 1555 goto failoverretry; 1556 } 1557 return (remaperr); 1558 } 1559 } 1560 if (fi->fhp && fi->copyproc) 1561 (*fi->copyproc)(fi->fhp, fi->vp); 1562 } 1563 } 1564 1565 /* 1566 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1567 * are guaranteed to reprocess the retry as a new request. 
1568 */ 1569 svp = mi->mi_curr_serv; 1570 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1571 if (FAILOVER_MOUNT(mi)) { 1572 mutex_enter(&mi->mi_lock); 1573 DEC_READERS(mi); 1574 mutex_exit(&mi->mi_lock); 1575 1576 if ((rpcerr.re_errno == ETIMEDOUT || 1577 rpcerr.re_errno == ECONNRESET) && 1578 failover_safe(fi)) { 1579 if (svp == mi->mi_curr_serv) 1580 failover_newserver(mi); 1581 goto failoverretry; 1582 } 1583 } 1584 if (rpcerr.re_errno != 0) 1585 return (rpcerr.re_errno); 1586 1587 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1588 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1589 timeo = (mi->mi_timeo * hz) / 10; 1590 } else { 1591 mutex_enter(&mi->mi_lock); 1592 timeo = CLNT_SETTIMERS(client, 1593 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1594 &(mi->mi_timers[NFS_CALLTYPES]), 1595 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1596 (void (*)()) 0, (caddr_t)mi, 0); 1597 mutex_exit(&mi->mi_lock); 1598 } 1599 1600 /* 1601 * If hard mounted fs, retry call forever unless hard error occurs. 1602 */ 1603 do { 1604 tryagain = FALSE; 1605 1606 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1607 status = RPC_FAILED; 1608 rpcerr.re_status = RPC_FAILED; 1609 rpcerr.re_errno = EIO; 1610 break; 1611 } 1612 1613 TICK_TO_TIMEVAL(timeo, &wait); 1614 1615 /* 1616 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1617 * and SIGTERM. (Preserving the existing masks). 1618 * Mask out SIGINT if mount option nointr is specified. 1619 */ 1620 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1621 if (!(mi->mi_flags & MI_INT)) 1622 client->cl_nosignal = TRUE; 1623 1624 /* 1625 * If there is a current signal, then don't bother 1626 * even trying to send out the request because we 1627 * won't be able to block waiting for the response. 1628 * Simply assume RPC_INTR and get on with it. 
1629 */ 1630 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1631 status = RPC_INTR; 1632 else { 1633 status = CLNT_CALL(client, which, xdrargs, argsp, 1634 xdrres, resp, wait); 1635 } 1636 1637 if (!(mi->mi_flags & MI_INT)) 1638 client->cl_nosignal = FALSE; 1639 /* 1640 * restore original signal mask 1641 */ 1642 sigunintr(&smask); 1643 1644 switch (status) { 1645 case RPC_SUCCESS: 1646 #if 0 /* notyet */ 1647 if ((mi->mi_flags & MI_DYNAMIC) && 1648 mi->mi_timer_type[which] != 0 && 1649 (mi->mi_curread != my_rsize || 1650 mi->mi_curwrite != my_wsize)) 1651 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1652 #endif 1653 break; 1654 1655 /* 1656 * Unfortunately, there are servers in the world which 1657 * are not coded correctly. They are not prepared to 1658 * handle RPC requests to the NFS port which are not 1659 * NFS requests. Thus, they may try to process the 1660 * NFS_ACL request as if it were an NFS request. This 1661 * does not work. Generally, an error will be generated 1662 * on the client because it will not be able to decode 1663 * the response from the server. However, it seems 1664 * possible that the server may not be able to decode 1665 * the arguments. Thus, the criteria for deciding 1666 * whether the server supports NFS_ACL or not is whether 1667 * the following RPC errors are returned from CLNT_CALL. 1668 */ 1669 case RPC_CANTDECODERES: 1670 case RPC_PROGUNAVAIL: 1671 case RPC_CANTDECODEARGS: 1672 case RPC_PROGVERSMISMATCH: 1673 mutex_enter(&mi->mi_lock); 1674 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1675 mutex_exit(&mi->mi_lock); 1676 break; 1677 1678 /* 1679 * If the server supports NFS_ACL but not the new ops 1680 * for extended attributes, make sure we don't retry. 
1681 */ 1682 case RPC_PROCUNAVAIL: 1683 mutex_enter(&mi->mi_lock); 1684 mi->mi_flags &= ~MI_EXTATTR; 1685 mutex_exit(&mi->mi_lock); 1686 break; 1687 1688 case RPC_INTR: 1689 /* 1690 * There is no way to recover from this error, 1691 * even if mount option nointr is specified. 1692 * SIGKILL, for example, cannot be blocked. 1693 */ 1694 rpcerr.re_status = RPC_INTR; 1695 rpcerr.re_errno = EINTR; 1696 break; 1697 1698 case RPC_UDERROR: 1699 /* 1700 * If the NFS server is local (vold) and 1701 * it goes away then we get RPC_UDERROR. 1702 * This is a retryable error, so we would 1703 * loop, so check to see if the specific 1704 * error was ECONNRESET, indicating that 1705 * target did not exist at all. If so, 1706 * return with RPC_PROGUNAVAIL and 1707 * ECONNRESET to indicate why. 1708 */ 1709 CLNT_GETERR(client, &rpcerr); 1710 if (rpcerr.re_errno == ECONNRESET) { 1711 rpcerr.re_status = RPC_PROGUNAVAIL; 1712 rpcerr.re_errno = ECONNRESET; 1713 break; 1714 } 1715 /*FALLTHROUGH*/ 1716 1717 default: /* probably RPC_TIMEDOUT */ 1718 if (IS_UNRECOVERABLE_RPC(status)) 1719 break; 1720 1721 /* 1722 * increment server not responding count 1723 */ 1724 mutex_enter(&mi->mi_lock); 1725 mi->mi_noresponse++; 1726 mutex_exit(&mi->mi_lock); 1727 #ifdef DEBUG 1728 nfscl->nfscl_stat.noresponse.value.ui64++; 1729 #endif 1730 1731 if (!(mi->mi_flags & MI_HARD)) { 1732 if (!(mi->mi_flags & MI_SEMISOFT) || 1733 (mi->mi_acl_ss_call_type[which] == 0)) 1734 break; 1735 } 1736 1737 /* 1738 * The call is in progress (over COTS). 1739 * Try the CLNT_CALL again, but don't 1740 * print a noisy error message. 1741 */ 1742 if (status == RPC_INPROGRESS) { 1743 tryagain = TRUE; 1744 break; 1745 } 1746 1747 if (flags & RFSCALL_SOFT) 1748 break; 1749 1750 /* 1751 * On zone shutdown, just move on. 
1752 */ 1753 if (zone_status_get(curproc->p_zone) >= 1754 ZONE_IS_SHUTTING_DOWN) { 1755 rpcerr.re_status = RPC_FAILED; 1756 rpcerr.re_errno = EIO; 1757 break; 1758 } 1759 1760 /* 1761 * NFS client failover support 1762 * 1763 * If the current server just failed us, we'll 1764 * start the process of finding a new server. 1765 * After that, we can just retry. 1766 */ 1767 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1768 if (svp == mi->mi_curr_serv) 1769 failover_newserver(mi); 1770 clfree_impl(client, ch, nfscl); 1771 goto failoverretry; 1772 } 1773 1774 tryagain = TRUE; 1775 timeo = backoff(timeo); 1776 mutex_enter(&mi->mi_lock); 1777 if (!(mi->mi_flags & MI_PRINTED)) { 1778 mi->mi_flags |= MI_PRINTED; 1779 mutex_exit(&mi->mi_lock); 1780 #ifdef DEBUG 1781 zprintf(zoneid, 1782 "NFS_ACL%d server %s not responding still trying\n", 1783 mi->mi_vers, svp->sv_hostname); 1784 #else 1785 zprintf(zoneid, 1786 "NFS server %s not responding still trying\n", 1787 svp->sv_hostname); 1788 #endif 1789 } else 1790 mutex_exit(&mi->mi_lock); 1791 if (*douprintf && curproc->p_sessp->s_vp != NULL) { 1792 *douprintf = 0; 1793 if (!(mi->mi_flags & MI_NOPRINT)) 1794 #ifdef DEBUG 1795 uprintf( 1796 "NFS_ACL%d server %s not responding still trying\n", 1797 mi->mi_vers, svp->sv_hostname); 1798 #else 1799 uprintf( 1800 "NFS server %s not responding still trying\n", 1801 svp->sv_hostname); 1802 #endif 1803 } 1804 1805 #if 0 /* notyet */ 1806 /* 1807 * If doing dynamic adjustment of transfer 1808 * size and if it's a read or write call 1809 * and if the transfer size changed while 1810 * retransmitting or if the feedback routine 1811 * changed the transfer size, 1812 * then exit rfscall so that the transfer 1813 * size can be adjusted at the vnops level. 
1814 */ 1815 if ((mi->mi_flags & MI_DYNAMIC) && 1816 mi->mi_acl_timer_type[which] != 0 && 1817 (mi->mi_curread != my_rsize || 1818 mi->mi_curwrite != my_wsize || 1819 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1820 /* 1821 * On read or write calls, return 1822 * back to the vnode ops level if 1823 * the transfer size changed. 1824 */ 1825 clfree_impl(client, ch, nfscl); 1826 return (ENFS_TRYAGAIN); 1827 } 1828 #endif 1829 } 1830 } while (tryagain); 1831 1832 if (status != RPC_SUCCESS) { 1833 /* 1834 * Let soft mounts use the timed out message. 1835 */ 1836 if (status == RPC_INPROGRESS) 1837 status = RPC_TIMEDOUT; 1838 nfscl->nfscl_stat.badcalls.value.ui64++; 1839 if (status == RPC_CANTDECODERES || 1840 status == RPC_PROGUNAVAIL || 1841 status == RPC_PROCUNAVAIL || 1842 status == RPC_CANTDECODEARGS || 1843 status == RPC_PROGVERSMISMATCH) 1844 CLNT_GETERR(client, &rpcerr); 1845 else if (status != RPC_INTR) { 1846 mutex_enter(&mi->mi_lock); 1847 mi->mi_flags |= MI_DOWN; 1848 mutex_exit(&mi->mi_lock); 1849 CLNT_GETERR(client, &rpcerr); 1850 #ifdef DEBUG 1851 bufp = clnt_sperror(client, svp->sv_hostname); 1852 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1853 mi->mi_vers, mi->mi_aclnames[which], bufp); 1854 if (curproc->p_sessp->s_vp != NULL) { 1855 if (!(mi->mi_flags & MI_NOPRINT)) { 1856 uprintf("NFS_ACL%d %s failed for %s\n", 1857 mi->mi_vers, mi->mi_aclnames[which], 1858 bufp); 1859 } 1860 } 1861 kmem_free(bufp, MAXPATHLEN); 1862 #else 1863 zprintf(zoneid, 1864 "NFS %s failed for server %s: error %d (%s)\n", 1865 mi->mi_aclnames[which], svp->sv_hostname, 1866 status, clnt_sperrno(status)); 1867 if (curproc->p_sessp->s_vp != NULL) { 1868 if (!(mi->mi_flags & MI_NOPRINT)) 1869 uprintf( 1870 "NFS %s failed for server %s: error %d (%s)\n", 1871 mi->mi_aclnames[which], 1872 svp->sv_hostname, status, 1873 clnt_sperrno(status)); 1874 } 1875 #endif 1876 /* 1877 * when CLNT_CALL() fails with RPC_AUTHERROR, 1878 * re_errno is set appropriately depending on 1879 * the 
authentication error 1880 */ 1881 if (status == RPC_VERSMISMATCH || 1882 status == RPC_PROGVERSMISMATCH) 1883 rpcerr.re_errno = EIO; 1884 } 1885 } else { 1886 /* 1887 * Test the value of mi_down and mi_printed without 1888 * holding the mi_lock mutex. If they are both zero, 1889 * then it is okay to skip the down and printed 1890 * processing. This saves on a mutex_enter and 1891 * mutex_exit pair for a normal, successful RPC. 1892 * This was just complete overhead. 1893 */ 1894 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1895 mutex_enter(&mi->mi_lock); 1896 mi->mi_flags &= ~MI_DOWN; 1897 if (mi->mi_flags & MI_PRINTED) { 1898 mi->mi_flags &= ~MI_PRINTED; 1899 mutex_exit(&mi->mi_lock); 1900 #ifdef DEBUG 1901 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1902 mi->mi_vers, svp->sv_hostname); 1903 #else 1904 zprintf(zoneid, "NFS server %s ok\n", 1905 svp->sv_hostname); 1906 #endif 1907 } else 1908 mutex_exit(&mi->mi_lock); 1909 } 1910 1911 if (*douprintf == 0) { 1912 if (!(mi->mi_flags & MI_NOPRINT)) 1913 #ifdef DEBUG 1914 uprintf("NFS_ACL%d server %s ok\n", 1915 mi->mi_vers, svp->sv_hostname); 1916 #else 1917 uprintf("NFS server %s ok\n", svp->sv_hostname); 1918 #endif 1919 *douprintf = 1; 1920 } 1921 } 1922 1923 clfree_impl(client, ch, nfscl); 1924 1925 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1926 1927 #if 0 /* notyet */ 1928 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1929 rpcerr.re_errno); 1930 #endif 1931 1932 return (rpcerr.re_errno); 1933 } 1934 1935 int 1936 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1937 { 1938 uint_t mask = vap->va_mask; 1939 1940 if (!(mask & AT_MODE)) 1941 sa->sa_mode = (uint32_t)-1; 1942 else 1943 sa->sa_mode = vap->va_mode; 1944 if (!(mask & AT_UID)) 1945 sa->sa_uid = (uint32_t)-1; 1946 else 1947 sa->sa_uid = (uint32_t)vap->va_uid; 1948 if (!(mask & AT_GID)) 1949 sa->sa_gid = (uint32_t)-1; 1950 else 1951 sa->sa_gid = (uint32_t)vap->va_gid; 1952 if (!(mask & AT_SIZE)) 1953 sa->sa_size = 
(uint32_t)-1; 1954 else 1955 sa->sa_size = (uint32_t)vap->va_size; 1956 if (!(mask & AT_ATIME)) 1957 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 1958 else { 1959 /* check time validity */ 1960 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1961 return (EOVERFLOW); 1962 } 1963 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 1964 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 1965 } 1966 if (!(mask & AT_MTIME)) 1967 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 1968 else { 1969 /* check time validity */ 1970 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 1971 return (EOVERFLOW); 1972 } 1973 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 1974 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 1975 } 1976 return (0); 1977 } 1978 1979 int 1980 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 1981 { 1982 uint_t mask = vap->va_mask; 1983 1984 if (!(mask & AT_MODE)) 1985 sa->mode.set_it = FALSE; 1986 else { 1987 sa->mode.set_it = TRUE; 1988 sa->mode.mode = (mode3)vap->va_mode; 1989 } 1990 if (!(mask & AT_UID)) 1991 sa->uid.set_it = FALSE; 1992 else { 1993 sa->uid.set_it = TRUE; 1994 sa->uid.uid = (uid3)vap->va_uid; 1995 } 1996 if (!(mask & AT_GID)) 1997 sa->gid.set_it = FALSE; 1998 else { 1999 sa->gid.set_it = TRUE; 2000 sa->gid.gid = (gid3)vap->va_gid; 2001 } 2002 if (!(mask & AT_SIZE)) 2003 sa->size.set_it = FALSE; 2004 else { 2005 sa->size.set_it = TRUE; 2006 sa->size.size = (size3)vap->va_size; 2007 } 2008 if (!(mask & AT_ATIME)) 2009 sa->atime.set_it = DONT_CHANGE; 2010 else { 2011 /* check time validity */ 2012 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2013 return (EOVERFLOW); 2014 } 2015 sa->atime.set_it = SET_TO_CLIENT_TIME; 2016 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2017 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2018 } 2019 if (!(mask & AT_MTIME)) 2020 sa->mtime.set_it = DONT_CHANGE; 2021 else { 2022 /* check time validity */ 2023 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2024 return (EOVERFLOW); 2025 } 2026 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2027 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2028 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2029 } 2030 return (0); 2031 } 2032 2033 void 2034 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2035 { 2036 2037 da->da_fhandle = VTOFH(dvp); 2038 da->da_name = nm; 2039 da->da_flags = 0; 2040 } 2041 2042 void 2043 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2044 { 2045 2046 da->dirp = VTOFH3(dvp); 2047 da->name = nm; 2048 } 2049 2050 int 2051 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2052 { 2053 int error; 2054 rnode_t *rp; 2055 struct vattr va; 2056 2057 va.va_mask = AT_MODE | AT_GID; 2058 error = VOP_GETATTR(dvp, &va, 0, cr); 2059 if (error) 2060 return (error); 2061 2062 /* 2063 * To determine the expected group-id of the created file: 2064 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2065 * GRPID option, and the directory's set-gid bit is clear, 2066 * then use the process's gid. 2067 * 2) Otherwise, set the group-id to the gid of the parent directory. 2068 */ 2069 rp = VTOR(dvp); 2070 mutex_enter(&rp->r_statelock); 2071 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2072 *gidp = crgetgid(cr); 2073 else 2074 *gidp = va.va_gid; 2075 mutex_exit(&rp->r_statelock); 2076 return (0); 2077 } 2078 2079 int 2080 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2081 { 2082 int error; 2083 struct vattr va; 2084 2085 va.va_mask = AT_MODE; 2086 error = VOP_GETATTR(dvp, &va, 0, cr); 2087 if (error) 2088 return (error); 2089 2090 /* 2091 * Modify the expected mode (om) so that the set-gid bit matches 2092 * that of the parent directory (dvp). 
2093 */ 2094 if (va.va_mode & VSGID) 2095 *omp |= VSGID; 2096 else 2097 *omp &= ~VSGID; 2098 return (0); 2099 } 2100 2101 void 2102 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2103 { 2104 2105 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2106 if (!(vp->v_flag & VSWAPLIKE)) { 2107 mutex_enter(&vp->v_lock); 2108 vp->v_flag |= VSWAPLIKE; 2109 mutex_exit(&vp->v_lock); 2110 } 2111 } else { 2112 if (vp->v_flag & VSWAPLIKE) { 2113 mutex_enter(&vp->v_lock); 2114 vp->v_flag &= ~VSWAPLIKE; 2115 mutex_exit(&vp->v_lock); 2116 } 2117 } 2118 } 2119 2120 /* 2121 * Free the resources associated with an rnode. 2122 */ 2123 static void 2124 rinactive(rnode_t *rp, cred_t *cr) 2125 { 2126 vnode_t *vp; 2127 cred_t *cred; 2128 char *contents; 2129 int size; 2130 vsecattr_t *vsp; 2131 int error; 2132 nfs3_pathconf_info *info; 2133 2134 /* 2135 * Before freeing anything, wait until all asynchronous 2136 * activity is done on this rnode. This will allow all 2137 * asynchronous read ahead and write behind i/o's to 2138 * finish. 2139 */ 2140 mutex_enter(&rp->r_statelock); 2141 while (rp->r_count > 0) 2142 cv_wait(&rp->r_cv, &rp->r_statelock); 2143 mutex_exit(&rp->r_statelock); 2144 2145 /* 2146 * Flush and invalidate all pages associated with the vnode. 2147 */ 2148 vp = RTOV(rp); 2149 if (vn_has_cached_data(vp)) { 2150 ASSERT(vp->v_type != VCHR); 2151 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2152 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr); 2153 if (error && (error == ENOSPC || error == EDQUOT)) { 2154 mutex_enter(&rp->r_statelock); 2155 if (!rp->r_error) 2156 rp->r_error = error; 2157 mutex_exit(&rp->r_statelock); 2158 } 2159 } 2160 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2161 } 2162 2163 /* 2164 * Free any held credentials and caches which may be associated 2165 * with this rnode. 
2166 */ 2167 mutex_enter(&rp->r_statelock); 2168 cred = rp->r_cred; 2169 rp->r_cred = NULL; 2170 contents = rp->r_symlink.contents; 2171 size = rp->r_symlink.size; 2172 rp->r_symlink.contents = NULL; 2173 vsp = rp->r_secattr; 2174 rp->r_secattr = NULL; 2175 info = rp->r_pathconf; 2176 rp->r_pathconf = NULL; 2177 mutex_exit(&rp->r_statelock); 2178 2179 /* 2180 * Free the held credential. 2181 */ 2182 if (cred != NULL) 2183 crfree(cred); 2184 2185 /* 2186 * Free the access cache entries. 2187 */ 2188 (void) nfs_access_purge_rp(rp); 2189 2190 /* 2191 * Free the readdir cache entries. 2192 */ 2193 if (HAVE_RDDIR_CACHE(rp)) 2194 nfs_purge_rddir_cache(vp); 2195 2196 /* 2197 * Free the symbolic link cache. 2198 */ 2199 if (contents != NULL) { 2200 2201 kmem_free((void *)contents, size); 2202 } 2203 2204 /* 2205 * Free any cached ACL. 2206 */ 2207 if (vsp != NULL) 2208 nfs_acl_free(vsp); 2209 2210 /* 2211 * Free any cached pathconf information. 2212 */ 2213 if (info != NULL) 2214 kmem_free(info, sizeof (*info)); 2215 } 2216 2217 /* 2218 * Return a vnode for the given NFS Version 2 file handle. 2219 * If no rnode exists for this fhandle, create one and put it 2220 * into the hash queues. If the rnode for this fhandle 2221 * already exists, return it. 2222 * 2223 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
2224 */ 2225 vnode_t * 2226 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2227 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2228 { 2229 int newnode; 2230 int index; 2231 vnode_t *vp; 2232 nfs_fhandle nfh; 2233 vattr_t va; 2234 2235 nfh.fh_len = NFS_FHSIZE; 2236 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2237 2238 index = rtablehash(&nfh); 2239 rw_enter(&rtable[index].r_lock, RW_READER); 2240 2241 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2242 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2243 2244 if (attr != NULL) { 2245 if (!newnode) { 2246 rw_exit(&rtable[index].r_lock); 2247 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2248 } else { 2249 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2250 vp->v_type = VBAD; 2251 else 2252 vp->v_type = n2v_type(attr); 2253 /* 2254 * A translation here seems to be necessary 2255 * because this function can be called 2256 * with `attr' that has come from the wire, 2257 * and been operated on by vattr_to_nattr(). 2258 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2259 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2260 * ->makenfsnode(). 2261 */ 2262 if ((attr->na_rdev & 0xffff0000) == 0) 2263 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2264 else 2265 vp->v_rdev = expldev(n2v_rdev(attr)); 2266 nfs_attrcache(vp, attr, t); 2267 rw_exit(&rtable[index].r_lock); 2268 } 2269 } else { 2270 if (newnode) { 2271 PURGE_ATTRCACHE(vp); 2272 } 2273 rw_exit(&rtable[index].r_lock); 2274 } 2275 2276 return (vp); 2277 } 2278 2279 /* 2280 * Return a vnode for the given NFS Version 3 file handle. 2281 * If no rnode exists for this fhandle, create one and put it 2282 * into the hash queues. If the rnode for this fhandle 2283 * already exists, return it. 2284 * 2285 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
2286 */ 2287 vnode_t * 2288 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2289 cred_t *cr, char *dnm, char *nm) 2290 { 2291 int newnode; 2292 int index; 2293 vnode_t *vp; 2294 2295 index = rtablehash((nfs_fhandle *)fh); 2296 rw_enter(&rtable[index].r_lock, RW_READER); 2297 2298 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2299 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2300 dnm, nm); 2301 2302 if (vap == NULL) { 2303 if (newnode) { 2304 PURGE_ATTRCACHE(vp); 2305 } 2306 rw_exit(&rtable[index].r_lock); 2307 return (vp); 2308 } 2309 2310 if (!newnode) { 2311 rw_exit(&rtable[index].r_lock); 2312 nfs_attr_cache(vp, vap, t, cr); 2313 } else { 2314 rnode_t *rp = VTOR(vp); 2315 2316 vp->v_type = vap->va_type; 2317 vp->v_rdev = vap->va_rdev; 2318 2319 mutex_enter(&rp->r_statelock); 2320 if (rp->r_mtime <= t) 2321 nfs_attrcache_va(vp, vap); 2322 mutex_exit(&rp->r_statelock); 2323 rw_exit(&rtable[index].r_lock); 2324 } 2325 2326 return (vp); 2327 } 2328 2329 vnode_t * 2330 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2331 cred_t *cr, char *dnm, char *nm) 2332 { 2333 int newnode; 2334 int index; 2335 vnode_t *vp; 2336 vattr_t va; 2337 2338 index = rtablehash((nfs_fhandle *)fh); 2339 rw_enter(&rtable[index].r_lock, RW_READER); 2340 2341 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2342 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2343 dnm, nm); 2344 2345 if (attr == NULL) { 2346 if (newnode) { 2347 PURGE_ATTRCACHE(vp); 2348 } 2349 rw_exit(&rtable[index].r_lock); 2350 return (vp); 2351 } 2352 2353 if (!newnode) { 2354 rw_exit(&rtable[index].r_lock); 2355 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2356 } else { 2357 if (attr->type < NF3REG || attr->type > NF3FIFO) 2358 vp->v_type = VBAD; 2359 else 2360 vp->v_type = nf3_to_vt[attr->type]; 2361 vp->v_rdev = makedevice(attr->rdev.specdata1, 2362 attr->rdev.specdata2); 2363 nfs3_attrcache(vp, attr, t); 2364 
rw_exit(&rtable[index].r_lock); 2365 } 2366 2367 return (vp); 2368 } 2369 2370 /* 2371 * Read this comment before making changes to rtablehash()! 2372 * This is a hash function in which seemingly obvious and harmless 2373 * changes can cause escalations costing million dollars! 2374 * Know what you are doing. 2375 * 2376 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2377 * algorithm is currently detailed here: 2378 * 2379 * http://burtleburtle.net/bob/hash/doobs.html 2380 * 2381 * Of course, the above link may not be valid by the time you are reading 2382 * this, but suffice it to say that the one-at-a-time algorithm works well in 2383 * almost all cases. If you are changing the algorithm be sure to verify that 2384 * the hash algorithm still provides even distribution in all cases and with 2385 * any server returning filehandles in whatever order (sequential or random). 2386 */ 2387 static int 2388 rtablehash(nfs_fhandle *fh) 2389 { 2390 ulong_t hash, len, i; 2391 char *key; 2392 2393 key = fh->fh_buf; 2394 len = (ulong_t)fh->fh_len; 2395 for (hash = 0, i = 0; i < len; i++) { 2396 hash += key[i]; 2397 hash += (hash << 10); 2398 hash ^= (hash >> 6); 2399 } 2400 hash += (hash << 3); 2401 hash ^= (hash >> 11); 2402 hash += (hash << 15); 2403 return (hash & rtablemask); 2404 } 2405 2406 static vnode_t * 2407 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2408 struct vnodeops *vops, 2409 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2410 int (*compar)(const void *, const void *), 2411 int *newnode, cred_t *cr, char *dnm, char *nm) 2412 { 2413 rnode_t *rp; 2414 rnode_t *trp; 2415 vnode_t *vp; 2416 mntinfo_t *mi; 2417 2418 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2419 2420 mi = VFTOMI(vfsp); 2421 start: 2422 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2423 vp = RTOV(rp); 2424 nfs_set_vroot(vp); 2425 *newnode = 0; 2426 return (vp); 2427 } 2428 rw_exit(&rhtp->r_lock); 2429 2430 mutex_enter(&rpfreelist_lock); 
2431 if (rpfreelist != NULL && rnew >= nrnode) { 2432 rp = rpfreelist; 2433 rp_rmfree(rp); 2434 mutex_exit(&rpfreelist_lock); 2435 2436 vp = RTOV(rp); 2437 2438 if (rp->r_flags & RHASHED) { 2439 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2440 mutex_enter(&vp->v_lock); 2441 if (vp->v_count > 1) { 2442 vp->v_count--; 2443 mutex_exit(&vp->v_lock); 2444 rw_exit(&rp->r_hashq->r_lock); 2445 rw_enter(&rhtp->r_lock, RW_READER); 2446 goto start; 2447 } 2448 mutex_exit(&vp->v_lock); 2449 rp_rmhash_locked(rp); 2450 rw_exit(&rp->r_hashq->r_lock); 2451 } 2452 2453 rinactive(rp, cr); 2454 2455 mutex_enter(&vp->v_lock); 2456 if (vp->v_count > 1) { 2457 vp->v_count--; 2458 mutex_exit(&vp->v_lock); 2459 rw_enter(&rhtp->r_lock, RW_READER); 2460 goto start; 2461 } 2462 mutex_exit(&vp->v_lock); 2463 vn_invalid(vp); 2464 /* 2465 * destroy old locks before bzero'ing and 2466 * recreating the locks below. 2467 */ 2468 nfs_rw_destroy(&rp->r_rwlock); 2469 nfs_rw_destroy(&rp->r_lkserlock); 2470 mutex_destroy(&rp->r_statelock); 2471 cv_destroy(&rp->r_cv); 2472 cv_destroy(&rp->r_commit.c_cv); 2473 nfs_free_r_path(rp); 2474 avl_destroy(&rp->r_dir); 2475 /* 2476 * Make sure that if rnode is recycled then 2477 * VFS count is decremented properly before 2478 * reuse. 
2479 */ 2480 VFS_RELE(vp->v_vfsp); 2481 vn_reinit(vp); 2482 } else { 2483 vnode_t *new_vp; 2484 2485 mutex_exit(&rpfreelist_lock); 2486 2487 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2488 new_vp = vn_alloc(KM_SLEEP); 2489 2490 atomic_add_long((ulong_t *)&rnew, 1); 2491 #ifdef DEBUG 2492 clstat_debug.nrnode.value.ui64++; 2493 #endif 2494 vp = new_vp; 2495 } 2496 2497 bzero(rp, sizeof (*rp)); 2498 rp->r_vnode = vp; 2499 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2500 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2501 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2502 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2503 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2504 rp->r_fh.fh_len = fh->fh_len; 2505 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2506 rp->r_server = mi->mi_curr_serv; 2507 if (FAILOVER_MOUNT(mi)) { 2508 /* 2509 * If replicated servers, stash pathnames 2510 */ 2511 if (dnm != NULL && nm != NULL) { 2512 char *s, *p; 2513 uint_t len; 2514 2515 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2516 rp->r_path = kmem_alloc(len, KM_SLEEP); 2517 #ifdef DEBUG 2518 clstat_debug.rpath.value.ui64 += len; 2519 #endif 2520 s = rp->r_path; 2521 for (p = dnm; *p; p++) 2522 *s++ = *p; 2523 *s++ = '/'; 2524 for (p = nm; *p; p++) 2525 *s++ = *p; 2526 *s = '\0'; 2527 } else { 2528 /* special case for root */ 2529 rp->r_path = kmem_alloc(2, KM_SLEEP); 2530 #ifdef DEBUG 2531 clstat_debug.rpath.value.ui64 += 2; 2532 #endif 2533 *rp->r_path = '.'; 2534 *(rp->r_path + 1) = '\0'; 2535 } 2536 } 2537 VFS_HOLD(vfsp); 2538 rp->r_putapage = putapage; 2539 rp->r_hashq = rhtp; 2540 rp->r_flags = RREADDIRPLUS; 2541 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2542 offsetof(rddir_cache, tree)); 2543 vn_setops(vp, vops); 2544 vp->v_data = (caddr_t)rp; 2545 vp->v_vfsp = vfsp; 2546 vp->v_type = VNON; 2547 nfs_set_vroot(vp); 2548 2549 /* 2550 * There is a race condition if someone else 2551 * alloc's the rnode while no locks are held, so we 2552 
* check again and recover if found. 2553 */ 2554 rw_enter(&rhtp->r_lock, RW_WRITER); 2555 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2556 vp = RTOV(trp); 2557 nfs_set_vroot(vp); 2558 *newnode = 0; 2559 rw_exit(&rhtp->r_lock); 2560 rp_addfree(rp, cr); 2561 rw_enter(&rhtp->r_lock, RW_READER); 2562 return (vp); 2563 } 2564 rp_addhash(rp); 2565 *newnode = 1; 2566 return (vp); 2567 } 2568 2569 static void 2570 nfs_set_vroot(vnode_t *vp) 2571 { 2572 rnode_t *rp; 2573 nfs_fhandle *rootfh; 2574 2575 rp = VTOR(vp); 2576 rootfh = &rp->r_server->sv_fhandle; 2577 if (rootfh->fh_len == rp->r_fh.fh_len && 2578 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2579 if (!(vp->v_flag & VROOT)) { 2580 mutex_enter(&vp->v_lock); 2581 vp->v_flag |= VROOT; 2582 mutex_exit(&vp->v_lock); 2583 } 2584 } 2585 } 2586 2587 static void 2588 nfs_free_r_path(rnode_t *rp) 2589 { 2590 char *path; 2591 size_t len; 2592 2593 path = rp->r_path; 2594 if (path) { 2595 rp->r_path = NULL; 2596 len = strlen(path) + 1; 2597 kmem_free(path, len); 2598 #ifdef DEBUG 2599 clstat_debug.rpath.value.ui64 -= len; 2600 #endif 2601 } 2602 } 2603 2604 /* 2605 * Put an rnode on the free list. 2606 * 2607 * Rnodes which were allocated above and beyond the normal limit 2608 * are immediately freed. 2609 */ 2610 void 2611 rp_addfree(rnode_t *rp, cred_t *cr) 2612 { 2613 vnode_t *vp; 2614 struct vfs *vfsp; 2615 2616 vp = RTOV(rp); 2617 ASSERT(vp->v_count >= 1); 2618 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2619 2620 /* 2621 * If we have too many rnodes allocated and there are no 2622 * references to this rnode, or if the rnode is no longer 2623 * accessible by it does not reside in the hash queues, 2624 * or if an i/o error occurred while writing to the file, 2625 * then just free it instead of putting it on the rnode 2626 * freelist. 
2627 */ 2628 vfsp = vp->v_vfsp; 2629 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || 2630 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 2631 if (rp->r_flags & RHASHED) { 2632 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2633 mutex_enter(&vp->v_lock); 2634 if (vp->v_count > 1) { 2635 vp->v_count--; 2636 mutex_exit(&vp->v_lock); 2637 rw_exit(&rp->r_hashq->r_lock); 2638 return; 2639 } 2640 mutex_exit(&vp->v_lock); 2641 rp_rmhash_locked(rp); 2642 rw_exit(&rp->r_hashq->r_lock); 2643 } 2644 2645 rinactive(rp, cr); 2646 2647 /* 2648 * Recheck the vnode reference count. We need to 2649 * make sure that another reference has not been 2650 * acquired while we were not holding v_lock. The 2651 * rnode is not in the rnode hash queues, so the 2652 * only way for a reference to have been acquired 2653 * is for a VOP_PUTPAGE because the rnode was marked 2654 * with RDIRTY or for a modified page. This 2655 * reference may have been acquired before our call 2656 * to rinactive. The i/o may have been completed, 2657 * thus allowing rinactive to complete, but the 2658 * reference to the vnode may not have been released 2659 * yet. In any case, the rnode can not be destroyed 2660 * until the other references to this vnode have been 2661 * released. The other references will take care of 2662 * either destroying the rnode or placing it on the 2663 * rnode freelist. If there are no other references, 2664 * then the rnode may be safely destroyed. 2665 */ 2666 mutex_enter(&vp->v_lock); 2667 if (vp->v_count > 1) { 2668 vp->v_count--; 2669 mutex_exit(&vp->v_lock); 2670 return; 2671 } 2672 mutex_exit(&vp->v_lock); 2673 2674 destroy_rnode(rp); 2675 return; 2676 } 2677 2678 /* 2679 * Lock the hash queue and then recheck the reference count 2680 * to ensure that no other threads have acquired a reference 2681 * to indicate that the rnode should not be placed on the 2682 * freelist. 
If another reference has been acquired, then 2683 * just release this one and let the other thread complete 2684 * the processing of adding this rnode to the freelist. 2685 */ 2686 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2687 2688 mutex_enter(&vp->v_lock); 2689 if (vp->v_count > 1) { 2690 vp->v_count--; 2691 mutex_exit(&vp->v_lock); 2692 rw_exit(&rp->r_hashq->r_lock); 2693 return; 2694 } 2695 mutex_exit(&vp->v_lock); 2696 2697 /* 2698 * If there is no cached data or metadata for this file, then 2699 * put the rnode on the front of the freelist so that it will 2700 * be reused before other rnodes which may have cached data or 2701 * metadata associated with them. 2702 */ 2703 mutex_enter(&rpfreelist_lock); 2704 if (rpfreelist == NULL) { 2705 rp->r_freef = rp; 2706 rp->r_freeb = rp; 2707 rpfreelist = rp; 2708 } else { 2709 rp->r_freef = rpfreelist; 2710 rp->r_freeb = rpfreelist->r_freeb; 2711 rpfreelist->r_freeb->r_freef = rp; 2712 rpfreelist->r_freeb = rp; 2713 if (!vn_has_cached_data(vp) && 2714 !HAVE_RDDIR_CACHE(rp) && 2715 rp->r_symlink.contents == NULL && 2716 rp->r_secattr == NULL && 2717 rp->r_pathconf == NULL) 2718 rpfreelist = rp; 2719 } 2720 mutex_exit(&rpfreelist_lock); 2721 2722 rw_exit(&rp->r_hashq->r_lock); 2723 } 2724 2725 /* 2726 * Remove an rnode from the free list. 2727 * 2728 * The caller must be holding rpfreelist_lock and the rnode 2729 * must be on the freelist. 2730 */ 2731 static void 2732 rp_rmfree(rnode_t *rp) 2733 { 2734 2735 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2736 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2737 2738 if (rp == rpfreelist) { 2739 rpfreelist = rp->r_freef; 2740 if (rp == rpfreelist) 2741 rpfreelist = NULL; 2742 } 2743 2744 rp->r_freeb->r_freef = rp->r_freef; 2745 rp->r_freef->r_freeb = rp->r_freeb; 2746 2747 rp->r_freef = rp->r_freeb = NULL; 2748 } 2749 2750 /* 2751 * Put a rnode in the hash table. 2752 * 2753 * The caller must be holding the exclusive hash queue lock. 
2754 */ 2755 static void 2756 rp_addhash(rnode_t *rp) 2757 { 2758 2759 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2760 ASSERT(!(rp->r_flags & RHASHED)); 2761 2762 rp->r_hashf = rp->r_hashq->r_hashf; 2763 rp->r_hashq->r_hashf = rp; 2764 rp->r_hashb = (rnode_t *)rp->r_hashq; 2765 rp->r_hashf->r_hashb = rp; 2766 2767 mutex_enter(&rp->r_statelock); 2768 rp->r_flags |= RHASHED; 2769 mutex_exit(&rp->r_statelock); 2770 } 2771 2772 /* 2773 * Remove a rnode from the hash table. 2774 * 2775 * The caller must be holding the hash queue lock. 2776 */ 2777 static void 2778 rp_rmhash_locked(rnode_t *rp) 2779 { 2780 2781 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2782 ASSERT(rp->r_flags & RHASHED); 2783 2784 rp->r_hashb->r_hashf = rp->r_hashf; 2785 rp->r_hashf->r_hashb = rp->r_hashb; 2786 2787 mutex_enter(&rp->r_statelock); 2788 rp->r_flags &= ~RHASHED; 2789 mutex_exit(&rp->r_statelock); 2790 } 2791 2792 /* 2793 * Remove a rnode from the hash table. 2794 * 2795 * The caller must not be holding the hash queue lock. 2796 */ 2797 void 2798 rp_rmhash(rnode_t *rp) 2799 { 2800 2801 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2802 rp_rmhash_locked(rp); 2803 rw_exit(&rp->r_hashq->r_lock); 2804 } 2805 2806 /* 2807 * Lookup a rnode by fhandle. 2808 * 2809 * The caller must be holding the hash queue lock, either shared or exclusive. 2810 */ 2811 static rnode_t * 2812 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2813 { 2814 rnode_t *rp; 2815 vnode_t *vp; 2816 2817 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2818 2819 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2820 vp = RTOV(rp); 2821 if (vp->v_vfsp == vfsp && 2822 rp->r_fh.fh_len == fh->fh_len && 2823 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2824 /* 2825 * remove rnode from free list, if necessary. 2826 */ 2827 if (rp->r_freef != NULL) { 2828 mutex_enter(&rpfreelist_lock); 2829 /* 2830 * If the rnode is on the freelist, 2831 * then remove it and use that reference 2832 * as the new reference. 
Otherwise, 2833 * need to increment the reference count. 2834 */ 2835 if (rp->r_freef != NULL) { 2836 rp_rmfree(rp); 2837 mutex_exit(&rpfreelist_lock); 2838 } else { 2839 mutex_exit(&rpfreelist_lock); 2840 VN_HOLD(vp); 2841 } 2842 } else 2843 VN_HOLD(vp); 2844 return (rp); 2845 } 2846 } 2847 return (NULL); 2848 } 2849 2850 /* 2851 * Return 1 if there is a active vnode belonging to this vfs in the 2852 * rtable cache. 2853 * 2854 * Several of these checks are done without holding the usual 2855 * locks. This is safe because destroy_rtable(), rp_addfree(), 2856 * etc. will redo the necessary checks before actually destroying 2857 * any rnodes. 2858 */ 2859 int 2860 check_rtable(struct vfs *vfsp) 2861 { 2862 int index; 2863 rnode_t *rp; 2864 vnode_t *vp; 2865 2866 for (index = 0; index < rtablesize; index++) { 2867 rw_enter(&rtable[index].r_lock, RW_READER); 2868 for (rp = rtable[index].r_hashf; 2869 rp != (rnode_t *)(&rtable[index]); 2870 rp = rp->r_hashf) { 2871 vp = RTOV(rp); 2872 if (vp->v_vfsp == vfsp) { 2873 if (rp->r_freef == NULL || 2874 (vn_has_cached_data(vp) && 2875 (rp->r_flags & RDIRTY)) || 2876 rp->r_count > 0) { 2877 rw_exit(&rtable[index].r_lock); 2878 return (1); 2879 } 2880 } 2881 } 2882 rw_exit(&rtable[index].r_lock); 2883 } 2884 return (0); 2885 } 2886 2887 /* 2888 * Destroy inactive vnodes from the hash queues which belong to this 2889 * vfs. It is essential that we destroy all inactive vnodes during a 2890 * forced unmount as well as during a normal unmount. 
2891 */ 2892 void 2893 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2894 { 2895 int index; 2896 rnode_t *rp; 2897 rnode_t *rlist; 2898 rnode_t *r_hashf; 2899 vnode_t *vp; 2900 2901 rlist = NULL; 2902 2903 for (index = 0; index < rtablesize; index++) { 2904 rw_enter(&rtable[index].r_lock, RW_WRITER); 2905 for (rp = rtable[index].r_hashf; 2906 rp != (rnode_t *)(&rtable[index]); 2907 rp = r_hashf) { 2908 /* save the hash pointer before destroying */ 2909 r_hashf = rp->r_hashf; 2910 vp = RTOV(rp); 2911 if (vp->v_vfsp == vfsp) { 2912 mutex_enter(&rpfreelist_lock); 2913 if (rp->r_freef != NULL) { 2914 rp_rmfree(rp); 2915 mutex_exit(&rpfreelist_lock); 2916 rp_rmhash_locked(rp); 2917 rp->r_hashf = rlist; 2918 rlist = rp; 2919 } else 2920 mutex_exit(&rpfreelist_lock); 2921 } 2922 } 2923 rw_exit(&rtable[index].r_lock); 2924 } 2925 2926 for (rp = rlist; rp != NULL; rp = rlist) { 2927 rlist = rp->r_hashf; 2928 /* 2929 * This call to rp_addfree will end up destroying the 2930 * rnode, but in a safe way with the appropriate set 2931 * of checks done. 2932 */ 2933 rp_addfree(rp, cr); 2934 } 2935 2936 } 2937 2938 /* 2939 * This routine destroys all the resources associated with the rnode 2940 * and then the rnode itself. 
2941 */ 2942 static void 2943 destroy_rnode(rnode_t *rp) 2944 { 2945 vnode_t *vp; 2946 vfs_t *vfsp; 2947 2948 vp = RTOV(rp); 2949 vfsp = vp->v_vfsp; 2950 2951 ASSERT(vp->v_count == 1); 2952 ASSERT(rp->r_count == 0); 2953 ASSERT(rp->r_lmpl == NULL); 2954 ASSERT(rp->r_mapcnt == 0); 2955 ASSERT(!(rp->r_flags & RHASHED)); 2956 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2957 atomic_add_long((ulong_t *)&rnew, -1); 2958 #ifdef DEBUG 2959 clstat_debug.nrnode.value.ui64--; 2960 #endif 2961 nfs_rw_destroy(&rp->r_rwlock); 2962 nfs_rw_destroy(&rp->r_lkserlock); 2963 mutex_destroy(&rp->r_statelock); 2964 cv_destroy(&rp->r_cv); 2965 cv_destroy(&rp->r_commit.c_cv); 2966 if (rp->r_flags & RDELMAPLIST) 2967 list_destroy(&rp->r_indelmap); 2968 nfs_free_r_path(rp); 2969 avl_destroy(&rp->r_dir); 2970 vn_invalid(vp); 2971 vn_free(vp); 2972 kmem_cache_free(rnode_cache, rp); 2973 VFS_RELE(vfsp); 2974 } 2975 2976 /* 2977 * Flush all vnodes in this (or every) vfs. 2978 * Used by nfs_sync and by nfs_unmount. 2979 */ 2980 void 2981 rflush(struct vfs *vfsp, cred_t *cr) 2982 { 2983 int index; 2984 rnode_t *rp; 2985 vnode_t *vp, **vplist; 2986 long num, cnt; 2987 2988 /* 2989 * Check to see whether there is anything to do. 2990 */ 2991 num = rnew; 2992 if (num == 0) 2993 return; 2994 2995 /* 2996 * Allocate a slot for all currently active rnodes on the 2997 * supposition that they all may need flushing. 2998 */ 2999 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3000 cnt = 0; 3001 3002 /* 3003 * Walk the hash queues looking for rnodes with page 3004 * lists associated with them. Make a list of these 3005 * files. 
3006 */ 3007 for (index = 0; index < rtablesize; index++) { 3008 rw_enter(&rtable[index].r_lock, RW_READER); 3009 for (rp = rtable[index].r_hashf; 3010 rp != (rnode_t *)(&rtable[index]); 3011 rp = rp->r_hashf) { 3012 vp = RTOV(rp); 3013 /* 3014 * Don't bother sync'ing a vp if it 3015 * is part of virtual swap device or 3016 * if VFS is read-only 3017 */ 3018 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3019 continue; 3020 /* 3021 * If flushing all mounted file systems or 3022 * the vnode belongs to this vfs, has pages 3023 * and is marked as either dirty or mmap'd, 3024 * hold and add this vnode to the list of 3025 * vnodes to flush. 3026 */ 3027 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3028 vn_has_cached_data(vp) && 3029 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3030 VN_HOLD(vp); 3031 vplist[cnt++] = vp; 3032 if (cnt == num) { 3033 rw_exit(&rtable[index].r_lock); 3034 goto toomany; 3035 } 3036 } 3037 } 3038 rw_exit(&rtable[index].r_lock); 3039 } 3040 toomany: 3041 3042 /* 3043 * Flush and release all of the files on the list. 3044 */ 3045 while (cnt-- > 0) { 3046 vp = vplist[cnt]; 3047 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr); 3048 VN_RELE(vp); 3049 } 3050 3051 /* 3052 * Free the space allocated to hold the list. 3053 */ 3054 kmem_free(vplist, num * sizeof (*vplist)); 3055 } 3056 3057 /* 3058 * This probably needs to be larger than or equal to 3059 * log2(sizeof (struct rnode)) due to the way that rnodes are 3060 * allocated. 
3061 */ 3062 #define ACACHE_SHIFT_BITS 9 3063 3064 static int 3065 acachehash(rnode_t *rp, cred_t *cr) 3066 { 3067 3068 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3069 acachemask); 3070 } 3071 3072 #ifdef DEBUG 3073 static long nfs_access_cache_hits = 0; 3074 static long nfs_access_cache_misses = 0; 3075 #endif 3076 3077 nfs_access_type_t 3078 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3079 { 3080 vnode_t *vp; 3081 acache_t *ap; 3082 acache_hash_t *hp; 3083 nfs_access_type_t all; 3084 3085 vp = RTOV(rp); 3086 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3087 return (NFS_ACCESS_UNKNOWN); 3088 3089 if (rp->r_acache != NULL) { 3090 hp = &acache[acachehash(rp, cr)]; 3091 rw_enter(&hp->lock, RW_READER); 3092 ap = hp->next; 3093 while (ap != (acache_t *)hp) { 3094 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3095 if ((ap->known & acc) == acc) { 3096 #ifdef DEBUG 3097 nfs_access_cache_hits++; 3098 #endif 3099 if ((ap->allowed & acc) == acc) 3100 all = NFS_ACCESS_ALLOWED; 3101 else 3102 all = NFS_ACCESS_DENIED; 3103 } else { 3104 #ifdef DEBUG 3105 nfs_access_cache_misses++; 3106 #endif 3107 all = NFS_ACCESS_UNKNOWN; 3108 } 3109 rw_exit(&hp->lock); 3110 return (all); 3111 } 3112 ap = ap->next; 3113 } 3114 rw_exit(&hp->lock); 3115 } 3116 3117 #ifdef DEBUG 3118 nfs_access_cache_misses++; 3119 #endif 3120 return (NFS_ACCESS_UNKNOWN); 3121 } 3122 3123 void 3124 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3125 { 3126 acache_t *ap; 3127 acache_t *nap; 3128 acache_hash_t *hp; 3129 3130 hp = &acache[acachehash(rp, cr)]; 3131 3132 /* 3133 * Allocate now assuming that mostly an allocation will be 3134 * required. This allows the allocation to happen without 3135 * holding the hash bucket locked. 
3136 */ 3137 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3138 if (nap != NULL) { 3139 nap->known = acc; 3140 nap->allowed = resacc; 3141 nap->rnode = rp; 3142 crhold(cr); 3143 nap->cred = cr; 3144 nap->hashq = hp; 3145 } 3146 3147 rw_enter(&hp->lock, RW_WRITER); 3148 3149 if (rp->r_acache != NULL) { 3150 ap = hp->next; 3151 while (ap != (acache_t *)hp) { 3152 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3153 ap->known |= acc; 3154 ap->allowed &= ~acc; 3155 ap->allowed |= resacc; 3156 rw_exit(&hp->lock); 3157 if (nap != NULL) { 3158 crfree(nap->cred); 3159 kmem_cache_free(acache_cache, nap); 3160 } 3161 return; 3162 } 3163 ap = ap->next; 3164 } 3165 } 3166 3167 if (nap != NULL) { 3168 #ifdef DEBUG 3169 clstat_debug.access.value.ui64++; 3170 #endif 3171 nap->next = hp->next; 3172 hp->next = nap; 3173 nap->next->prev = nap; 3174 nap->prev = (acache_t *)hp; 3175 3176 mutex_enter(&rp->r_statelock); 3177 nap->list = rp->r_acache; 3178 rp->r_acache = nap; 3179 mutex_exit(&rp->r_statelock); 3180 } 3181 3182 rw_exit(&hp->lock); 3183 } 3184 3185 int 3186 nfs_access_purge_rp(rnode_t *rp) 3187 { 3188 acache_t *ap; 3189 acache_t *tmpap; 3190 acache_t *rplist; 3191 3192 /* 3193 * If there aren't any cached entries, then there is nothing 3194 * to free. 3195 */ 3196 if (rp->r_acache == NULL) 3197 return (0); 3198 3199 mutex_enter(&rp->r_statelock); 3200 rplist = rp->r_acache; 3201 rp->r_acache = NULL; 3202 mutex_exit(&rp->r_statelock); 3203 3204 /* 3205 * Loop through each entry in the list pointed to in the 3206 * rnode. Remove each of these entries from the hash 3207 * queue that it is on and remove it from the list in 3208 * the rnode. 
3209 */ 3210 for (ap = rplist; ap != NULL; ap = tmpap) { 3211 rw_enter(&ap->hashq->lock, RW_WRITER); 3212 ap->prev->next = ap->next; 3213 ap->next->prev = ap->prev; 3214 rw_exit(&ap->hashq->lock); 3215 3216 tmpap = ap->list; 3217 crfree(ap->cred); 3218 kmem_cache_free(acache_cache, ap); 3219 #ifdef DEBUG 3220 clstat_debug.access.value.ui64--; 3221 #endif 3222 } 3223 3224 return (1); 3225 } 3226 3227 static const char prefix[] = ".nfs"; 3228 3229 static kmutex_t newnum_lock; 3230 3231 int 3232 newnum(void) 3233 { 3234 static uint_t newnum = 0; 3235 uint_t id; 3236 3237 mutex_enter(&newnum_lock); 3238 if (newnum == 0) 3239 newnum = gethrestime_sec() & 0xffff; 3240 id = newnum++; 3241 mutex_exit(&newnum_lock); 3242 return (id); 3243 } 3244 3245 char * 3246 newname(void) 3247 { 3248 char *news; 3249 char *s; 3250 const char *p; 3251 uint_t id; 3252 3253 id = newnum(); 3254 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3255 s = news; 3256 p = prefix; 3257 while (*p != '\0') 3258 *s++ = *p++; 3259 while (id != 0) { 3260 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3261 id >>= 4; 3262 } 3263 *s = '\0'; 3264 return (news); 3265 } 3266 3267 int 3268 nfs_atoi(char *cp) 3269 { 3270 int n; 3271 3272 n = 0; 3273 while (*cp != '\0') { 3274 n = n * 10 + (*cp - '0'); 3275 cp++; 3276 } 3277 3278 return (n); 3279 } 3280 3281 /* 3282 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3283 * framework. 3284 */ 3285 static int 3286 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3287 { 3288 ksp->ks_snaptime = gethrtime(); 3289 if (rw == KSTAT_WRITE) { 3290 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3291 #ifdef DEBUG 3292 /* 3293 * Currently only the global zone can write to kstats, but we 3294 * add the check just for paranoia. 
3295 */ 3296 if (INGLOBALZONE(curproc)) 3297 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3298 sizeof (clstat_debug)); 3299 #endif 3300 } else { 3301 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3302 #ifdef DEBUG 3303 /* 3304 * If we're displaying the "global" debug kstat values, we 3305 * display them as-is to all zones since in fact they apply to 3306 * the system as a whole. 3307 */ 3308 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3309 sizeof (clstat_debug)); 3310 #endif 3311 } 3312 return (0); 3313 } 3314 3315 static void * 3316 clinit_zone(zoneid_t zoneid) 3317 { 3318 kstat_t *nfs_client_kstat; 3319 struct nfs_clnt *nfscl; 3320 uint_t ndata; 3321 3322 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3323 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3324 nfscl->nfscl_chtable = NULL; 3325 nfscl->nfscl_zoneid = zoneid; 3326 3327 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3328 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3329 #ifdef DEBUG 3330 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3331 #endif 3332 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3333 "misc", KSTAT_TYPE_NAMED, ndata, 3334 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3335 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3336 nfs_client_kstat->ks_snapshot = cl_snapshot; 3337 kstat_install(nfs_client_kstat); 3338 } 3339 mutex_enter(&nfs_clnt_list_lock); 3340 list_insert_head(&nfs_clnt_list, nfscl); 3341 mutex_exit(&nfs_clnt_list_lock); 3342 return (nfscl); 3343 } 3344 3345 /*ARGSUSED*/ 3346 static void 3347 clfini_zone(zoneid_t zoneid, void *arg) 3348 { 3349 struct nfs_clnt *nfscl = arg; 3350 chhead_t *chp, *next; 3351 3352 if (nfscl == NULL) 3353 return; 3354 mutex_enter(&nfs_clnt_list_lock); 3355 list_remove(&nfs_clnt_list, nfscl); 3356 mutex_exit(&nfs_clnt_list_lock); 3357 clreclaim_zone(nfscl, 0); 3358 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 
3359 ASSERT(chp->ch_list == NULL); 3360 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3361 next = chp->ch_next; 3362 kmem_free(chp, sizeof (*chp)); 3363 } 3364 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3365 mutex_destroy(&nfscl->nfscl_chtable_lock); 3366 kmem_free(nfscl, sizeof (*nfscl)); 3367 } 3368 3369 /* 3370 * Called by endpnt_destructor to make sure the client handles are 3371 * cleaned up before the RPC endpoints. This becomes a no-op if 3372 * clfini_zone (above) is called first. This function is needed 3373 * (rather than relying on clfini_zone to clean up) because the ZSD 3374 * callbacks have no ordering mechanism, so we have no way to ensure 3375 * that clfini_zone is called before endpnt_destructor. 3376 */ 3377 void 3378 clcleanup_zone(zoneid_t zoneid) 3379 { 3380 struct nfs_clnt *nfscl; 3381 3382 mutex_enter(&nfs_clnt_list_lock); 3383 nfscl = list_head(&nfs_clnt_list); 3384 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3385 if (nfscl->nfscl_zoneid == zoneid) { 3386 clreclaim_zone(nfscl, 0); 3387 break; 3388 } 3389 } 3390 mutex_exit(&nfs_clnt_list_lock); 3391 } 3392 3393 int 3394 nfs_subrinit(void) 3395 { 3396 int i; 3397 ulong_t nrnode_max; 3398 3399 /* 3400 * Allocate and initialize the rnode hash queues 3401 */ 3402 if (nrnode <= 0) 3403 nrnode = ncsize; 3404 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3405 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3406 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3407 "setting nrnode to max value of %ld", nrnode_max); 3408 nrnode = nrnode_max; 3409 } 3410 3411 rtablesize = 1 << highbit(nrnode / hashlen); 3412 rtablemask = rtablesize - 1; 3413 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3414 for (i = 0; i < rtablesize; i++) { 3415 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3416 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3417 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3418 } 3419 rnode_cache = 
kmem_cache_create("rnode_cache", sizeof (rnode_t), 3420 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3421 3422 /* 3423 * Allocate and initialize the access cache 3424 */ 3425 3426 /* 3427 * Initial guess is one access cache entry per rnode unless 3428 * nacache is set to a non-zero value and then it is used to 3429 * indicate a guess at the number of access cache entries. 3430 */ 3431 if (nacache > 0) 3432 acachesize = 1 << highbit(nacache / hashlen); 3433 else 3434 acachesize = rtablesize; 3435 acachemask = acachesize - 1; 3436 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3437 for (i = 0; i < acachesize; i++) { 3438 acache[i].next = (acache_t *)&acache[i]; 3439 acache[i].prev = (acache_t *)&acache[i]; 3440 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3441 } 3442 acache_cache = kmem_cache_create("nfs_access_cache", 3443 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3444 /* 3445 * Allocate and initialize the client handle cache 3446 */ 3447 chtab_cache = kmem_cache_create("client_handle_cache", 3448 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, 3449 NULL, 0); 3450 /* 3451 * Initialize the list of per-zone client handles (and associated data). 3452 * This needs to be done before we call zone_key_create(). 3453 */ 3454 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3455 offsetof(struct nfs_clnt, nfscl_node)); 3456 /* 3457 * Initialize the zone_key for per-zone client handle lists. 
3458 */ 3459 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3460 /* 3461 * Initialize the various mutexes and reader/writer locks 3462 */ 3463 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3464 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3465 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3466 3467 /* 3468 * Assign unique major number for all nfs mounts 3469 */ 3470 if ((nfs_major = getudev()) == -1) { 3471 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3472 "nfs: init: can't get unique device number"); 3473 nfs_major = 0; 3474 } 3475 nfs_minor = 0; 3476 3477 if (nfs3_jukebox_delay == 0) 3478 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3479 3480 return (0); 3481 } 3482 3483 void 3484 nfs_subrfini(void) 3485 { 3486 int i; 3487 3488 /* 3489 * Deallocate the rnode hash queues 3490 */ 3491 kmem_cache_destroy(rnode_cache); 3492 3493 for (i = 0; i < rtablesize; i++) 3494 rw_destroy(&rtable[i].r_lock); 3495 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3496 3497 /* 3498 * Deallocated the access cache 3499 */ 3500 kmem_cache_destroy(acache_cache); 3501 3502 for (i = 0; i < acachesize; i++) 3503 rw_destroy(&acache[i].lock); 3504 kmem_free(acache, acachesize * sizeof (*acache)); 3505 3506 /* 3507 * Deallocate the client handle cache 3508 */ 3509 kmem_cache_destroy(chtab_cache); 3510 3511 /* 3512 * Destroy the various mutexes and reader/writer locks 3513 */ 3514 mutex_destroy(&rpfreelist_lock); 3515 mutex_destroy(&newnum_lock); 3516 mutex_destroy(&nfs_minor_lock); 3517 (void) zone_key_delete(nfsclnt_zone_key); 3518 } 3519 3520 enum nfsstat 3521 puterrno(int error) 3522 { 3523 3524 switch (error) { 3525 case EOPNOTSUPP: 3526 return (NFSERR_OPNOTSUPP); 3527 case ENAMETOOLONG: 3528 return (NFSERR_NAMETOOLONG); 3529 case ENOTEMPTY: 3530 return (NFSERR_NOTEMPTY); 3531 case EDQUOT: 3532 return (NFSERR_DQUOT); 3533 case ESTALE: 3534 return (NFSERR_STALE); 3535 case EREMOTE: 3536 return (NFSERR_REMOTE); 3537 case ENOSYS: 3538 return 
(NFSERR_OPNOTSUPP); 3539 case EOVERFLOW: 3540 return (NFSERR_INVAL); 3541 default: 3542 return ((enum nfsstat)error); 3543 } 3544 /* NOTREACHED */ 3545 } 3546 3547 int 3548 geterrno(enum nfsstat status) 3549 { 3550 3551 switch (status) { 3552 case NFSERR_OPNOTSUPP: 3553 return (EOPNOTSUPP); 3554 case NFSERR_NAMETOOLONG: 3555 return (ENAMETOOLONG); 3556 case NFSERR_NOTEMPTY: 3557 return (ENOTEMPTY); 3558 case NFSERR_DQUOT: 3559 return (EDQUOT); 3560 case NFSERR_STALE: 3561 return (ESTALE); 3562 case NFSERR_REMOTE: 3563 return (EREMOTE); 3564 case NFSERR_WFLUSH: 3565 return (EIO); 3566 default: 3567 return ((int)status); 3568 } 3569 /* NOTREACHED */ 3570 } 3571 3572 enum nfsstat3 3573 puterrno3(int error) 3574 { 3575 3576 #ifdef DEBUG 3577 switch (error) { 3578 case 0: 3579 return (NFS3_OK); 3580 case EPERM: 3581 return (NFS3ERR_PERM); 3582 case ENOENT: 3583 return (NFS3ERR_NOENT); 3584 case EIO: 3585 return (NFS3ERR_IO); 3586 case ENXIO: 3587 return (NFS3ERR_NXIO); 3588 case EACCES: 3589 return (NFS3ERR_ACCES); 3590 case EEXIST: 3591 return (NFS3ERR_EXIST); 3592 case EXDEV: 3593 return (NFS3ERR_XDEV); 3594 case ENODEV: 3595 return (NFS3ERR_NODEV); 3596 case ENOTDIR: 3597 return (NFS3ERR_NOTDIR); 3598 case EISDIR: 3599 return (NFS3ERR_ISDIR); 3600 case EINVAL: 3601 return (NFS3ERR_INVAL); 3602 case EFBIG: 3603 return (NFS3ERR_FBIG); 3604 case ENOSPC: 3605 return (NFS3ERR_NOSPC); 3606 case EROFS: 3607 return (NFS3ERR_ROFS); 3608 case EMLINK: 3609 return (NFS3ERR_MLINK); 3610 case ENAMETOOLONG: 3611 return (NFS3ERR_NAMETOOLONG); 3612 case ENOTEMPTY: 3613 return (NFS3ERR_NOTEMPTY); 3614 case EDQUOT: 3615 return (NFS3ERR_DQUOT); 3616 case ESTALE: 3617 return (NFS3ERR_STALE); 3618 case EREMOTE: 3619 return (NFS3ERR_REMOTE); 3620 case EOPNOTSUPP: 3621 return (NFS3ERR_NOTSUPP); 3622 case EOVERFLOW: 3623 return (NFS3ERR_INVAL); 3624 default: 3625 zcmn_err(getzoneid(), CE_WARN, 3626 "puterrno3: got error %d", error); 3627 return ((enum nfsstat3)error); 3628 } 3629 #else 3630 
switch (error) { 3631 case ENAMETOOLONG: 3632 return (NFS3ERR_NAMETOOLONG); 3633 case ENOTEMPTY: 3634 return (NFS3ERR_NOTEMPTY); 3635 case EDQUOT: 3636 return (NFS3ERR_DQUOT); 3637 case ESTALE: 3638 return (NFS3ERR_STALE); 3639 case EOPNOTSUPP: 3640 return (NFS3ERR_NOTSUPP); 3641 case EREMOTE: 3642 return (NFS3ERR_REMOTE); 3643 case EOVERFLOW: 3644 return (NFS3ERR_INVAL); 3645 default: 3646 return ((enum nfsstat3)error); 3647 } 3648 #endif 3649 } 3650 3651 int 3652 geterrno3(enum nfsstat3 status) 3653 { 3654 3655 #ifdef DEBUG 3656 switch (status) { 3657 case NFS3_OK: 3658 return (0); 3659 case NFS3ERR_PERM: 3660 return (EPERM); 3661 case NFS3ERR_NOENT: 3662 return (ENOENT); 3663 case NFS3ERR_IO: 3664 return (EIO); 3665 case NFS3ERR_NXIO: 3666 return (ENXIO); 3667 case NFS3ERR_ACCES: 3668 return (EACCES); 3669 case NFS3ERR_EXIST: 3670 return (EEXIST); 3671 case NFS3ERR_XDEV: 3672 return (EXDEV); 3673 case NFS3ERR_NODEV: 3674 return (ENODEV); 3675 case NFS3ERR_NOTDIR: 3676 return (ENOTDIR); 3677 case NFS3ERR_ISDIR: 3678 return (EISDIR); 3679 case NFS3ERR_INVAL: 3680 return (EINVAL); 3681 case NFS3ERR_FBIG: 3682 return (EFBIG); 3683 case NFS3ERR_NOSPC: 3684 return (ENOSPC); 3685 case NFS3ERR_ROFS: 3686 return (EROFS); 3687 case NFS3ERR_MLINK: 3688 return (EMLINK); 3689 case NFS3ERR_NAMETOOLONG: 3690 return (ENAMETOOLONG); 3691 case NFS3ERR_NOTEMPTY: 3692 return (ENOTEMPTY); 3693 case NFS3ERR_DQUOT: 3694 return (EDQUOT); 3695 case NFS3ERR_STALE: 3696 return (ESTALE); 3697 case NFS3ERR_REMOTE: 3698 return (EREMOTE); 3699 case NFS3ERR_BADHANDLE: 3700 return (ESTALE); 3701 case NFS3ERR_NOT_SYNC: 3702 return (EINVAL); 3703 case NFS3ERR_BAD_COOKIE: 3704 return (ENOENT); 3705 case NFS3ERR_NOTSUPP: 3706 return (EOPNOTSUPP); 3707 case NFS3ERR_TOOSMALL: 3708 return (EINVAL); 3709 case NFS3ERR_SERVERFAULT: 3710 return (EIO); 3711 case NFS3ERR_BADTYPE: 3712 return (EINVAL); 3713 case NFS3ERR_JUKEBOX: 3714 return (ENXIO); 3715 default: 3716 zcmn_err(getzoneid(), CE_WARN, 3717 
"geterrno3: got status %d", status); 3718 return ((int)status); 3719 } 3720 #else 3721 switch (status) { 3722 case NFS3ERR_NAMETOOLONG: 3723 return (ENAMETOOLONG); 3724 case NFS3ERR_NOTEMPTY: 3725 return (ENOTEMPTY); 3726 case NFS3ERR_DQUOT: 3727 return (EDQUOT); 3728 case NFS3ERR_STALE: 3729 case NFS3ERR_BADHANDLE: 3730 return (ESTALE); 3731 case NFS3ERR_NOTSUPP: 3732 return (EOPNOTSUPP); 3733 case NFS3ERR_REMOTE: 3734 return (EREMOTE); 3735 case NFS3ERR_NOT_SYNC: 3736 case NFS3ERR_TOOSMALL: 3737 case NFS3ERR_BADTYPE: 3738 return (EINVAL); 3739 case NFS3ERR_BAD_COOKIE: 3740 return (ENOENT); 3741 case NFS3ERR_SERVERFAULT: 3742 return (EIO); 3743 case NFS3ERR_JUKEBOX: 3744 return (ENXIO); 3745 default: 3746 return ((int)status); 3747 } 3748 #endif 3749 } 3750 3751 rddir_cache * 3752 rddir_cache_alloc(int flags) 3753 { 3754 rddir_cache *rc; 3755 3756 rc = kmem_alloc(sizeof (*rc), flags); 3757 if (rc != NULL) { 3758 rc->entries = NULL; 3759 rc->flags = RDDIR; 3760 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3761 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3762 rc->count = 1; 3763 #ifdef DEBUG 3764 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3765 #endif 3766 } 3767 return (rc); 3768 } 3769 3770 static void 3771 rddir_cache_free(rddir_cache *rc) 3772 { 3773 3774 #ifdef DEBUG 3775 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3776 #endif 3777 if (rc->entries != NULL) { 3778 #ifdef DEBUG 3779 rddir_cache_buf_free(rc->entries, rc->buflen); 3780 #else 3781 kmem_free(rc->entries, rc->buflen); 3782 #endif 3783 } 3784 cv_destroy(&rc->cv); 3785 mutex_destroy(&rc->lock); 3786 kmem_free(rc, sizeof (*rc)); 3787 } 3788 3789 void 3790 rddir_cache_hold(rddir_cache *rc) 3791 { 3792 3793 mutex_enter(&rc->lock); 3794 rc->count++; 3795 mutex_exit(&rc->lock); 3796 } 3797 3798 void 3799 rddir_cache_rele(rddir_cache *rc) 3800 { 3801 3802 mutex_enter(&rc->lock); 3803 ASSERT(rc->count > 0); 3804 if (--rc->count == 0) { 3805 mutex_exit(&rc->lock); 3806 rddir_cache_free(rc); 
3807 } else 3808 mutex_exit(&rc->lock); 3809 } 3810 3811 #ifdef DEBUG 3812 char * 3813 rddir_cache_buf_alloc(size_t size, int flags) 3814 { 3815 char *rc; 3816 3817 rc = kmem_alloc(size, flags); 3818 if (rc != NULL) 3819 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3820 return (rc); 3821 } 3822 3823 void 3824 rddir_cache_buf_free(void *addr, size_t size) 3825 { 3826 3827 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3828 kmem_free(addr, size); 3829 } 3830 #endif 3831 3832 static int 3833 nfs_free_data_reclaim(rnode_t *rp) 3834 { 3835 char *contents; 3836 int size; 3837 vsecattr_t *vsp; 3838 nfs3_pathconf_info *info; 3839 int freed; 3840 cred_t *cred; 3841 3842 /* 3843 * Free any held credentials and caches which 3844 * may be associated with this rnode. 3845 */ 3846 mutex_enter(&rp->r_statelock); 3847 cred = rp->r_cred; 3848 rp->r_cred = NULL; 3849 contents = rp->r_symlink.contents; 3850 size = rp->r_symlink.size; 3851 rp->r_symlink.contents = NULL; 3852 vsp = rp->r_secattr; 3853 rp->r_secattr = NULL; 3854 info = rp->r_pathconf; 3855 rp->r_pathconf = NULL; 3856 mutex_exit(&rp->r_statelock); 3857 3858 if (cred != NULL) 3859 crfree(cred); 3860 3861 /* 3862 * Free the access cache entries. 3863 */ 3864 freed = nfs_access_purge_rp(rp); 3865 3866 if (!HAVE_RDDIR_CACHE(rp) && 3867 contents == NULL && 3868 vsp == NULL && 3869 info == NULL) 3870 return (freed); 3871 3872 /* 3873 * Free the readdir cache entries 3874 */ 3875 if (HAVE_RDDIR_CACHE(rp)) 3876 nfs_purge_rddir_cache(RTOV(rp)); 3877 3878 /* 3879 * Free the symbolic link cache. 3880 */ 3881 if (contents != NULL) { 3882 3883 kmem_free((void *)contents, size); 3884 } 3885 3886 /* 3887 * Free any cached ACL. 3888 */ 3889 if (vsp != NULL) 3890 nfs_acl_free(vsp); 3891 3892 /* 3893 * Free any cached pathconf information. 
3894 */ 3895 if (info != NULL) 3896 kmem_free(info, sizeof (*info)); 3897 3898 return (1); 3899 } 3900 3901 static int 3902 nfs_active_data_reclaim(rnode_t *rp) 3903 { 3904 char *contents; 3905 int size; 3906 vsecattr_t *vsp; 3907 nfs3_pathconf_info *info; 3908 int freed; 3909 3910 /* 3911 * Free any held credentials and caches which 3912 * may be associated with this rnode. 3913 */ 3914 if (!mutex_tryenter(&rp->r_statelock)) 3915 return (0); 3916 contents = rp->r_symlink.contents; 3917 size = rp->r_symlink.size; 3918 rp->r_symlink.contents = NULL; 3919 vsp = rp->r_secattr; 3920 rp->r_secattr = NULL; 3921 info = rp->r_pathconf; 3922 rp->r_pathconf = NULL; 3923 mutex_exit(&rp->r_statelock); 3924 3925 /* 3926 * Free the access cache entries. 3927 */ 3928 freed = nfs_access_purge_rp(rp); 3929 3930 if (!HAVE_RDDIR_CACHE(rp) && 3931 contents == NULL && 3932 vsp == NULL && 3933 info == NULL) 3934 return (freed); 3935 3936 /* 3937 * Free the readdir cache entries 3938 */ 3939 if (HAVE_RDDIR_CACHE(rp)) 3940 nfs_purge_rddir_cache(RTOV(rp)); 3941 3942 /* 3943 * Free the symbolic link cache. 3944 */ 3945 if (contents != NULL) { 3946 3947 kmem_free((void *)contents, size); 3948 } 3949 3950 /* 3951 * Free any cached ACL. 3952 */ 3953 if (vsp != NULL) 3954 nfs_acl_free(vsp); 3955 3956 /* 3957 * Free any cached pathconf information. 
3958 */ 3959 if (info != NULL) 3960 kmem_free(info, sizeof (*info)); 3961 3962 return (1); 3963 } 3964 3965 static int 3966 nfs_free_reclaim(void) 3967 { 3968 int freed; 3969 rnode_t *rp; 3970 3971 #ifdef DEBUG 3972 clstat_debug.f_reclaim.value.ui64++; 3973 #endif 3974 freed = 0; 3975 mutex_enter(&rpfreelist_lock); 3976 rp = rpfreelist; 3977 if (rp != NULL) { 3978 do { 3979 if (nfs_free_data_reclaim(rp)) 3980 freed = 1; 3981 } while ((rp = rp->r_freef) != rpfreelist); 3982 } 3983 mutex_exit(&rpfreelist_lock); 3984 return (freed); 3985 } 3986 3987 static int 3988 nfs_active_reclaim(void) 3989 { 3990 int freed; 3991 int index; 3992 rnode_t *rp; 3993 3994 #ifdef DEBUG 3995 clstat_debug.a_reclaim.value.ui64++; 3996 #endif 3997 freed = 0; 3998 for (index = 0; index < rtablesize; index++) { 3999 rw_enter(&rtable[index].r_lock, RW_READER); 4000 for (rp = rtable[index].r_hashf; 4001 rp != (rnode_t *)(&rtable[index]); 4002 rp = rp->r_hashf) { 4003 if (nfs_active_data_reclaim(rp)) 4004 freed = 1; 4005 } 4006 rw_exit(&rtable[index].r_lock); 4007 } 4008 return (freed); 4009 } 4010 4011 static int 4012 nfs_rnode_reclaim(void) 4013 { 4014 int freed; 4015 rnode_t *rp; 4016 vnode_t *vp; 4017 4018 #ifdef DEBUG 4019 clstat_debug.r_reclaim.value.ui64++; 4020 #endif 4021 freed = 0; 4022 mutex_enter(&rpfreelist_lock); 4023 while ((rp = rpfreelist) != NULL) { 4024 rp_rmfree(rp); 4025 mutex_exit(&rpfreelist_lock); 4026 if (rp->r_flags & RHASHED) { 4027 vp = RTOV(rp); 4028 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4029 mutex_enter(&vp->v_lock); 4030 if (vp->v_count > 1) { 4031 vp->v_count--; 4032 mutex_exit(&vp->v_lock); 4033 rw_exit(&rp->r_hashq->r_lock); 4034 mutex_enter(&rpfreelist_lock); 4035 continue; 4036 } 4037 mutex_exit(&vp->v_lock); 4038 rp_rmhash_locked(rp); 4039 rw_exit(&rp->r_hashq->r_lock); 4040 } 4041 /* 4042 * This call to rp_addfree will end up destroying the 4043 * rnode, but in a safe way with the appropriate set 4044 * of checks done. 
4045 */ 4046 rp_addfree(rp, CRED()); 4047 mutex_enter(&rpfreelist_lock); 4048 } 4049 mutex_exit(&rpfreelist_lock); 4050 return (freed); 4051 } 4052 4053 /*ARGSUSED*/ 4054 static void 4055 nfs_reclaim(void *cdrarg) 4056 { 4057 4058 #ifdef DEBUG 4059 clstat_debug.reclaim.value.ui64++; 4060 #endif 4061 if (nfs_free_reclaim()) 4062 return; 4063 4064 if (nfs_active_reclaim()) 4065 return; 4066 4067 (void) nfs_rnode_reclaim(); 4068 } 4069 4070 /* 4071 * NFS client failover support 4072 * 4073 * Routines to copy filehandles 4074 */ 4075 void 4076 nfscopyfh(caddr_t fhp, vnode_t *vp) 4077 { 4078 fhandle_t *dest = (fhandle_t *)fhp; 4079 4080 if (dest != NULL) 4081 *dest = *VTOFH(vp); 4082 } 4083 4084 void 4085 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4086 { 4087 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4088 4089 if (dest != NULL) 4090 *dest = *VTOFH3(vp); 4091 } 4092 4093 /* 4094 * NFS client failover support 4095 * 4096 * failover_safe() will test various conditions to ensure that 4097 * failover is permitted for this vnode. It will be denied 4098 * if: 4099 * 1) the operation in progress does not support failover (NULL fi) 4100 * 2) there are no available replicas (NULL mi_servers->sv_next) 4101 * 3) any locks are outstanding on this file 4102 */ 4103 static int 4104 failover_safe(failinfo_t *fi) 4105 { 4106 4107 /* 4108 * Does this op permit failover? 4109 */ 4110 if (fi == NULL || fi->vp == NULL) 4111 return (0); 4112 4113 /* 4114 * Are there any alternates to failover to? 
4115 */ 4116 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4117 return (0); 4118 4119 /* 4120 * Disable check; we've forced local locking 4121 * 4122 * if (flk_has_remote_locks(fi->vp)) 4123 * return (0); 4124 */ 4125 4126 /* 4127 * If we have no partial path, we can't do anything 4128 */ 4129 if (VTOR(fi->vp)->r_path == NULL) 4130 return (0); 4131 4132 return (1); 4133 } 4134 4135 #include <sys/thread.h> 4136 4137 /* 4138 * NFS client failover support 4139 * 4140 * failover_newserver() will start a search for a new server, 4141 * preferably by starting an async thread to do the work. If 4142 * someone is already doing this (recognizable by MI_BINDINPROG 4143 * being set), it will simply return and the calling thread 4144 * will queue on the mi_failover_cv condition variable. 4145 */ 4146 static void 4147 failover_newserver(mntinfo_t *mi) 4148 { 4149 /* 4150 * Check if someone else is doing this already 4151 */ 4152 mutex_enter(&mi->mi_lock); 4153 if (mi->mi_flags & MI_BINDINPROG) { 4154 mutex_exit(&mi->mi_lock); 4155 return; 4156 } 4157 mi->mi_flags |= MI_BINDINPROG; 4158 4159 /* 4160 * Need to hold the vfs struct so that it can't be released 4161 * while the failover thread is selecting a new server. 4162 */ 4163 VFS_HOLD(mi->mi_vfsp); 4164 4165 /* 4166 * Start a thread to do the real searching. 4167 */ 4168 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4169 4170 mutex_exit(&mi->mi_lock); 4171 } 4172 4173 /* 4174 * NFS client failover support 4175 * 4176 * failover_thread() will find a new server to replace the one 4177 * currently in use, wake up other threads waiting on this mount 4178 * point, and die. It will start at the head of the server list 4179 * and poll servers until it finds one with an NFS server which is 4180 * registered and responds to a NULL procedure ping. 4181 * 4182 * XXX failover_thread is unsafe within the scope of the 4183 * present model defined for cpr to suspend the system. 
4184 * Specifically, over-the-wire calls made by the thread 4185 * are unsafe. The thread needs to be reevaluated in case of 4186 * future updates to the cpr suspend model. 4187 */ 4188 static void 4189 failover_thread(mntinfo_t *mi) 4190 { 4191 servinfo_t *svp = NULL; 4192 CLIENT *cl; 4193 enum clnt_stat status; 4194 struct timeval tv; 4195 int error; 4196 int oncethru = 0; 4197 callb_cpr_t cprinfo; 4198 rnode_t *rp; 4199 int index; 4200 char *srvnames; 4201 size_t srvnames_len; 4202 struct nfs_clnt *nfscl = NULL; 4203 zoneid_t zoneid = getzoneid(); 4204 4205 #ifdef DEBUG 4206 /* 4207 * This is currently only needed to access counters which exist on 4208 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4209 * on non-DEBUG kernels. 4210 */ 4211 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 4212 ASSERT(nfscl != NULL); 4213 #endif 4214 4215 /* 4216 * Its safe to piggyback on the mi_lock since failover_newserver() 4217 * code guarantees that there will be only one failover thread 4218 * per mountinfo at any instance. 4219 */ 4220 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4221 "failover_thread"); 4222 4223 mutex_enter(&mi->mi_lock); 4224 while (mi->mi_readers) { 4225 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4226 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4227 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4228 } 4229 mutex_exit(&mi->mi_lock); 4230 4231 tv.tv_sec = 2; 4232 tv.tv_usec = 0; 4233 4234 /* 4235 * Ping the null NFS procedure of every server in 4236 * the list until one responds. We always start 4237 * at the head of the list and always skip the one 4238 * that is current, since it's caused us a problem. 4239 */ 4240 while (svp == NULL) { 4241 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4242 if (!oncethru && svp == mi->mi_curr_serv) 4243 continue; 4244 4245 /* 4246 * If the file system was forcibly umounted 4247 * while trying to do a failover, then just 4248 * give up on the failover. 
It won't matter 4249 * what the server is. 4250 */ 4251 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4252 svp = NULL; 4253 goto done; 4254 } 4255 4256 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4257 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4258 if (error) 4259 continue; 4260 4261 if (!(mi->mi_flags & MI_INT)) 4262 cl->cl_nosignal = TRUE; 4263 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4264 xdr_void, NULL, tv); 4265 if (!(mi->mi_flags & MI_INT)) 4266 cl->cl_nosignal = FALSE; 4267 AUTH_DESTROY(cl->cl_auth); 4268 CLNT_DESTROY(cl); 4269 if (status == RPC_SUCCESS) { 4270 if (svp == mi->mi_curr_serv) { 4271 #ifdef DEBUG 4272 zcmn_err(zoneid, CE_NOTE, 4273 "NFS%d: failing over: selecting original server %s", 4274 mi->mi_vers, svp->sv_hostname); 4275 #else 4276 zcmn_err(zoneid, CE_NOTE, 4277 "NFS: failing over: selecting original server %s", 4278 svp->sv_hostname); 4279 #endif 4280 } else { 4281 #ifdef DEBUG 4282 zcmn_err(zoneid, CE_NOTE, 4283 "NFS%d: failing over from %s to %s", 4284 mi->mi_vers, 4285 mi->mi_curr_serv->sv_hostname, 4286 svp->sv_hostname); 4287 #else 4288 zcmn_err(zoneid, CE_NOTE, 4289 "NFS: failing over from %s to %s", 4290 mi->mi_curr_serv->sv_hostname, 4291 svp->sv_hostname); 4292 #endif 4293 } 4294 break; 4295 } 4296 } 4297 4298 if (svp == NULL) { 4299 if (!oncethru) { 4300 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4301 #ifdef DEBUG 4302 zprintf(zoneid, 4303 "NFS%d servers %s not responding " 4304 "still trying\n", mi->mi_vers, srvnames); 4305 #else 4306 zprintf(zoneid, "NFS servers %s not responding " 4307 "still trying\n", srvnames); 4308 #endif 4309 oncethru = 1; 4310 } 4311 mutex_enter(&mi->mi_lock); 4312 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4313 mutex_exit(&mi->mi_lock); 4314 delay(hz); 4315 mutex_enter(&mi->mi_lock); 4316 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4317 mutex_exit(&mi->mi_lock); 4318 } 4319 } 4320 4321 if (oncethru) { 4322 #ifdef DEBUG 4323 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4324 #else 
4325 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4326 #endif 4327 } 4328 4329 if (svp != mi->mi_curr_serv) { 4330 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4331 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4332 rw_enter(&rtable[index].r_lock, RW_WRITER); 4333 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4334 mi->mi_vfsp); 4335 if (rp != NULL) { 4336 if (rp->r_flags & RHASHED) 4337 rp_rmhash_locked(rp); 4338 rw_exit(&rtable[index].r_lock); 4339 rp->r_server = svp; 4340 rp->r_fh = svp->sv_fhandle; 4341 (void) nfs_free_data_reclaim(rp); 4342 index = rtablehash(&rp->r_fh); 4343 rp->r_hashq = &rtable[index]; 4344 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4345 vn_exists(RTOV(rp)); 4346 rp_addhash(rp); 4347 rw_exit(&rp->r_hashq->r_lock); 4348 VN_RELE(RTOV(rp)); 4349 } else 4350 rw_exit(&rtable[index].r_lock); 4351 } 4352 4353 done: 4354 if (oncethru) 4355 kmem_free(srvnames, srvnames_len); 4356 mutex_enter(&mi->mi_lock); 4357 mi->mi_flags &= ~MI_BINDINPROG; 4358 if (svp != NULL) { 4359 mi->mi_curr_serv = svp; 4360 mi->mi_failover++; 4361 #ifdef DEBUG 4362 nfscl->nfscl_stat.failover.value.ui64++; 4363 #endif 4364 } 4365 cv_broadcast(&mi->mi_failover_cv); 4366 CALLB_CPR_EXIT(&cprinfo); 4367 VFS_RELE(mi->mi_vfsp); 4368 zthread_exit(); 4369 /* NOTREACHED */ 4370 } 4371 4372 /* 4373 * NFS client failover support 4374 * 4375 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4376 * is cleared, meaning that failover is complete. Called with 4377 * mi_lock mutex held. 4378 */ 4379 static int 4380 failover_wait(mntinfo_t *mi) 4381 { 4382 k_sigset_t smask; 4383 4384 /* 4385 * If someone else is hunting for a living server, 4386 * sleep until it's done. After our sleep, we may 4387 * be bound to the right server and get off cheaply. 4388 */ 4389 while (mi->mi_flags & MI_BINDINPROG) { 4390 /* 4391 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4392 * and SIGTERM. (Preserving the existing masks). 
4393 * Mask out SIGINT if mount option nointr is specified. 4394 */ 4395 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4396 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4397 /* 4398 * restore original signal mask 4399 */ 4400 sigunintr(&smask); 4401 return (EINTR); 4402 } 4403 /* 4404 * restore original signal mask 4405 */ 4406 sigunintr(&smask); 4407 } 4408 return (0); 4409 } 4410 4411 /* 4412 * NFS client failover support 4413 * 4414 * failover_remap() will do a partial pathname lookup and find the 4415 * desired vnode on the current server. The interim vnode will be 4416 * discarded after we pilfer the new filehandle. 4417 * 4418 * Side effects: 4419 * - This routine will also update the filehandle in the args structure 4420 * pointed to by the fi->fhp pointer if it is non-NULL. 4421 */ 4422 4423 static int 4424 failover_remap(failinfo_t *fi) 4425 { 4426 vnode_t *vp, *nvp, *rootvp; 4427 rnode_t *rp, *nrp; 4428 mntinfo_t *mi; 4429 int error; 4430 int index; 4431 #ifdef DEBUG 4432 struct nfs_clnt *nfscl; 4433 4434 nfscl = zone_getspecific(nfsclnt_zone_key, curproc->p_zone); 4435 ASSERT(nfscl != NULL); 4436 #endif 4437 /* 4438 * Sanity check 4439 */ 4440 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4441 return (EINVAL); 4442 vp = fi->vp; 4443 rp = VTOR(vp); 4444 mi = VTOMI(vp); 4445 4446 if (!(vp->v_flag & VROOT)) { 4447 /* 4448 * Given the root fh, use the path stored in 4449 * the rnode to find the fh for the new server. 4450 */ 4451 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4452 if (error) 4453 return (error); 4454 4455 error = failover_lookup(rp->r_path, rootvp, 4456 fi->lookupproc, fi->xattrdirproc, &nvp); 4457 4458 VN_RELE(rootvp); 4459 4460 if (error) 4461 return (error); 4462 4463 /* 4464 * If we found the same rnode, we're done now 4465 */ 4466 if (nvp == vp) { 4467 /* 4468 * Failed and the new server may physically be same 4469 * OR may share a same disk subsystem. 
In this case 4470 * file handle for a particular file path is not going 4471 * to change, given the same filehandle lookup will 4472 * always locate the same rnode as the existing one. 4473 * All we might need to do is to update the r_server 4474 * with the current servinfo. 4475 */ 4476 if (!VALID_FH(fi)) { 4477 rp->r_server = mi->mi_curr_serv; 4478 } 4479 VN_RELE(nvp); 4480 return (0); 4481 } 4482 4483 /* 4484 * Try to make it so that no one else will find this 4485 * vnode because it is just a temporary to hold the 4486 * new file handle until that file handle can be 4487 * copied to the original vnode/rnode. 4488 */ 4489 nrp = VTOR(nvp); 4490 if (nrp->r_flags & RHASHED) 4491 rp_rmhash(nrp); 4492 4493 /* 4494 * As a heuristic check on the validity of the new 4495 * file, check that the size and type match against 4496 * that we remember from the old version. 4497 */ 4498 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4499 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 4500 "NFS replicas %s and %s: file %s not same.", 4501 rp->r_server->sv_hostname, 4502 nrp->r_server->sv_hostname, rp->r_path); 4503 VN_RELE(nvp); 4504 return (EINVAL); 4505 } 4506 4507 /* 4508 * snarf the filehandle from the new rnode 4509 * then release it, again while updating the 4510 * hash queues for the rnode. 4511 */ 4512 if (rp->r_flags & RHASHED) 4513 rp_rmhash(rp); 4514 rp->r_server = mi->mi_curr_serv; 4515 rp->r_fh = nrp->r_fh; 4516 index = rtablehash(&rp->r_fh); 4517 rp->r_hashq = &rtable[index]; 4518 /* 4519 * Copy the attributes from the new rnode to the old 4520 * rnode. This will help to reduce unnecessary page 4521 * cache flushes. 
4522 */ 4523 rp->r_attr = nrp->r_attr; 4524 rp->r_attrtime = nrp->r_attrtime; 4525 rp->r_mtime = nrp->r_mtime; 4526 (void) nfs_free_data_reclaim(rp); 4527 nfs_setswaplike(vp, &rp->r_attr); 4528 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4529 rp_addhash(rp); 4530 rw_exit(&rp->r_hashq->r_lock); 4531 VN_RELE(nvp); 4532 } 4533 4534 /* 4535 * Update successful failover remap count 4536 */ 4537 mutex_enter(&mi->mi_lock); 4538 mi->mi_remap++; 4539 mutex_exit(&mi->mi_lock); 4540 #ifdef DEBUG 4541 nfscl->nfscl_stat.remap.value.ui64++; 4542 #endif 4543 4544 /* 4545 * If we have a copied filehandle to update, do it now. 4546 */ 4547 if (fi->fhp != NULL && fi->copyproc != NULL) 4548 (*fi->copyproc)(fi->fhp, vp); 4549 4550 return (0); 4551 } 4552 4553 /* 4554 * NFS client failover support 4555 * 4556 * We want a simple pathname lookup routine to parse the pieces 4557 * of path in rp->r_path. We know that the path was a created 4558 * as rnodes were made, so we know we have only to deal with 4559 * paths that look like: 4560 * dir1/dir2/dir3/file 4561 * Any evidence of anything like .., symlinks, and ENOTDIR 4562 * are hard errors, because they mean something in this filesystem 4563 * is different from the one we came from, or has changed under 4564 * us in some way. If this is true, we want the failure. 4565 * 4566 * Extended attributes: if the filesystem is mounted with extended 4567 * attributes enabled (-o xattr), the attribute directory will be 4568 * represented in the r_path as the magic name XATTR_RPATH. So if 4569 * we see that name in the pathname, is must be because this node 4570 * is an extended attribute. Therefore, look it up that way. 
4571 */ 4572 static int 4573 failover_lookup(char *path, vnode_t *root, 4574 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4575 vnode_t *, cred_t *, int), 4576 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4577 vnode_t **new) 4578 { 4579 vnode_t *dvp, *nvp; 4580 int error = EINVAL; 4581 char *s, *p, *tmppath; 4582 size_t len; 4583 mntinfo_t *mi; 4584 bool_t xattr; 4585 4586 /* Make local copy of path */ 4587 len = strlen(path) + 1; 4588 tmppath = kmem_alloc(len, KM_SLEEP); 4589 (void) strcpy(tmppath, path); 4590 s = tmppath; 4591 4592 dvp = root; 4593 VN_HOLD(dvp); 4594 mi = VTOMI(root); 4595 xattr = mi->mi_flags & MI_EXTATTR; 4596 4597 do { 4598 p = strchr(s, '/'); 4599 if (p != NULL) 4600 *p = '\0'; 4601 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4602 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4603 RFSCALL_SOFT); 4604 } else { 4605 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4606 CRED(), RFSCALL_SOFT); 4607 } 4608 if (p != NULL) 4609 *p++ = '/'; 4610 if (error) { 4611 VN_RELE(dvp); 4612 kmem_free(tmppath, len); 4613 return (error); 4614 } 4615 s = p; 4616 VN_RELE(dvp); 4617 dvp = nvp; 4618 } while (p != NULL); 4619 4620 if (nvp != NULL && new != NULL) 4621 *new = nvp; 4622 kmem_free(tmppath, len); 4623 return (0); 4624 } 4625 4626 /* 4627 * NFS client failover support 4628 * 4629 * sv_free() frees the malloc'd portion of a "servinfo_t". 
4630 */ 4631 void 4632 sv_free(servinfo_t *svp) 4633 { 4634 servinfo_t *next; 4635 struct knetconfig *knconf; 4636 4637 while (svp != NULL) { 4638 next = svp->sv_next; 4639 if (svp->sv_secdata) 4640 sec_clnt_freeinfo(svp->sv_secdata); 4641 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4642 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4643 knconf = svp->sv_knconf; 4644 if (knconf != NULL) { 4645 if (knconf->knc_protofmly != NULL) 4646 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4647 if (knconf->knc_proto != NULL) 4648 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4649 kmem_free(knconf, sizeof (*knconf)); 4650 } 4651 knconf = svp->sv_origknconf; 4652 if (knconf != NULL) { 4653 if (knconf->knc_protofmly != NULL) 4654 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4655 if (knconf->knc_proto != NULL) 4656 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4657 kmem_free(knconf, sizeof (*knconf)); 4658 } 4659 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4660 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4661 mutex_destroy(&svp->sv_lock); 4662 kmem_free(svp, sizeof (*svp)); 4663 svp = next; 4664 } 4665 } 4666 4667 /* 4668 * Only can return non-zero if intr != 0. 4669 */ 4670 int 4671 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4672 { 4673 4674 mutex_enter(&l->lock); 4675 4676 /* 4677 * If this is a nested enter, then allow it. There 4678 * must be as many exits as enters through. 4679 */ 4680 if (l->owner == curthread) { 4681 /* lock is held for writing by current thread */ 4682 ASSERT(rw == RW_READER || rw == RW_WRITER); 4683 l->count--; 4684 } else if (rw == RW_READER) { 4685 /* 4686 * While there is a writer active or writers waiting, 4687 * then wait for them to finish up and move on. Then, 4688 * increment the count to indicate that a reader is 4689 * active. 
4690 */ 4691 while (l->count < 0 || l->waiters > 0) { 4692 if (intr) { 4693 klwp_t *lwp = ttolwp(curthread); 4694 4695 if (lwp != NULL) 4696 lwp->lwp_nostop++; 4697 if (!cv_wait_sig(&l->cv, &l->lock)) { 4698 if (lwp != NULL) 4699 lwp->lwp_nostop--; 4700 mutex_exit(&l->lock); 4701 return (EINTR); 4702 } 4703 if (lwp != NULL) 4704 lwp->lwp_nostop--; 4705 } else 4706 cv_wait(&l->cv, &l->lock); 4707 } 4708 ASSERT(l->count < INT_MAX); 4709 #ifdef DEBUG 4710 if ((l->count % 10000) == 9999) 4711 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on" 4712 "rwlock @ %p\n", l->count, (void *)&l); 4713 #endif 4714 l->count++; 4715 } else { 4716 ASSERT(rw == RW_WRITER); 4717 /* 4718 * While there are readers active or a writer 4719 * active, then wait for all of the readers 4720 * to finish or for the writer to finish. 4721 * Then, set the owner field to curthread and 4722 * decrement count to indicate that a writer 4723 * is active. 4724 */ 4725 while (l->count > 0 || l->owner != NULL) { 4726 l->waiters++; 4727 if (intr) { 4728 klwp_t *lwp = ttolwp(curthread); 4729 4730 if (lwp != NULL) 4731 lwp->lwp_nostop++; 4732 if (!cv_wait_sig(&l->cv, &l->lock)) { 4733 if (lwp != NULL) 4734 lwp->lwp_nostop--; 4735 l->waiters--; 4736 cv_broadcast(&l->cv); 4737 mutex_exit(&l->lock); 4738 return (EINTR); 4739 } 4740 if (lwp != NULL) 4741 lwp->lwp_nostop--; 4742 } else 4743 cv_wait(&l->cv, &l->lock); 4744 l->waiters--; 4745 } 4746 l->owner = curthread; 4747 l->count--; 4748 } 4749 4750 mutex_exit(&l->lock); 4751 4752 return (0); 4753 } 4754 4755 /* 4756 * If the lock is available, obtain it and return non-zero. If there is 4757 * already a conflicting lock, return 0 immediately. 4758 */ 4759 4760 int 4761 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4762 { 4763 mutex_enter(&l->lock); 4764 4765 /* 4766 * If this is a nested enter, then allow it. There 4767 * must be as many exits as enters through. 
4768 */ 4769 if (l->owner == curthread) { 4770 /* lock is held for writing by current thread */ 4771 ASSERT(rw == RW_READER || rw == RW_WRITER); 4772 l->count--; 4773 } else if (rw == RW_READER) { 4774 /* 4775 * If there is a writer active or writers waiting, deny the 4776 * lock. Otherwise, bump the count of readers. 4777 */ 4778 if (l->count < 0 || l->waiters > 0) { 4779 mutex_exit(&l->lock); 4780 return (0); 4781 } 4782 l->count++; 4783 } else { 4784 ASSERT(rw == RW_WRITER); 4785 /* 4786 * If there are readers active or a writer active, deny the 4787 * lock. Otherwise, set the owner field to curthread and 4788 * decrement count to indicate that a writer is active. 4789 */ 4790 if (l->count > 0 || l->owner != NULL) { 4791 mutex_exit(&l->lock); 4792 return (0); 4793 } 4794 l->owner = curthread; 4795 l->count--; 4796 } 4797 4798 mutex_exit(&l->lock); 4799 4800 return (1); 4801 } 4802 4803 void 4804 nfs_rw_exit(nfs_rwlock_t *l) 4805 { 4806 4807 mutex_enter(&l->lock); 4808 /* 4809 * If this is releasing a writer lock, then increment count to 4810 * indicate that there is one less writer active. If this was 4811 * the last of possibly nested writer locks, then clear the owner 4812 * field as well to indicate that there is no writer active 4813 * and wakeup any possible waiting writers or readers. 4814 * 4815 * If releasing a reader lock, then just decrement count to 4816 * indicate that there is one less reader active. If this was 4817 * the last active reader and there are writer(s) waiting, 4818 * then wake up the first. 
4819 */ 4820 if (l->owner != NULL) { 4821 ASSERT(l->owner == curthread); 4822 l->count++; 4823 if (l->count == 0) { 4824 l->owner = NULL; 4825 cv_broadcast(&l->cv); 4826 } 4827 } else { 4828 ASSERT(l->count > 0); 4829 l->count--; 4830 if (l->count == 0 && l->waiters > 0) 4831 cv_broadcast(&l->cv); 4832 } 4833 mutex_exit(&l->lock); 4834 } 4835 4836 int 4837 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4838 { 4839 4840 if (rw == RW_READER) 4841 return (l->count > 0); 4842 ASSERT(rw == RW_WRITER); 4843 return (l->count < 0); 4844 } 4845 4846 /* ARGSUSED */ 4847 void 4848 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4849 { 4850 4851 l->count = 0; 4852 l->waiters = 0; 4853 l->owner = NULL; 4854 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4855 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4856 } 4857 4858 void 4859 nfs_rw_destroy(nfs_rwlock_t *l) 4860 { 4861 4862 mutex_destroy(&l->lock); 4863 cv_destroy(&l->cv); 4864 } 4865 4866 int 4867 nfs3_rddir_compar(const void *x, const void *y) 4868 { 4869 rddir_cache *a = (rddir_cache *)x; 4870 rddir_cache *b = (rddir_cache *)y; 4871 4872 if (a->nfs3_cookie == b->nfs3_cookie) { 4873 if (a->buflen == b->buflen) 4874 return (0); 4875 if (a->buflen < b->buflen) 4876 return (-1); 4877 return (1); 4878 } 4879 4880 if (a->nfs3_cookie < b->nfs3_cookie) 4881 return (-1); 4882 4883 return (1); 4884 } 4885 4886 int 4887 nfs_rddir_compar(const void *x, const void *y) 4888 { 4889 rddir_cache *a = (rddir_cache *)x; 4890 rddir_cache *b = (rddir_cache *)y; 4891 4892 if (a->nfs_cookie == b->nfs_cookie) { 4893 if (a->buflen == b->buflen) 4894 return (0); 4895 if (a->buflen < b->buflen) 4896 return (-1); 4897 return (1); 4898 } 4899 4900 if (a->nfs_cookie < b->nfs_cookie) 4901 return (-1); 4902 4903 return (1); 4904 } 4905 4906 static char * 4907 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 4908 { 4909 servinfo_t *s; 4910 char *srvnames; 4911 char *namep; 4912 size_t length; 4913 4914 /* 4915 * Calculate the length of the 
string required to hold all 4916 * of the server names plus either a comma or a null 4917 * character following each individual one. 4918 */ 4919 length = 0; 4920 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 4921 length += s->sv_hostnamelen; 4922 4923 srvnames = kmem_alloc(length, KM_SLEEP); 4924 4925 namep = srvnames; 4926 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 4927 (void) strcpy(namep, s->sv_hostname); 4928 namep += s->sv_hostnamelen - 1; 4929 *namep++ = ','; 4930 } 4931 *--namep = '\0'; 4932 4933 *len = length; 4934 4935 return (srvnames); 4936 } 4937