1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. 26 * All rights reserved. 27 */ 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/cred_impl.h> 35 #include <sys/proc.h> 36 #include <sys/user.h> 37 #include <sys/time.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/socket.h> 42 #include <sys/uio.h> 43 #include <sys/tiuser.h> 44 #include <sys/swap.h> 45 #include <sys/errno.h> 46 #include <sys/debug.h> 47 #include <sys/kmem.h> 48 #include <sys/kstat.h> 49 #include <sys/cmn_err.h> 50 #include <sys/vtrace.h> 51 #include <sys/session.h> 52 #include <sys/dnlc.h> 53 #include <sys/bitmap.h> 54 #include <sys/acl.h> 55 #include <sys/ddi.h> 56 #include <sys/pathname.h> 57 #include <sys/flock.h> 58 #include <sys/dirent.h> 59 #include <sys/flock.h> 60 #include <sys/callb.h> 61 #include <sys/atomic.h> 62 #include <sys/list.h> 63 #include <sys/tsol/tnet.h> 64 #include <sys/priv.h> 65 66 #include <inet/ip6.h> 67 68 #include <rpc/types.h> 69 #include <rpc/xdr.h> 70 #include <rpc/auth.h> 71 #include <rpc/clnt.h> 72 73 #include <nfs/nfs.h> 74 #include <nfs/nfs4.h> 75 #include <nfs/nfs_clnt.h> 76 #include <nfs/rnode.h> 77 #include <nfs/nfs_acl.h> 78 79 /* 80 * The hash queues for the access to active and cached rnodes 81 * are organized as doubly linked lists. A reader/writer lock 82 * for each hash bucket is used to control access and to synchronize 83 * lookups, additions, and deletions from the hash queue. 84 * 85 * The rnode freelist is organized as a doubly linked list with 86 * a head pointer. Additions and deletions are synchronized via 87 * a single mutex. 88 * 89 * In order to add an rnode to the free list, it must be hashed into 90 * a hash queue and the exclusive lock to the hash queue be held. 91 * If an rnode is not hashed into a hash queue, then it is destroyed 92 * because it represents no valuable information that can be reused 93 * about the file. The exclusive lock to the hash queue must be 94 * held in order to prevent a lookup in the hash queue from finding 95 * the rnode and using it and assuming that the rnode is not on the 96 * freelist. The lookup in the hash queue will have the hash queue 97 * locked, either exclusive or shared. 98 * 99 * The vnode reference count for each rnode is not allowed to drop 100 * below 1. This prevents external entities, such as the VM 101 * subsystem, from acquiring references to vnodes already on the 102 * freelist and then trying to place them back on the freelist 103 * when their reference is released. This means that the when an 104 * rnode is looked up in the hash queues, then either the rnode 105 * is removed from the freelist and that reference is tranfered to 106 * the new reference or the vnode reference count must be incremented 107 * accordingly. The mutex for the freelist must be held in order to 108 * accurately test to see if the rnode is on the freelist or not. 109 * The hash queue lock might be held shared and it is possible that 110 * two different threads may race to remove the rnode from the 111 * freelist. This race can be resolved by holding the mutex for the 112 * freelist. Please note that the mutex for the freelist does not 113 * need to held if the rnode is not on the freelist. It can not be 114 * placed on the freelist due to the requirement that the thread 115 * putting the rnode on the freelist must hold the exclusive lock 116 * to the hash queue and the thread doing the lookup in the hash 117 * queue is holding either a shared or exclusive lock to the hash 118 * queue. 119 * 120 * The lock ordering is: 121 * 122 * hash bucket lock -> vnode lock 123 * hash bucket lock -> freelist lock 124 */ 125 static rhashq_t *rtable; 126 127 static kmutex_t rpfreelist_lock; 128 static rnode_t *rpfreelist = NULL; 129 static long rnew = 0; 130 long nrnode = 0; 131 132 static int rtablesize; 133 static int rtablemask; 134 135 static int hashlen = 4; 136 137 static struct kmem_cache *rnode_cache; 138 139 /* 140 * Mutex to protect the following variables: 141 * nfs_major 142 * nfs_minor 143 */ 144 kmutex_t nfs_minor_lock; 145 int nfs_major; 146 int nfs_minor; 147 148 /* Do we allow preepoch (negative) time values otw? */ 149 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */ 150 151 /* 152 * Access cache 153 */ 154 static acache_hash_t *acache; 155 static long nacache; /* used strictly to size the number of hash queues */ 156 157 static int acachesize; 158 static int acachemask; 159 static struct kmem_cache *acache_cache; 160 161 /* 162 * Client side utilities 163 */ 164 165 /* 166 * client side statistics 167 */ 168 static const struct clstat clstat_tmpl = { 169 { "calls", KSTAT_DATA_UINT64 }, 170 { "badcalls", KSTAT_DATA_UINT64 }, 171 { "clgets", KSTAT_DATA_UINT64 }, 172 { "cltoomany", KSTAT_DATA_UINT64 }, 173 #ifdef DEBUG 174 { "clalloc", KSTAT_DATA_UINT64 }, 175 { "noresponse", KSTAT_DATA_UINT64 }, 176 { "failover", KSTAT_DATA_UINT64 }, 177 { "remap", KSTAT_DATA_UINT64 }, 178 #endif 179 }; 180 181 /* 182 * The following are statistics that describe behavior of the system as a whole 183 * and doesn't correspond to any one particular zone. 184 */ 185 #ifdef DEBUG 186 static struct clstat_debug { 187 kstat_named_t nrnode; /* number of allocated rnodes */ 188 kstat_named_t access; /* size of access cache */ 189 kstat_named_t dirent; /* size of readdir cache */ 190 kstat_named_t dirents; /* size of readdir buf cache */ 191 kstat_named_t reclaim; /* number of reclaims */ 192 kstat_named_t clreclaim; /* number of cl reclaims */ 193 kstat_named_t f_reclaim; /* number of free reclaims */ 194 kstat_named_t a_reclaim; /* number of active reclaims */ 195 kstat_named_t r_reclaim; /* number of rnode reclaims */ 196 kstat_named_t rpath; /* bytes used to store rpaths */ 197 } clstat_debug = { 198 { "nrnode", KSTAT_DATA_UINT64 }, 199 { "access", KSTAT_DATA_UINT64 }, 200 { "dirent", KSTAT_DATA_UINT64 }, 201 { "dirents", KSTAT_DATA_UINT64 }, 202 { "reclaim", KSTAT_DATA_UINT64 }, 203 { "clreclaim", KSTAT_DATA_UINT64 }, 204 { "f_reclaim", KSTAT_DATA_UINT64 }, 205 { "a_reclaim", KSTAT_DATA_UINT64 }, 206 { "r_reclaim", KSTAT_DATA_UINT64 }, 207 { "r_path", KSTAT_DATA_UINT64 }, 208 }; 209 #endif /* DEBUG */ 210 211 /* 212 * We keep a global list of per-zone client data, so we can clean up all zones 213 * if we get low on memory. 214 */ 215 static list_t nfs_clnt_list; 216 static kmutex_t nfs_clnt_list_lock; 217 static zone_key_t nfsclnt_zone_key; 218 219 static struct kmem_cache *chtab_cache; 220 221 /* 222 * Some servers do not properly update the attributes of the 223 * directory when changes are made. To allow interoperability 224 * with these broken servers, the nfs_disable_rddir_cache 225 * parameter must be set in /etc/system 226 */ 227 int nfs_disable_rddir_cache = 0; 228 229 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 230 struct chtab **); 231 void clfree(CLIENT *, struct chtab *); 232 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 233 struct chtab **, struct nfs_clnt *); 234 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 235 struct chtab **, struct nfs_clnt *); 236 static void clreclaim(void *); 237 static int nfs_feedback(int, int, mntinfo_t *); 238 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 239 caddr_t, cred_t *, int *, enum clnt_stat *, int, 240 failinfo_t *); 241 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 242 caddr_t, cred_t *, int *, int, failinfo_t *); 243 static void rinactive(rnode_t *, cred_t *); 244 static int rtablehash(nfs_fhandle *); 245 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 246 struct vnodeops *, 247 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 248 cred_t *), 249 int (*)(const void *, const void *), int *, cred_t *, 250 char *, char *); 251 static void rp_rmfree(rnode_t *); 252 static void rp_addhash(rnode_t *); 253 static void rp_rmhash_locked(rnode_t *); 254 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 255 static void destroy_rnode(rnode_t *); 256 static void rddir_cache_free(rddir_cache *); 257 static int nfs_free_data_reclaim(rnode_t *); 258 static int nfs_active_data_reclaim(rnode_t *); 259 static int nfs_free_reclaim(void); 260 static int nfs_active_reclaim(void); 261 static int nfs_rnode_reclaim(void); 262 static void nfs_reclaim(void *); 263 static int failover_safe(failinfo_t *); 264 static void failover_newserver(mntinfo_t *mi); 265 static void failover_thread(mntinfo_t *mi); 266 static int failover_wait(mntinfo_t *); 267 static int failover_remap(failinfo_t *); 268 static int failover_lookup(char *, vnode_t *, 269 int (*)(vnode_t *, char *, vnode_t **, 270 struct pathname *, int, vnode_t *, cred_t *, int), 271 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 272 vnode_t **); 273 static void nfs_free_r_path(rnode_t *); 274 static void nfs_set_vroot(vnode_t *); 275 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 276 277 /* 278 * from rpcsec module (common/rpcsec) 279 */ 280 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 281 extern void sec_clnt_freeh(AUTH *); 282 extern void sec_clnt_freeinfo(struct sec_data *); 283 284 /* 285 * used in mount policy 286 */ 287 extern ts_label_t *getflabel_cipso(vfs_t *); 288 289 /* 290 * EIO or EINTR are not recoverable errors. 291 */ 292 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 293 294 /* 295 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 296 */ 297 static int 298 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 299 struct chtab **chp, struct nfs_clnt *nfscl) 300 { 301 struct chhead *ch, *newch; 302 struct chhead **plistp; 303 struct chtab *cp; 304 int error; 305 k_sigset_t smask; 306 307 if (newcl == NULL || chp == NULL || ci == NULL) 308 return (EINVAL); 309 310 *newcl = NULL; 311 *chp = NULL; 312 313 /* 314 * Find an unused handle or create one 315 */ 316 newch = NULL; 317 nfscl->nfscl_stat.clgets.value.ui64++; 318 top: 319 /* 320 * Find the correct entry in the cache to check for free 321 * client handles. The search is based on the RPC program 322 * number, program version number, dev_t for the transport 323 * device, and the protocol family. 324 */ 325 mutex_enter(&nfscl->nfscl_chtable_lock); 326 plistp = &nfscl->nfscl_chtable; 327 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 328 if (ch->ch_prog == ci->cl_prog && 329 ch->ch_vers == ci->cl_vers && 330 ch->ch_dev == svp->sv_knconf->knc_rdev && 331 (strcmp(ch->ch_protofmly, 332 svp->sv_knconf->knc_protofmly) == 0)) 333 break; 334 plistp = &ch->ch_next; 335 } 336 337 /* 338 * If we didn't find a cache entry for this quadruple, then 339 * create one. If we don't have one already preallocated, 340 * then drop the cache lock, create one, and then start over. 341 * If we did have a preallocated entry, then just add it to 342 * the front of the list. 343 */ 344 if (ch == NULL) { 345 if (newch == NULL) { 346 mutex_exit(&nfscl->nfscl_chtable_lock); 347 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 348 newch->ch_timesused = 0; 349 newch->ch_prog = ci->cl_prog; 350 newch->ch_vers = ci->cl_vers; 351 newch->ch_dev = svp->sv_knconf->knc_rdev; 352 newch->ch_protofmly = kmem_alloc( 353 strlen(svp->sv_knconf->knc_protofmly) + 1, 354 KM_SLEEP); 355 (void) strcpy(newch->ch_protofmly, 356 svp->sv_knconf->knc_protofmly); 357 newch->ch_list = NULL; 358 goto top; 359 } 360 ch = newch; 361 newch = NULL; 362 ch->ch_next = nfscl->nfscl_chtable; 363 nfscl->nfscl_chtable = ch; 364 /* 365 * We found a cache entry, but if it isn't on the front of the 366 * list, then move it to the front of the list to try to take 367 * advantage of locality of operations. 368 */ 369 } else if (ch != nfscl->nfscl_chtable) { 370 *plistp = ch->ch_next; 371 ch->ch_next = nfscl->nfscl_chtable; 372 nfscl->nfscl_chtable = ch; 373 } 374 375 /* 376 * If there was a free client handle cached, then remove it 377 * from the list, init it, and use it. 378 */ 379 if (ch->ch_list != NULL) { 380 cp = ch->ch_list; 381 ch->ch_list = cp->ch_list; 382 mutex_exit(&nfscl->nfscl_chtable_lock); 383 if (newch != NULL) { 384 kmem_free(newch->ch_protofmly, 385 strlen(newch->ch_protofmly) + 1); 386 kmem_free(newch, sizeof (*newch)); 387 } 388 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 389 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 390 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 391 &cp->ch_client->cl_auth); 392 if (error || cp->ch_client->cl_auth == NULL) { 393 CLNT_DESTROY(cp->ch_client); 394 kmem_cache_free(chtab_cache, cp); 395 return ((error != 0) ? error : EINTR); 396 } 397 ch->ch_timesused++; 398 *newcl = cp->ch_client; 399 *chp = cp; 400 return (0); 401 } 402 403 /* 404 * There weren't any free client handles which fit, so allocate 405 * a new one and use that. 406 */ 407 #ifdef DEBUG 408 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 409 #endif 410 mutex_exit(&nfscl->nfscl_chtable_lock); 411 412 nfscl->nfscl_stat.cltoomany.value.ui64++; 413 if (newch != NULL) { 414 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 415 kmem_free(newch, sizeof (*newch)); 416 } 417 418 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 419 cp->ch_head = ch; 420 421 sigintr(&smask, (int)ci->cl_flags & MI_INT); 422 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 423 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 424 sigunintr(&smask); 425 426 if (error != 0) { 427 kmem_cache_free(chtab_cache, cp); 428 #ifdef DEBUG 429 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 430 #endif 431 /* 432 * Warning is unnecessary if error is EINTR. 433 */ 434 if (error != EINTR) { 435 nfs_cmn_err(error, CE_WARN, 436 "clget: couldn't create handle: %m\n"); 437 } 438 return (error); 439 } 440 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 441 auth_destroy(cp->ch_client->cl_auth); 442 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 443 &cp->ch_client->cl_auth); 444 if (error || cp->ch_client->cl_auth == NULL) { 445 CLNT_DESTROY(cp->ch_client); 446 kmem_cache_free(chtab_cache, cp); 447 #ifdef DEBUG 448 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 449 #endif 450 return ((error != 0) ? error : EINTR); 451 } 452 ch->ch_timesused++; 453 *newcl = cp->ch_client; 454 ASSERT(cp->ch_client->cl_nosignal == FALSE); 455 *chp = cp; 456 return (0); 457 } 458 459 int 460 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 461 struct chtab **chp) 462 { 463 struct nfs_clnt *nfscl; 464 465 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 466 ASSERT(nfscl != NULL); 467 468 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 469 } 470 471 static int 472 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 473 struct chtab **chp, struct nfs_clnt *nfscl) 474 { 475 clinfo_t ci; 476 int error; 477 478 /* 479 * Set read buffer size to rsize 480 * and add room for RPC headers. 481 */ 482 ci.cl_readsize = mi->mi_tsize; 483 if (ci.cl_readsize != 0) 484 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 485 486 /* 487 * If soft mount and server is down just try once. 488 * meaning: do not retransmit. 489 */ 490 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 491 ci.cl_retrans = 0; 492 else 493 ci.cl_retrans = mi->mi_retrans; 494 495 ci.cl_prog = NFS_ACL_PROGRAM; 496 ci.cl_vers = mi->mi_vers; 497 ci.cl_flags = mi->mi_flags; 498 499 /* 500 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 501 * security flavor, the client tries to establish a security context 502 * by contacting the server. If the connection is timed out or reset, 503 * e.g. server reboot, we will try again. 504 */ 505 do { 506 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 507 508 if (error == 0) 509 break; 510 511 /* 512 * For forced unmount or zone shutdown, bail out, no retry. 513 */ 514 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 515 error = EIO; 516 break; 517 } 518 519 /* do not retry for softmount */ 520 if (!(mi->mi_flags & MI_HARD)) 521 break; 522 523 /* let the caller deal with the failover case */ 524 if (FAILOVER_MOUNT(mi)) 525 break; 526 527 } while (error == ETIMEDOUT || error == ECONNRESET); 528 529 return (error); 530 } 531 532 static int 533 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 534 struct chtab **chp, struct nfs_clnt *nfscl) 535 { 536 clinfo_t ci; 537 int error; 538 539 /* 540 * Set read buffer size to rsize 541 * and add room for RPC headers. 542 */ 543 ci.cl_readsize = mi->mi_tsize; 544 if (ci.cl_readsize != 0) 545 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 546 547 /* 548 * If soft mount and server is down just try once. 549 * meaning: do not retransmit. 550 */ 551 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 552 ci.cl_retrans = 0; 553 else 554 ci.cl_retrans = mi->mi_retrans; 555 556 ci.cl_prog = mi->mi_prog; 557 ci.cl_vers = mi->mi_vers; 558 ci.cl_flags = mi->mi_flags; 559 560 /* 561 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 562 * security flavor, the client tries to establish a security context 563 * by contacting the server. If the connection is timed out or reset, 564 * e.g. server reboot, we will try again. 565 */ 566 do { 567 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 568 569 if (error == 0) 570 break; 571 572 /* 573 * For forced unmount or zone shutdown, bail out, no retry. 574 */ 575 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 576 error = EIO; 577 break; 578 } 579 580 /* do not retry for softmount */ 581 if (!(mi->mi_flags & MI_HARD)) 582 break; 583 584 /* let the caller deal with the failover case */ 585 if (FAILOVER_MOUNT(mi)) 586 break; 587 588 } while (error == ETIMEDOUT || error == ECONNRESET); 589 590 return (error); 591 } 592 593 static void 594 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 595 { 596 if (cl->cl_auth != NULL) { 597 sec_clnt_freeh(cl->cl_auth); 598 cl->cl_auth = NULL; 599 } 600 601 /* 602 * Timestamp this cache entry so that we know when it was last 603 * used. 604 */ 605 cp->ch_freed = gethrestime_sec(); 606 607 /* 608 * Add the free client handle to the front of the list. 609 * This way, the list will be sorted in youngest to oldest 610 * order. 611 */ 612 mutex_enter(&nfscl->nfscl_chtable_lock); 613 cp->ch_list = cp->ch_head->ch_list; 614 cp->ch_head->ch_list = cp; 615 mutex_exit(&nfscl->nfscl_chtable_lock); 616 } 617 618 void 619 clfree(CLIENT *cl, struct chtab *cp) 620 { 621 struct nfs_clnt *nfscl; 622 623 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 624 ASSERT(nfscl != NULL); 625 626 clfree_impl(cl, cp, nfscl); 627 } 628 629 #define CL_HOLDTIME 60 /* time to hold client handles */ 630 631 static void 632 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 633 { 634 struct chhead *ch; 635 struct chtab *cp; /* list of objects that can be reclaimed */ 636 struct chtab *cpe; 637 struct chtab *cpl; 638 struct chtab **cpp; 639 #ifdef DEBUG 640 int n = 0; 641 #endif 642 643 /* 644 * Need to reclaim some memory, so step through the cache 645 * looking through the lists for entries which can be freed. 646 */ 647 cp = NULL; 648 649 mutex_enter(&nfscl->nfscl_chtable_lock); 650 651 /* 652 * Here we step through each non-NULL quadruple and start to 653 * construct the reclaim list pointed to by cp. Note that 654 * cp will contain all eligible chtab entries. When this traversal 655 * completes, chtab entries from the last quadruple will be at the 656 * front of cp and entries from previously inspected quadruples have 657 * been appended to the rear of cp. 658 */ 659 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 660 if (ch->ch_list == NULL) 661 continue; 662 /* 663 * Search each list for entries older then 664 * cl_holdtime seconds. The lists are maintained 665 * in youngest to oldest order so that when the 666 * first entry is found which is old enough, then 667 * all of the rest of the entries on the list will 668 * be old enough as well. 669 */ 670 cpl = ch->ch_list; 671 cpp = &ch->ch_list; 672 while (cpl != NULL && 673 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 674 cpp = &cpl->ch_list; 675 cpl = cpl->ch_list; 676 } 677 if (cpl != NULL) { 678 *cpp = NULL; 679 if (cp != NULL) { 680 cpe = cpl; 681 while (cpe->ch_list != NULL) 682 cpe = cpe->ch_list; 683 cpe->ch_list = cp; 684 } 685 cp = cpl; 686 } 687 } 688 689 mutex_exit(&nfscl->nfscl_chtable_lock); 690 691 /* 692 * If cp is empty, then there is nothing to reclaim here. 693 */ 694 if (cp == NULL) 695 return; 696 697 /* 698 * Step through the list of entries to free, destroying each client 699 * handle and kmem_free'ing the memory for each entry. 700 */ 701 while (cp != NULL) { 702 #ifdef DEBUG 703 n++; 704 #endif 705 CLNT_DESTROY(cp->ch_client); 706 cpl = cp->ch_list; 707 kmem_cache_free(chtab_cache, cp); 708 cp = cpl; 709 } 710 711 #ifdef DEBUG 712 /* 713 * Update clalloc so that nfsstat shows the current number 714 * of allocated client handles. 715 */ 716 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 717 #endif 718 } 719 720 /* ARGSUSED */ 721 static void 722 clreclaim(void *all) 723 { 724 struct nfs_clnt *nfscl; 725 726 #ifdef DEBUG 727 clstat_debug.clreclaim.value.ui64++; 728 #endif 729 /* 730 * The system is low on memory; go through and try to reclaim some from 731 * every zone on the system. 732 */ 733 mutex_enter(&nfs_clnt_list_lock); 734 nfscl = list_head(&nfs_clnt_list); 735 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 736 clreclaim_zone(nfscl, CL_HOLDTIME); 737 mutex_exit(&nfs_clnt_list_lock); 738 } 739 740 /* 741 * Minimum time-out values indexed by call type 742 * These units are in "eights" of a second to avoid multiplies 743 */ 744 static unsigned int minimum_timeo[] = { 745 6, 7, 10 746 }; 747 748 /* 749 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 750 */ 751 #define MAXTIMO (20*hz) 752 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 753 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 754 755 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 756 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 757 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 758 759 /* 760 * Function called when rfscall notices that we have been 761 * re-transmitting, or when we get a response without retransmissions. 762 * Return 1 if the transfer size was adjusted down - 0 if no change. 763 */ 764 static int 765 nfs_feedback(int flag, int which, mntinfo_t *mi) 766 { 767 int kind; 768 int r = 0; 769 770 mutex_enter(&mi->mi_lock); 771 if (flag == FEEDBACK_REXMIT1) { 772 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 773 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 774 goto done; 775 if (mi->mi_curread > MIN_NFS_TSIZE) { 776 mi->mi_curread /= 2; 777 if (mi->mi_curread < MIN_NFS_TSIZE) 778 mi->mi_curread = MIN_NFS_TSIZE; 779 r = 1; 780 } 781 782 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 783 mi->mi_curwrite /= 2; 784 if (mi->mi_curwrite < MIN_NFS_TSIZE) 785 mi->mi_curwrite = MIN_NFS_TSIZE; 786 r = 1; 787 } 788 } else if (flag == FEEDBACK_OK) { 789 kind = mi->mi_timer_type[which]; 790 if (kind == 0 || 791 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 792 goto done; 793 if (kind == 1) { 794 if (mi->mi_curread >= mi->mi_tsize) 795 goto done; 796 mi->mi_curread += MIN_NFS_TSIZE; 797 if (mi->mi_curread > mi->mi_tsize/2) 798 mi->mi_curread = mi->mi_tsize; 799 } else if (kind == 2) { 800 if (mi->mi_curwrite >= mi->mi_stsize) 801 goto done; 802 mi->mi_curwrite += MIN_NFS_TSIZE; 803 if (mi->mi_curwrite > mi->mi_stsize/2) 804 mi->mi_curwrite = mi->mi_stsize; 805 } 806 } 807 done: 808 mutex_exit(&mi->mi_lock); 809 return (r); 810 } 811 812 #ifdef DEBUG 813 static int rfs2call_hits = 0; 814 static int rfs2call_misses = 0; 815 #endif 816 817 int 818 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 819 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 820 enum nfsstat *statusp, int flags, failinfo_t *fi) 821 { 822 int rpcerror; 823 enum clnt_stat rpc_status; 824 825 ASSERT(statusp != NULL); 826 827 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 828 cr, douprintf, &rpc_status, flags, fi); 829 if (!rpcerror) { 830 /* 831 * See crnetadjust() for comments. 832 */ 833 if (*statusp == NFSERR_ACCES && 834 (cr = crnetadjust(cr)) != NULL) { 835 #ifdef DEBUG 836 rfs2call_hits++; 837 #endif 838 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 839 resp, cr, douprintf, NULL, flags, fi); 840 crfree(cr); 841 #ifdef DEBUG 842 if (*statusp == NFSERR_ACCES) 843 rfs2call_misses++; 844 #endif 845 } 846 } else if (rpc_status == RPC_PROCUNAVAIL) { 847 *statusp = NFSERR_OPNOTSUPP; 848 rpcerror = 0; 849 } 850 851 return (rpcerror); 852 } 853 854 #define NFS3_JUKEBOX_DELAY 10 * hz 855 856 static clock_t nfs3_jukebox_delay = 0; 857 858 #ifdef DEBUG 859 static int rfs3call_hits = 0; 860 static int rfs3call_misses = 0; 861 #endif 862 863 int 864 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 865 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 866 nfsstat3 *statusp, int flags, failinfo_t *fi) 867 { 868 int rpcerror; 869 int user_informed; 870 871 user_informed = 0; 872 do { 873 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 874 cr, douprintf, NULL, flags, fi); 875 if (!rpcerror) { 876 cred_t *crr; 877 if (*statusp == NFS3ERR_JUKEBOX) { 878 if (ttoproc(curthread) == &p0) { 879 rpcerror = EAGAIN; 880 break; 881 } 882 if (!user_informed) { 883 user_informed = 1; 884 uprintf( 885 "file temporarily unavailable on the server, retrying...\n"); 886 } 887 delay(nfs3_jukebox_delay); 888 } 889 /* 890 * See crnetadjust() for comments. 891 */ 892 else if (*statusp == NFS3ERR_ACCES && 893 (crr = crnetadjust(cr)) != NULL) { 894 #ifdef DEBUG 895 rfs3call_hits++; 896 #endif 897 rpcerror = rfscall(mi, which, xdrargs, argsp, 898 xdrres, resp, crr, douprintf, 899 NULL, flags, fi); 900 901 crfree(crr); 902 #ifdef DEBUG 903 if (*statusp == NFS3ERR_ACCES) 904 rfs3call_misses++; 905 #endif 906 } 907 } 908 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 909 910 return (rpcerror); 911 } 912 913 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 914 #define INC_READERS(mi) { \ 915 mi->mi_readers++; \ 916 } 917 #define DEC_READERS(mi) { \ 918 mi->mi_readers--; \ 919 if (mi->mi_readers == 0) \ 920 cv_broadcast(&mi->mi_failover_cv); \ 921 } 922 923 static int 924 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 925 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 926 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 927 { 928 CLIENT *client; 929 struct chtab *ch; 930 cred_t *cr = icr; 931 enum clnt_stat status; 932 struct rpc_err rpcerr; 933 struct timeval wait; 934 int timeo; /* in units of hz */ 935 int my_rsize, my_wsize; 936 bool_t tryagain; 937 bool_t cred_cloned = FALSE; 938 k_sigset_t smask; 939 servinfo_t *svp; 940 struct nfs_clnt *nfscl; 941 zoneid_t zoneid = getzoneid(); 942 #ifdef DEBUG 943 char *bufp; 944 #endif 945 946 947 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 948 "rfscall_start:which %d mi %p", which, mi); 949 950 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 951 ASSERT(nfscl != NULL); 952 953 nfscl->nfscl_stat.calls.value.ui64++; 954 mi->mi_reqs[which].value.ui64++; 955 956 rpcerr.re_status = RPC_SUCCESS; 957 958 /* 959 * In case of forced unmount or zone shutdown, return EIO. 960 */ 961 962 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 963 rpcerr.re_status = RPC_FAILED; 964 rpcerr.re_errno = EIO; 965 return (rpcerr.re_errno); 966 } 967 968 /* 969 * Remember the transfer sizes in case 970 * nfs_feedback changes them underneath us. 971 */ 972 my_rsize = mi->mi_curread; 973 my_wsize = mi->mi_curwrite; 974 975 /* 976 * NFS client failover support 977 * 978 * If this rnode is not in sync with the current server (VALID_FH), 979 * we'd like to do a remap to get in sync. We can be interrupted 980 * in failover_remap(), and if so we'll bail. Otherwise, we'll 981 * use the best info we have to try the RPC. Part of that is 982 * unconditionally updating the filehandle copy kept for V3. 983 * 984 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 985 * rw_enter(); we're trying to keep the current server from being 986 * changed on us until we're done with the remapping and have a 987 * matching client handle. We don't want to sending a filehandle 988 * to the wrong host. 989 */ 990 failoverretry: 991 if (FAILOVER_MOUNT(mi)) { 992 mutex_enter(&mi->mi_lock); 993 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 994 if (failover_wait(mi)) { 995 mutex_exit(&mi->mi_lock); 996 return (EINTR); 997 } 998 } 999 INC_READERS(mi); 1000 mutex_exit(&mi->mi_lock); 1001 if (fi) { 1002 if (!VALID_FH(fi) && 1003 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1004 int remaperr; 1005 1006 svp = mi->mi_curr_serv; 1007 remaperr = failover_remap(fi); 1008 if (remaperr != 0) { 1009 #ifdef DEBUG 1010 if (remaperr != EINTR) 1011 nfs_cmn_err(remaperr, CE_WARN, 1012 "rfscall couldn't failover: %m"); 1013 #endif 1014 mutex_enter(&mi->mi_lock); 1015 DEC_READERS(mi); 1016 mutex_exit(&mi->mi_lock); 1017 /* 1018 * If failover_remap returns ETIMEDOUT 1019 * and the filesystem is hard mounted 1020 * we have to retry the call with a new 1021 * server. 1022 */ 1023 if ((mi->mi_flags & MI_HARD) && 1024 IS_RECOVERABLE_ERROR(remaperr)) { 1025 if (svp == mi->mi_curr_serv) 1026 failover_newserver(mi); 1027 rpcerr.re_status = RPC_SUCCESS; 1028 goto failoverretry; 1029 } 1030 rpcerr.re_errno = remaperr; 1031 return (remaperr); 1032 } 1033 } 1034 if (fi->fhp && fi->copyproc) 1035 (*fi->copyproc)(fi->fhp, fi->vp); 1036 } 1037 } 1038 1039 /* For TSOL, use a new cred which has net_mac_aware flag */ 1040 if (!cred_cloned && is_system_labeled()) { 1041 cred_cloned = TRUE; 1042 cr = crdup(icr); 1043 (void) setpflags(NET_MAC_AWARE, 1, cr); 1044 } 1045 1046 /* 1047 * clget() calls clnt_tli_kinit() which clears the xid, so we 1048 * are guaranteed to reprocess the retry as a new request. 1049 */ 1050 svp = mi->mi_curr_serv; 1051 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1052 1053 if (FAILOVER_MOUNT(mi)) { 1054 mutex_enter(&mi->mi_lock); 1055 DEC_READERS(mi); 1056 mutex_exit(&mi->mi_lock); 1057 1058 if ((rpcerr.re_errno == ETIMEDOUT || 1059 rpcerr.re_errno == ECONNRESET) && 1060 failover_safe(fi)) { 1061 if (svp == mi->mi_curr_serv) 1062 failover_newserver(mi); 1063 goto failoverretry; 1064 } 1065 } 1066 if (rpcerr.re_errno != 0) 1067 return (rpcerr.re_errno); 1068 1069 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1070 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1071 timeo = (mi->mi_timeo * hz) / 10; 1072 } else { 1073 mutex_enter(&mi->mi_lock); 1074 timeo = CLNT_SETTIMERS(client, 1075 &(mi->mi_timers[mi->mi_timer_type[which]]), 1076 &(mi->mi_timers[NFS_CALLTYPES]), 1077 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1078 (void (*)())NULL, (caddr_t)mi, 0); 1079 mutex_exit(&mi->mi_lock); 1080 } 1081 1082 /* 1083 * If hard mounted fs, retry call forever unless hard error occurs. 1084 */ 1085 do { 1086 tryagain = FALSE; 1087 1088 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1089 status = RPC_FAILED; 1090 rpcerr.re_status = RPC_FAILED; 1091 rpcerr.re_errno = EIO; 1092 break; 1093 } 1094 1095 TICK_TO_TIMEVAL(timeo, &wait); 1096 1097 /* 1098 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1099 * and SIGTERM. (Preserving the existing masks). 1100 * Mask out SIGINT if mount option nointr is specified. 1101 */ 1102 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1103 if (!(mi->mi_flags & MI_INT)) 1104 client->cl_nosignal = TRUE; 1105 1106 /* 1107 * If there is a current signal, then don't bother 1108 * even trying to send out the request because we 1109 * won't be able to block waiting for the response. 1110 * Simply assume RPC_INTR and get on with it. 1111 */ 1112 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1113 status = RPC_INTR; 1114 else { 1115 status = CLNT_CALL(client, which, xdrargs, argsp, 1116 xdrres, resp, wait); 1117 } 1118 1119 if (!(mi->mi_flags & MI_INT)) 1120 client->cl_nosignal = FALSE; 1121 /* 1122 * restore original signal mask 1123 */ 1124 sigunintr(&smask); 1125 1126 switch (status) { 1127 case RPC_SUCCESS: 1128 if ((mi->mi_flags & MI_DYNAMIC) && 1129 mi->mi_timer_type[which] != 0 && 1130 (mi->mi_curread != my_rsize || 1131 mi->mi_curwrite != my_wsize)) 1132 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1133 break; 1134 1135 case RPC_INTR: 1136 /* 1137 * There is no way to recover from this error, 1138 * even if mount option nointr is specified. 1139 * SIGKILL, for example, cannot be blocked. 1140 */ 1141 rpcerr.re_status = RPC_INTR; 1142 rpcerr.re_errno = EINTR; 1143 break; 1144 1145 case RPC_UDERROR: 1146 /* 1147 * If the NFS server is local (vold) and 1148 * it goes away then we get RPC_UDERROR. 1149 * This is a retryable error, so we would 1150 * loop, so check to see if the specific 1151 * error was ECONNRESET, indicating that 1152 * target did not exist at all. If so, 1153 * return with RPC_PROGUNAVAIL and 1154 * ECONNRESET to indicate why. 1155 */ 1156 CLNT_GETERR(client, &rpcerr); 1157 if (rpcerr.re_errno == ECONNRESET) { 1158 rpcerr.re_status = RPC_PROGUNAVAIL; 1159 rpcerr.re_errno = ECONNRESET; 1160 break; 1161 } 1162 /*FALLTHROUGH*/ 1163 1164 default: /* probably RPC_TIMEDOUT */ 1165 if (IS_UNRECOVERABLE_RPC(status)) 1166 break; 1167 1168 /* 1169 * increment server not responding count 1170 */ 1171 mutex_enter(&mi->mi_lock); 1172 mi->mi_noresponse++; 1173 mutex_exit(&mi->mi_lock); 1174 #ifdef DEBUG 1175 nfscl->nfscl_stat.noresponse.value.ui64++; 1176 #endif 1177 1178 if (!(mi->mi_flags & MI_HARD)) { 1179 if (!(mi->mi_flags & MI_SEMISOFT) || 1180 (mi->mi_ss_call_type[which] == 0)) 1181 break; 1182 } 1183 1184 /* 1185 * The call is in progress (over COTS). 1186 * Try the CLNT_CALL again, but don't 1187 * print a noisy error message. 1188 */ 1189 if (status == RPC_INPROGRESS) { 1190 tryagain = TRUE; 1191 break; 1192 } 1193 1194 if (flags & RFSCALL_SOFT) 1195 break; 1196 1197 /* 1198 * On zone shutdown, just move on. 1199 */ 1200 if (zone_status_get(curproc->p_zone) >= 1201 ZONE_IS_SHUTTING_DOWN) { 1202 rpcerr.re_status = RPC_FAILED; 1203 rpcerr.re_errno = EIO; 1204 break; 1205 } 1206 1207 /* 1208 * NFS client failover support 1209 * 1210 * If the current server just failed us, we'll 1211 * start the process of finding a new server. 1212 * After that, we can just retry. 1213 */ 1214 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1215 if (svp == mi->mi_curr_serv) 1216 failover_newserver(mi); 1217 clfree_impl(client, ch, nfscl); 1218 goto failoverretry; 1219 } 1220 1221 tryagain = TRUE; 1222 timeo = backoff(timeo); 1223 mutex_enter(&mi->mi_lock); 1224 if (!(mi->mi_flags & MI_PRINTED)) { 1225 mi->mi_flags |= MI_PRINTED; 1226 mutex_exit(&mi->mi_lock); 1227 #ifdef DEBUG 1228 zprintf(zoneid, 1229 "NFS%d server %s not responding still trying\n", 1230 mi->mi_vers, svp->sv_hostname); 1231 #else 1232 zprintf(zoneid, 1233 "NFS server %s not responding still trying\n", 1234 svp->sv_hostname); 1235 #endif 1236 } else 1237 mutex_exit(&mi->mi_lock); 1238 if (*douprintf && nfs_has_ctty()) { 1239 *douprintf = 0; 1240 if (!(mi->mi_flags & MI_NOPRINT)) 1241 #ifdef DEBUG 1242 uprintf( 1243 "NFS%d server %s not responding still trying\n", 1244 mi->mi_vers, svp->sv_hostname); 1245 #else 1246 uprintf( 1247 "NFS server %s not responding still trying\n", 1248 svp->sv_hostname); 1249 #endif 1250 } 1251 1252 /* 1253 * If doing dynamic adjustment of transfer 1254 * size and if it's a read or write call 1255 * and if the transfer size changed while 1256 * retransmitting or if the feedback routine 1257 * changed the transfer size, 1258 * then exit rfscall so that the transfer 1259 * size can be adjusted at the vnops level. 1260 */ 1261 if ((mi->mi_flags & MI_DYNAMIC) && 1262 mi->mi_timer_type[which] != 0 && 1263 (mi->mi_curread != my_rsize || 1264 mi->mi_curwrite != my_wsize || 1265 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1266 /* 1267 * On read or write calls, return 1268 * back to the vnode ops level if 1269 * the transfer size changed. 1270 */ 1271 clfree_impl(client, ch, nfscl); 1272 if (cred_cloned) 1273 crfree(cr); 1274 return (ENFS_TRYAGAIN); 1275 } 1276 } 1277 } while (tryagain); 1278 1279 if (status != RPC_SUCCESS) { 1280 /* 1281 * Let soft mounts use the timed out message. 1282 */ 1283 if (status == RPC_INPROGRESS) 1284 status = RPC_TIMEDOUT; 1285 nfscl->nfscl_stat.badcalls.value.ui64++; 1286 if (status != RPC_INTR) { 1287 mutex_enter(&mi->mi_lock); 1288 mi->mi_flags |= MI_DOWN; 1289 mutex_exit(&mi->mi_lock); 1290 CLNT_GETERR(client, &rpcerr); 1291 #ifdef DEBUG 1292 bufp = clnt_sperror(client, svp->sv_hostname); 1293 zprintf(zoneid, "NFS%d %s failed for %s\n", 1294 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1295 if (nfs_has_ctty()) { 1296 if (!(mi->mi_flags & MI_NOPRINT)) { 1297 uprintf("NFS%d %s failed for %s\n", 1298 mi->mi_vers, mi->mi_rfsnames[which], 1299 bufp); 1300 } 1301 } 1302 kmem_free(bufp, MAXPATHLEN); 1303 #else 1304 zprintf(zoneid, 1305 "NFS %s failed for server %s: error %d (%s)\n", 1306 mi->mi_rfsnames[which], svp->sv_hostname, 1307 status, clnt_sperrno(status)); 1308 if (nfs_has_ctty()) { 1309 if (!(mi->mi_flags & MI_NOPRINT)) { 1310 uprintf( 1311 "NFS %s failed for server %s: error %d (%s)\n", 1312 mi->mi_rfsnames[which], 1313 svp->sv_hostname, status, 1314 clnt_sperrno(status)); 1315 } 1316 } 1317 #endif 1318 /* 1319 * when CLNT_CALL() fails with RPC_AUTHERROR, 1320 * re_errno is set appropriately depending on 1321 * the authentication error 1322 */ 1323 if (status == RPC_VERSMISMATCH || 1324 status == RPC_PROGVERSMISMATCH) 1325 rpcerr.re_errno = EIO; 1326 } 1327 } else { 1328 /* 1329 * Test the value of mi_down and mi_printed without 1330 * holding the mi_lock mutex. If they are both zero, 1331 * then it is okay to skip the down and printed 1332 * processing. This saves on a mutex_enter and 1333 * mutex_exit pair for a normal, successful RPC. 1334 * This was just complete overhead. 1335 */ 1336 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1337 mutex_enter(&mi->mi_lock); 1338 mi->mi_flags &= ~MI_DOWN; 1339 if (mi->mi_flags & MI_PRINTED) { 1340 mi->mi_flags &= ~MI_PRINTED; 1341 mutex_exit(&mi->mi_lock); 1342 #ifdef DEBUG 1343 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1344 zprintf(zoneid, "NFS%d server %s ok\n", 1345 mi->mi_vers, svp->sv_hostname); 1346 #else 1347 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1348 zprintf(zoneid, "NFS server %s ok\n", 1349 svp->sv_hostname); 1350 #endif 1351 } else 1352 mutex_exit(&mi->mi_lock); 1353 } 1354 1355 if (*douprintf == 0) { 1356 if (!(mi->mi_flags & MI_NOPRINT)) 1357 #ifdef DEBUG 1358 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1359 uprintf("NFS%d server %s ok\n", 1360 mi->mi_vers, svp->sv_hostname); 1361 #else 1362 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1363 uprintf("NFS server %s ok\n", svp->sv_hostname); 1364 #endif 1365 *douprintf = 1; 1366 } 1367 } 1368 1369 clfree_impl(client, ch, nfscl); 1370 if (cred_cloned) 1371 crfree(cr); 1372 1373 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1374 1375 if (rpc_status != NULL) 1376 *rpc_status = rpcerr.re_status; 1377 1378 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1379 rpcerr.re_errno); 1380 1381 return (rpcerr.re_errno); 1382 } 1383 1384 #ifdef DEBUG 1385 static int acl2call_hits = 0; 1386 static int acl2call_misses = 0; 1387 #endif 1388 1389 int 1390 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1391 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1392 enum nfsstat *statusp, int flags, failinfo_t *fi) 1393 { 1394 int rpcerror; 1395 1396 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1397 cr, douprintf, flags, fi); 1398 if (!rpcerror) { 1399 /* 1400 * See comments with crnetadjust(). 1401 */ 1402 if (*statusp == NFSERR_ACCES && 1403 (cr = crnetadjust(cr)) != NULL) { 1404 #ifdef DEBUG 1405 acl2call_hits++; 1406 #endif 1407 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1408 resp, cr, douprintf, flags, fi); 1409 crfree(cr); 1410 #ifdef DEBUG 1411 if (*statusp == NFSERR_ACCES) 1412 acl2call_misses++; 1413 #endif 1414 } 1415 } 1416 1417 return (rpcerror); 1418 } 1419 1420 #ifdef DEBUG 1421 static int acl3call_hits = 0; 1422 static int acl3call_misses = 0; 1423 #endif 1424 1425 int 1426 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1427 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1428 nfsstat3 *statusp, int flags, failinfo_t *fi) 1429 { 1430 int rpcerror; 1431 int user_informed; 1432 1433 user_informed = 0; 1434 1435 do { 1436 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1437 cr, douprintf, flags, fi); 1438 if (!rpcerror) { 1439 cred_t *crr; 1440 if (*statusp == NFS3ERR_JUKEBOX) { 1441 if (!user_informed) { 1442 user_informed = 1; 1443 uprintf( 1444 "file temporarily unavailable on the server, retrying...\n"); 1445 } 1446 delay(nfs3_jukebox_delay); 1447 } 1448 /* 1449 * See crnetadjust() for comments. 1450 */ 1451 else if (*statusp == NFS3ERR_ACCES && 1452 (crr = crnetadjust(cr)) != NULL) { 1453 #ifdef DEBUG 1454 acl3call_hits++; 1455 #endif 1456 rpcerror = aclcall(mi, which, xdrargs, argsp, 1457 xdrres, resp, crr, douprintf, flags, fi); 1458 1459 crfree(crr); 1460 #ifdef DEBUG 1461 if (*statusp == NFS3ERR_ACCES) 1462 acl3call_misses++; 1463 #endif 1464 } 1465 } 1466 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1467 1468 return (rpcerror); 1469 } 1470 1471 static int 1472 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1473 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1474 int flags, failinfo_t *fi) 1475 { 1476 CLIENT *client; 1477 struct chtab *ch; 1478 cred_t *cr = icr; 1479 bool_t cred_cloned = FALSE; 1480 enum clnt_stat status; 1481 struct rpc_err rpcerr; 1482 struct timeval wait; 1483 int timeo; /* in units of hz */ 1484 #if 0 /* notyet */ 1485 int my_rsize, my_wsize; 1486 #endif 1487 bool_t tryagain; 1488 k_sigset_t smask; 1489 servinfo_t *svp; 1490 struct nfs_clnt *nfscl; 1491 zoneid_t zoneid = getzoneid(); 1492 #ifdef DEBUG 1493 char *bufp; 1494 #endif 1495 1496 #if 0 /* notyet */ 1497 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1498 "rfscall_start:which %d mi %p", which, mi); 1499 #endif 1500 1501 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1502 ASSERT(nfscl != NULL); 1503 1504 nfscl->nfscl_stat.calls.value.ui64++; 1505 mi->mi_aclreqs[which].value.ui64++; 1506 1507 rpcerr.re_status = RPC_SUCCESS; 1508 1509 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1510 rpcerr.re_status = RPC_FAILED; 1511 rpcerr.re_errno = EIO; 1512 return (rpcerr.re_errno); 1513 } 1514 1515 #if 0 /* notyet */ 1516 /* 1517 * Remember the transfer sizes in case 1518 * nfs_feedback changes them underneath us. 1519 */ 1520 my_rsize = mi->mi_curread; 1521 my_wsize = mi->mi_curwrite; 1522 #endif 1523 1524 /* 1525 * NFS client failover support 1526 * 1527 * If this rnode is not in sync with the current server (VALID_FH), 1528 * we'd like to do a remap to get in sync. We can be interrupted 1529 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1530 * use the best info we have to try the RPC. Part of that is 1531 * unconditionally updating the filehandle copy kept for V3. 1532 * 1533 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1534 * rw_enter(); we're trying to keep the current server from being 1535 * changed on us until we're done with the remapping and have a 1536 * matching client handle. We don't want to sending a filehandle 1537 * to the wrong host. 1538 */ 1539 failoverretry: 1540 if (FAILOVER_MOUNT(mi)) { 1541 mutex_enter(&mi->mi_lock); 1542 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1543 if (failover_wait(mi)) { 1544 mutex_exit(&mi->mi_lock); 1545 return (EINTR); 1546 } 1547 } 1548 INC_READERS(mi); 1549 mutex_exit(&mi->mi_lock); 1550 if (fi) { 1551 if (!VALID_FH(fi) && 1552 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1553 int remaperr; 1554 1555 svp = mi->mi_curr_serv; 1556 remaperr = failover_remap(fi); 1557 if (remaperr != 0) { 1558 #ifdef DEBUG 1559 if (remaperr != EINTR) 1560 nfs_cmn_err(remaperr, CE_WARN, 1561 "aclcall couldn't failover: %m"); 1562 #endif 1563 mutex_enter(&mi->mi_lock); 1564 DEC_READERS(mi); 1565 mutex_exit(&mi->mi_lock); 1566 1567 /* 1568 * If failover_remap returns ETIMEDOUT 1569 * and the filesystem is hard mounted 1570 * we have to retry the call with a new 1571 * server. 1572 */ 1573 if ((mi->mi_flags & MI_HARD) && 1574 IS_RECOVERABLE_ERROR(remaperr)) { 1575 if (svp == mi->mi_curr_serv) 1576 failover_newserver(mi); 1577 rpcerr.re_status = RPC_SUCCESS; 1578 goto failoverretry; 1579 } 1580 return (remaperr); 1581 } 1582 } 1583 if (fi->fhp && fi->copyproc) 1584 (*fi->copyproc)(fi->fhp, fi->vp); 1585 } 1586 } 1587 1588 /* For TSOL, use a new cred which has net_mac_aware flag */ 1589 if (!cred_cloned && is_system_labeled()) { 1590 cred_cloned = TRUE; 1591 cr = crdup(icr); 1592 (void) setpflags(NET_MAC_AWARE, 1, cr); 1593 } 1594 1595 /* 1596 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1597 * are guaranteed to reprocess the retry as a new request. 1598 */ 1599 svp = mi->mi_curr_serv; 1600 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1601 if (FAILOVER_MOUNT(mi)) { 1602 mutex_enter(&mi->mi_lock); 1603 DEC_READERS(mi); 1604 mutex_exit(&mi->mi_lock); 1605 1606 if ((rpcerr.re_errno == ETIMEDOUT || 1607 rpcerr.re_errno == ECONNRESET) && 1608 failover_safe(fi)) { 1609 if (svp == mi->mi_curr_serv) 1610 failover_newserver(mi); 1611 goto failoverretry; 1612 } 1613 } 1614 if (rpcerr.re_errno != 0) { 1615 if (cred_cloned) 1616 crfree(cr); 1617 return (rpcerr.re_errno); 1618 } 1619 1620 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1621 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1622 timeo = (mi->mi_timeo * hz) / 10; 1623 } else { 1624 mutex_enter(&mi->mi_lock); 1625 timeo = CLNT_SETTIMERS(client, 1626 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1627 &(mi->mi_timers[NFS_CALLTYPES]), 1628 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1629 (void (*)()) 0, (caddr_t)mi, 0); 1630 mutex_exit(&mi->mi_lock); 1631 } 1632 1633 /* 1634 * If hard mounted fs, retry call forever unless hard error occurs. 1635 */ 1636 do { 1637 tryagain = FALSE; 1638 1639 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1640 status = RPC_FAILED; 1641 rpcerr.re_status = RPC_FAILED; 1642 rpcerr.re_errno = EIO; 1643 break; 1644 } 1645 1646 TICK_TO_TIMEVAL(timeo, &wait); 1647 1648 /* 1649 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1650 * and SIGTERM. (Preserving the existing masks). 1651 * Mask out SIGINT if mount option nointr is specified. 1652 */ 1653 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1654 if (!(mi->mi_flags & MI_INT)) 1655 client->cl_nosignal = TRUE; 1656 1657 /* 1658 * If there is a current signal, then don't bother 1659 * even trying to send out the request because we 1660 * won't be able to block waiting for the response. 1661 * Simply assume RPC_INTR and get on with it. 1662 */ 1663 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1664 status = RPC_INTR; 1665 else { 1666 status = CLNT_CALL(client, which, xdrargs, argsp, 1667 xdrres, resp, wait); 1668 } 1669 1670 if (!(mi->mi_flags & MI_INT)) 1671 client->cl_nosignal = FALSE; 1672 /* 1673 * restore original signal mask 1674 */ 1675 sigunintr(&smask); 1676 1677 switch (status) { 1678 case RPC_SUCCESS: 1679 #if 0 /* notyet */ 1680 if ((mi->mi_flags & MI_DYNAMIC) && 1681 mi->mi_timer_type[which] != 0 && 1682 (mi->mi_curread != my_rsize || 1683 mi->mi_curwrite != my_wsize)) 1684 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1685 #endif 1686 break; 1687 1688 /* 1689 * Unfortunately, there are servers in the world which 1690 * are not coded correctly. They are not prepared to 1691 * handle RPC requests to the NFS port which are not 1692 * NFS requests. Thus, they may try to process the 1693 * NFS_ACL request as if it were an NFS request. This 1694 * does not work. Generally, an error will be generated 1695 * on the client because it will not be able to decode 1696 * the response from the server. However, it seems 1697 * possible that the server may not be able to decode 1698 * the arguments. Thus, the criteria for deciding 1699 * whether the server supports NFS_ACL or not is whether 1700 * the following RPC errors are returned from CLNT_CALL. 1701 */ 1702 case RPC_CANTDECODERES: 1703 case RPC_PROGUNAVAIL: 1704 case RPC_CANTDECODEARGS: 1705 case RPC_PROGVERSMISMATCH: 1706 mutex_enter(&mi->mi_lock); 1707 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1708 mutex_exit(&mi->mi_lock); 1709 break; 1710 1711 /* 1712 * If the server supports NFS_ACL but not the new ops 1713 * for extended attributes, make sure we don't retry. 1714 */ 1715 case RPC_PROCUNAVAIL: 1716 mutex_enter(&mi->mi_lock); 1717 mi->mi_flags &= ~MI_EXTATTR; 1718 mutex_exit(&mi->mi_lock); 1719 break; 1720 1721 case RPC_INTR: 1722 /* 1723 * There is no way to recover from this error, 1724 * even if mount option nointr is specified. 1725 * SIGKILL, for example, cannot be blocked. 1726 */ 1727 rpcerr.re_status = RPC_INTR; 1728 rpcerr.re_errno = EINTR; 1729 break; 1730 1731 case RPC_UDERROR: 1732 /* 1733 * If the NFS server is local (vold) and 1734 * it goes away then we get RPC_UDERROR. 1735 * This is a retryable error, so we would 1736 * loop, so check to see if the specific 1737 * error was ECONNRESET, indicating that 1738 * target did not exist at all. If so, 1739 * return with RPC_PROGUNAVAIL and 1740 * ECONNRESET to indicate why. 1741 */ 1742 CLNT_GETERR(client, &rpcerr); 1743 if (rpcerr.re_errno == ECONNRESET) { 1744 rpcerr.re_status = RPC_PROGUNAVAIL; 1745 rpcerr.re_errno = ECONNRESET; 1746 break; 1747 } 1748 /*FALLTHROUGH*/ 1749 1750 default: /* probably RPC_TIMEDOUT */ 1751 if (IS_UNRECOVERABLE_RPC(status)) 1752 break; 1753 1754 /* 1755 * increment server not responding count 1756 */ 1757 mutex_enter(&mi->mi_lock); 1758 mi->mi_noresponse++; 1759 mutex_exit(&mi->mi_lock); 1760 #ifdef DEBUG 1761 nfscl->nfscl_stat.noresponse.value.ui64++; 1762 #endif 1763 1764 if (!(mi->mi_flags & MI_HARD)) { 1765 if (!(mi->mi_flags & MI_SEMISOFT) || 1766 (mi->mi_acl_ss_call_type[which] == 0)) 1767 break; 1768 } 1769 1770 /* 1771 * The call is in progress (over COTS). 1772 * Try the CLNT_CALL again, but don't 1773 * print a noisy error message. 1774 */ 1775 if (status == RPC_INPROGRESS) { 1776 tryagain = TRUE; 1777 break; 1778 } 1779 1780 if (flags & RFSCALL_SOFT) 1781 break; 1782 1783 /* 1784 * On zone shutdown, just move on. 1785 */ 1786 if (zone_status_get(curproc->p_zone) >= 1787 ZONE_IS_SHUTTING_DOWN) { 1788 rpcerr.re_status = RPC_FAILED; 1789 rpcerr.re_errno = EIO; 1790 break; 1791 } 1792 1793 /* 1794 * NFS client failover support 1795 * 1796 * If the current server just failed us, we'll 1797 * start the process of finding a new server. 1798 * After that, we can just retry. 1799 */ 1800 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1801 if (svp == mi->mi_curr_serv) 1802 failover_newserver(mi); 1803 clfree_impl(client, ch, nfscl); 1804 goto failoverretry; 1805 } 1806 1807 tryagain = TRUE; 1808 timeo = backoff(timeo); 1809 mutex_enter(&mi->mi_lock); 1810 if (!(mi->mi_flags & MI_PRINTED)) { 1811 mi->mi_flags |= MI_PRINTED; 1812 mutex_exit(&mi->mi_lock); 1813 #ifdef DEBUG 1814 zprintf(zoneid, 1815 "NFS_ACL%d server %s not responding still trying\n", 1816 mi->mi_vers, svp->sv_hostname); 1817 #else 1818 zprintf(zoneid, 1819 "NFS server %s not responding still trying\n", 1820 svp->sv_hostname); 1821 #endif 1822 } else 1823 mutex_exit(&mi->mi_lock); 1824 if (*douprintf && nfs_has_ctty()) { 1825 *douprintf = 0; 1826 if (!(mi->mi_flags & MI_NOPRINT)) 1827 #ifdef DEBUG 1828 uprintf( 1829 "NFS_ACL%d server %s not responding still trying\n", 1830 mi->mi_vers, svp->sv_hostname); 1831 #else 1832 uprintf( 1833 "NFS server %s not responding still trying\n", 1834 svp->sv_hostname); 1835 #endif 1836 } 1837 1838 #if 0 /* notyet */ 1839 /* 1840 * If doing dynamic adjustment of transfer 1841 * size and if it's a read or write call 1842 * and if the transfer size changed while 1843 * retransmitting or if the feedback routine 1844 * changed the transfer size, 1845 * then exit rfscall so that the transfer 1846 * size can be adjusted at the vnops level. 1847 */ 1848 if ((mi->mi_flags & MI_DYNAMIC) && 1849 mi->mi_acl_timer_type[which] != 0 && 1850 (mi->mi_curread != my_rsize || 1851 mi->mi_curwrite != my_wsize || 1852 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1853 /* 1854 * On read or write calls, return 1855 * back to the vnode ops level if 1856 * the transfer size changed. 1857 */ 1858 clfree_impl(client, ch, nfscl); 1859 if (cred_cloned) 1860 crfree(cr); 1861 return (ENFS_TRYAGAIN); 1862 } 1863 #endif 1864 } 1865 } while (tryagain); 1866 1867 if (status != RPC_SUCCESS) { 1868 /* 1869 * Let soft mounts use the timed out message. 1870 */ 1871 if (status == RPC_INPROGRESS) 1872 status = RPC_TIMEDOUT; 1873 nfscl->nfscl_stat.badcalls.value.ui64++; 1874 if (status == RPC_CANTDECODERES || 1875 status == RPC_PROGUNAVAIL || 1876 status == RPC_PROCUNAVAIL || 1877 status == RPC_CANTDECODEARGS || 1878 status == RPC_PROGVERSMISMATCH) 1879 CLNT_GETERR(client, &rpcerr); 1880 else if (status != RPC_INTR) { 1881 mutex_enter(&mi->mi_lock); 1882 mi->mi_flags |= MI_DOWN; 1883 mutex_exit(&mi->mi_lock); 1884 CLNT_GETERR(client, &rpcerr); 1885 #ifdef DEBUG 1886 bufp = clnt_sperror(client, svp->sv_hostname); 1887 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1888 mi->mi_vers, mi->mi_aclnames[which], bufp); 1889 if (nfs_has_ctty()) { 1890 if (!(mi->mi_flags & MI_NOPRINT)) { 1891 uprintf("NFS_ACL%d %s failed for %s\n", 1892 mi->mi_vers, mi->mi_aclnames[which], 1893 bufp); 1894 } 1895 } 1896 kmem_free(bufp, MAXPATHLEN); 1897 #else 1898 zprintf(zoneid, 1899 "NFS %s failed for server %s: error %d (%s)\n", 1900 mi->mi_aclnames[which], svp->sv_hostname, 1901 status, clnt_sperrno(status)); 1902 if (nfs_has_ctty()) { 1903 if (!(mi->mi_flags & MI_NOPRINT)) 1904 uprintf( 1905 "NFS %s failed for server %s: error %d (%s)\n", 1906 mi->mi_aclnames[which], 1907 svp->sv_hostname, status, 1908 clnt_sperrno(status)); 1909 } 1910 #endif 1911 /* 1912 * when CLNT_CALL() fails with RPC_AUTHERROR, 1913 * re_errno is set appropriately depending on 1914 * the authentication error 1915 */ 1916 if (status == RPC_VERSMISMATCH || 1917 status == RPC_PROGVERSMISMATCH) 1918 rpcerr.re_errno = EIO; 1919 } 1920 } else { 1921 /* 1922 * Test the value of mi_down and mi_printed without 1923 * holding the mi_lock mutex. If they are both zero, 1924 * then it is okay to skip the down and printed 1925 * processing. This saves on a mutex_enter and 1926 * mutex_exit pair for a normal, successful RPC. 1927 * This was just complete overhead. 1928 */ 1929 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1930 mutex_enter(&mi->mi_lock); 1931 mi->mi_flags &= ~MI_DOWN; 1932 if (mi->mi_flags & MI_PRINTED) { 1933 mi->mi_flags &= ~MI_PRINTED; 1934 mutex_exit(&mi->mi_lock); 1935 #ifdef DEBUG 1936 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1937 mi->mi_vers, svp->sv_hostname); 1938 #else 1939 zprintf(zoneid, "NFS server %s ok\n", 1940 svp->sv_hostname); 1941 #endif 1942 } else 1943 mutex_exit(&mi->mi_lock); 1944 } 1945 1946 if (*douprintf == 0) { 1947 if (!(mi->mi_flags & MI_NOPRINT)) 1948 #ifdef DEBUG 1949 uprintf("NFS_ACL%d server %s ok\n", 1950 mi->mi_vers, svp->sv_hostname); 1951 #else 1952 uprintf("NFS server %s ok\n", svp->sv_hostname); 1953 #endif 1954 *douprintf = 1; 1955 } 1956 } 1957 1958 clfree_impl(client, ch, nfscl); 1959 if (cred_cloned) 1960 crfree(cr); 1961 1962 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1963 1964 #if 0 /* notyet */ 1965 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1966 rpcerr.re_errno); 1967 #endif 1968 1969 return (rpcerr.re_errno); 1970 } 1971 1972 int 1973 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1974 { 1975 uint_t mask = vap->va_mask; 1976 1977 if (!(mask & AT_MODE)) 1978 sa->sa_mode = (uint32_t)-1; 1979 else 1980 sa->sa_mode = vap->va_mode; 1981 if (!(mask & AT_UID)) 1982 sa->sa_uid = (uint32_t)-1; 1983 else 1984 sa->sa_uid = (uint32_t)vap->va_uid; 1985 if (!(mask & AT_GID)) 1986 sa->sa_gid = (uint32_t)-1; 1987 else 1988 sa->sa_gid = (uint32_t)vap->va_gid; 1989 if (!(mask & AT_SIZE)) 1990 sa->sa_size = (uint32_t)-1; 1991 else 1992 sa->sa_size = (uint32_t)vap->va_size; 1993 if (!(mask & AT_ATIME)) 1994 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 1995 else { 1996 /* check time validity */ 1997 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1998 return (EOVERFLOW); 1999 } 2000 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2001 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2002 } 2003 if (!(mask & AT_MTIME)) 2004 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2005 else { 2006 /* check time validity */ 2007 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2008 return (EOVERFLOW); 2009 } 2010 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2011 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2012 } 2013 return (0); 2014 } 2015 2016 int 2017 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2018 { 2019 uint_t mask = vap->va_mask; 2020 2021 if (!(mask & AT_MODE)) 2022 sa->mode.set_it = FALSE; 2023 else { 2024 sa->mode.set_it = TRUE; 2025 sa->mode.mode = (mode3)vap->va_mode; 2026 } 2027 if (!(mask & AT_UID)) 2028 sa->uid.set_it = FALSE; 2029 else { 2030 sa->uid.set_it = TRUE; 2031 sa->uid.uid = (uid3)vap->va_uid; 2032 } 2033 if (!(mask & AT_GID)) 2034 sa->gid.set_it = FALSE; 2035 else { 2036 sa->gid.set_it = TRUE; 2037 sa->gid.gid = (gid3)vap->va_gid; 2038 } 2039 if (!(mask & AT_SIZE)) 2040 sa->size.set_it = FALSE; 2041 else { 2042 sa->size.set_it = TRUE; 2043 sa->size.size = (size3)vap->va_size; 2044 } 2045 if (!(mask & AT_ATIME)) 2046 sa->atime.set_it = DONT_CHANGE; 2047 else { 2048 /* check time validity */ 2049 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2050 return (EOVERFLOW); 2051 } 2052 sa->atime.set_it = SET_TO_CLIENT_TIME; 2053 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2054 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2055 } 2056 if (!(mask & AT_MTIME)) 2057 sa->mtime.set_it = DONT_CHANGE; 2058 else { 2059 /* check time validity */ 2060 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2061 return (EOVERFLOW); 2062 } 2063 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2064 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2065 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2066 } 2067 return (0); 2068 } 2069 2070 void 2071 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2072 { 2073 2074 da->da_fhandle = VTOFH(dvp); 2075 da->da_name = nm; 2076 da->da_flags = 0; 2077 } 2078 2079 void 2080 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2081 { 2082 2083 da->dirp = VTOFH3(dvp); 2084 da->name = nm; 2085 } 2086 2087 int 2088 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2089 { 2090 int error; 2091 rnode_t *rp; 2092 struct vattr va; 2093 2094 va.va_mask = AT_MODE | AT_GID; 2095 error = VOP_GETATTR(dvp, &va, 0, cr); 2096 if (error) 2097 return (error); 2098 2099 /* 2100 * To determine the expected group-id of the created file: 2101 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2102 * GRPID option, and the directory's set-gid bit is clear, 2103 * then use the process's gid. 2104 * 2) Otherwise, set the group-id to the gid of the parent directory. 2105 */ 2106 rp = VTOR(dvp); 2107 mutex_enter(&rp->r_statelock); 2108 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2109 *gidp = crgetgid(cr); 2110 else 2111 *gidp = va.va_gid; 2112 mutex_exit(&rp->r_statelock); 2113 return (0); 2114 } 2115 2116 int 2117 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2118 { 2119 int error; 2120 struct vattr va; 2121 2122 va.va_mask = AT_MODE; 2123 error = VOP_GETATTR(dvp, &va, 0, cr); 2124 if (error) 2125 return (error); 2126 2127 /* 2128 * Modify the expected mode (om) so that the set-gid bit matches 2129 * that of the parent directory (dvp). 2130 */ 2131 if (va.va_mode & VSGID) 2132 *omp |= VSGID; 2133 else 2134 *omp &= ~VSGID; 2135 return (0); 2136 } 2137 2138 void 2139 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2140 { 2141 2142 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2143 if (!(vp->v_flag & VSWAPLIKE)) { 2144 mutex_enter(&vp->v_lock); 2145 vp->v_flag |= VSWAPLIKE; 2146 mutex_exit(&vp->v_lock); 2147 } 2148 } else { 2149 if (vp->v_flag & VSWAPLIKE) { 2150 mutex_enter(&vp->v_lock); 2151 vp->v_flag &= ~VSWAPLIKE; 2152 mutex_exit(&vp->v_lock); 2153 } 2154 } 2155 } 2156 2157 /* 2158 * Free the resources associated with an rnode. 2159 */ 2160 static void 2161 rinactive(rnode_t *rp, cred_t *cr) 2162 { 2163 vnode_t *vp; 2164 cred_t *cred; 2165 char *contents; 2166 int size; 2167 vsecattr_t *vsp; 2168 int error; 2169 nfs3_pathconf_info *info; 2170 2171 /* 2172 * Before freeing anything, wait until all asynchronous 2173 * activity is done on this rnode. This will allow all 2174 * asynchronous read ahead and write behind i/o's to 2175 * finish. 2176 */ 2177 mutex_enter(&rp->r_statelock); 2178 while (rp->r_count > 0) 2179 cv_wait(&rp->r_cv, &rp->r_statelock); 2180 mutex_exit(&rp->r_statelock); 2181 2182 /* 2183 * Flush and invalidate all pages associated with the vnode. 2184 */ 2185 vp = RTOV(rp); 2186 if (vn_has_cached_data(vp)) { 2187 ASSERT(vp->v_type != VCHR); 2188 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2189 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr); 2190 if (error && (error == ENOSPC || error == EDQUOT)) { 2191 mutex_enter(&rp->r_statelock); 2192 if (!rp->r_error) 2193 rp->r_error = error; 2194 mutex_exit(&rp->r_statelock); 2195 } 2196 } 2197 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2198 } 2199 2200 /* 2201 * Free any held credentials and caches which may be associated 2202 * with this rnode. 2203 */ 2204 mutex_enter(&rp->r_statelock); 2205 cred = rp->r_cred; 2206 rp->r_cred = NULL; 2207 contents = rp->r_symlink.contents; 2208 size = rp->r_symlink.size; 2209 rp->r_symlink.contents = NULL; 2210 vsp = rp->r_secattr; 2211 rp->r_secattr = NULL; 2212 info = rp->r_pathconf; 2213 rp->r_pathconf = NULL; 2214 mutex_exit(&rp->r_statelock); 2215 2216 /* 2217 * Free the held credential. 2218 */ 2219 if (cred != NULL) 2220 crfree(cred); 2221 2222 /* 2223 * Free the access cache entries. 2224 */ 2225 (void) nfs_access_purge_rp(rp); 2226 2227 /* 2228 * Free the readdir cache entries. 2229 */ 2230 if (HAVE_RDDIR_CACHE(rp)) 2231 nfs_purge_rddir_cache(vp); 2232 2233 /* 2234 * Free the symbolic link cache. 2235 */ 2236 if (contents != NULL) { 2237 2238 kmem_free((void *)contents, size); 2239 } 2240 2241 /* 2242 * Free any cached ACL. 2243 */ 2244 if (vsp != NULL) 2245 nfs_acl_free(vsp); 2246 2247 /* 2248 * Free any cached pathconf information. 2249 */ 2250 if (info != NULL) 2251 kmem_free(info, sizeof (*info)); 2252 } 2253 2254 /* 2255 * Return a vnode for the given NFS Version 2 file handle. 2256 * If no rnode exists for this fhandle, create one and put it 2257 * into the hash queues. If the rnode for this fhandle 2258 * already exists, return it. 2259 * 2260 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2261 */ 2262 vnode_t * 2263 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2264 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2265 { 2266 int newnode; 2267 int index; 2268 vnode_t *vp; 2269 nfs_fhandle nfh; 2270 vattr_t va; 2271 2272 nfh.fh_len = NFS_FHSIZE; 2273 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2274 2275 index = rtablehash(&nfh); 2276 rw_enter(&rtable[index].r_lock, RW_READER); 2277 2278 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2279 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2280 2281 if (attr != NULL) { 2282 if (!newnode) { 2283 rw_exit(&rtable[index].r_lock); 2284 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2285 } else { 2286 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2287 vp->v_type = VBAD; 2288 else 2289 vp->v_type = n2v_type(attr); 2290 /* 2291 * A translation here seems to be necessary 2292 * because this function can be called 2293 * with `attr' that has come from the wire, 2294 * and been operated on by vattr_to_nattr(). 2295 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2296 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2297 * ->makenfsnode(). 2298 */ 2299 if ((attr->na_rdev & 0xffff0000) == 0) 2300 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2301 else 2302 vp->v_rdev = expldev(n2v_rdev(attr)); 2303 nfs_attrcache(vp, attr, t); 2304 rw_exit(&rtable[index].r_lock); 2305 } 2306 } else { 2307 if (newnode) { 2308 PURGE_ATTRCACHE(vp); 2309 } 2310 rw_exit(&rtable[index].r_lock); 2311 } 2312 2313 return (vp); 2314 } 2315 2316 /* 2317 * Return a vnode for the given NFS Version 3 file handle. 2318 * If no rnode exists for this fhandle, create one and put it 2319 * into the hash queues. If the rnode for this fhandle 2320 * already exists, return it. 2321 * 2322 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2323 */ 2324 vnode_t * 2325 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2326 cred_t *cr, char *dnm, char *nm) 2327 { 2328 int newnode; 2329 int index; 2330 vnode_t *vp; 2331 2332 index = rtablehash((nfs_fhandle *)fh); 2333 rw_enter(&rtable[index].r_lock, RW_READER); 2334 2335 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2336 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2337 dnm, nm); 2338 2339 if (vap == NULL) { 2340 if (newnode) { 2341 PURGE_ATTRCACHE(vp); 2342 } 2343 rw_exit(&rtable[index].r_lock); 2344 return (vp); 2345 } 2346 2347 if (!newnode) { 2348 rw_exit(&rtable[index].r_lock); 2349 nfs_attr_cache(vp, vap, t, cr); 2350 } else { 2351 rnode_t *rp = VTOR(vp); 2352 2353 vp->v_type = vap->va_type; 2354 vp->v_rdev = vap->va_rdev; 2355 2356 mutex_enter(&rp->r_statelock); 2357 if (rp->r_mtime <= t) 2358 nfs_attrcache_va(vp, vap); 2359 mutex_exit(&rp->r_statelock); 2360 rw_exit(&rtable[index].r_lock); 2361 } 2362 2363 return (vp); 2364 } 2365 2366 vnode_t * 2367 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2368 cred_t *cr, char *dnm, char *nm) 2369 { 2370 int newnode; 2371 int index; 2372 vnode_t *vp; 2373 vattr_t va; 2374 2375 index = rtablehash((nfs_fhandle *)fh); 2376 rw_enter(&rtable[index].r_lock, RW_READER); 2377 2378 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2379 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2380 dnm, nm); 2381 2382 if (attr == NULL) { 2383 if (newnode) { 2384 PURGE_ATTRCACHE(vp); 2385 } 2386 rw_exit(&rtable[index].r_lock); 2387 return (vp); 2388 } 2389 2390 if (!newnode) { 2391 rw_exit(&rtable[index].r_lock); 2392 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2393 } else { 2394 if (attr->type < NF3REG || attr->type > NF3FIFO) 2395 vp->v_type = VBAD; 2396 else 2397 vp->v_type = nf3_to_vt[attr->type]; 2398 vp->v_rdev = makedevice(attr->rdev.specdata1, 2399 attr->rdev.specdata2); 2400 nfs3_attrcache(vp, attr, t); 2401 rw_exit(&rtable[index].r_lock); 2402 } 2403 2404 return (vp); 2405 } 2406 2407 /* 2408 * Read this comment before making changes to rtablehash()! 2409 * This is a hash function in which seemingly obvious and harmless 2410 * changes can cause escalations costing million dollars! 2411 * Know what you are doing. 2412 * 2413 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2414 * algorithm is currently detailed here: 2415 * 2416 * http://burtleburtle.net/bob/hash/doobs.html 2417 * 2418 * Of course, the above link may not be valid by the time you are reading 2419 * this, but suffice it to say that the one-at-a-time algorithm works well in 2420 * almost all cases. If you are changing the algorithm be sure to verify that 2421 * the hash algorithm still provides even distribution in all cases and with 2422 * any server returning filehandles in whatever order (sequential or random). 2423 */ 2424 static int 2425 rtablehash(nfs_fhandle *fh) 2426 { 2427 ulong_t hash, len, i; 2428 char *key; 2429 2430 key = fh->fh_buf; 2431 len = (ulong_t)fh->fh_len; 2432 for (hash = 0, i = 0; i < len; i++) { 2433 hash += key[i]; 2434 hash += (hash << 10); 2435 hash ^= (hash >> 6); 2436 } 2437 hash += (hash << 3); 2438 hash ^= (hash >> 11); 2439 hash += (hash << 15); 2440 return (hash & rtablemask); 2441 } 2442 2443 static vnode_t * 2444 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2445 struct vnodeops *vops, 2446 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2447 int (*compar)(const void *, const void *), 2448 int *newnode, cred_t *cr, char *dnm, char *nm) 2449 { 2450 rnode_t *rp; 2451 rnode_t *trp; 2452 vnode_t *vp; 2453 mntinfo_t *mi; 2454 2455 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2456 2457 mi = VFTOMI(vfsp); 2458 start: 2459 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2460 vp = RTOV(rp); 2461 nfs_set_vroot(vp); 2462 *newnode = 0; 2463 return (vp); 2464 } 2465 rw_exit(&rhtp->r_lock); 2466 2467 mutex_enter(&rpfreelist_lock); 2468 if (rpfreelist != NULL && rnew >= nrnode) { 2469 rp = rpfreelist; 2470 rp_rmfree(rp); 2471 mutex_exit(&rpfreelist_lock); 2472 2473 vp = RTOV(rp); 2474 2475 if (rp->r_flags & RHASHED) { 2476 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2477 mutex_enter(&vp->v_lock); 2478 if (vp->v_count > 1) { 2479 vp->v_count--; 2480 mutex_exit(&vp->v_lock); 2481 rw_exit(&rp->r_hashq->r_lock); 2482 rw_enter(&rhtp->r_lock, RW_READER); 2483 goto start; 2484 } 2485 mutex_exit(&vp->v_lock); 2486 rp_rmhash_locked(rp); 2487 rw_exit(&rp->r_hashq->r_lock); 2488 } 2489 2490 rinactive(rp, cr); 2491 2492 mutex_enter(&vp->v_lock); 2493 if (vp->v_count > 1) { 2494 vp->v_count--; 2495 mutex_exit(&vp->v_lock); 2496 rw_enter(&rhtp->r_lock, RW_READER); 2497 goto start; 2498 } 2499 mutex_exit(&vp->v_lock); 2500 vn_invalid(vp); 2501 /* 2502 * destroy old locks before bzero'ing and 2503 * recreating the locks below. 2504 */ 2505 nfs_rw_destroy(&rp->r_rwlock); 2506 nfs_rw_destroy(&rp->r_lkserlock); 2507 mutex_destroy(&rp->r_statelock); 2508 cv_destroy(&rp->r_cv); 2509 cv_destroy(&rp->r_commit.c_cv); 2510 nfs_free_r_path(rp); 2511 avl_destroy(&rp->r_dir); 2512 /* 2513 * Make sure that if rnode is recycled then 2514 * VFS count is decremented properly before 2515 * reuse. 2516 */ 2517 VFS_RELE(vp->v_vfsp); 2518 vn_reinit(vp); 2519 } else { 2520 vnode_t *new_vp; 2521 2522 mutex_exit(&rpfreelist_lock); 2523 2524 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2525 new_vp = vn_alloc(KM_SLEEP); 2526 2527 atomic_add_long((ulong_t *)&rnew, 1); 2528 #ifdef DEBUG 2529 clstat_debug.nrnode.value.ui64++; 2530 #endif 2531 vp = new_vp; 2532 } 2533 2534 bzero(rp, sizeof (*rp)); 2535 rp->r_vnode = vp; 2536 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2537 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2538 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2539 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2540 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2541 rp->r_fh.fh_len = fh->fh_len; 2542 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2543 rp->r_server = mi->mi_curr_serv; 2544 if (FAILOVER_MOUNT(mi)) { 2545 /* 2546 * If replicated servers, stash pathnames 2547 */ 2548 if (dnm != NULL && nm != NULL) { 2549 char *s, *p; 2550 uint_t len; 2551 2552 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2553 rp->r_path = kmem_alloc(len, KM_SLEEP); 2554 #ifdef DEBUG 2555 clstat_debug.rpath.value.ui64 += len; 2556 #endif 2557 s = rp->r_path; 2558 for (p = dnm; *p; p++) 2559 *s++ = *p; 2560 *s++ = '/'; 2561 for (p = nm; *p; p++) 2562 *s++ = *p; 2563 *s = '\0'; 2564 } else { 2565 /* special case for root */ 2566 rp->r_path = kmem_alloc(2, KM_SLEEP); 2567 #ifdef DEBUG 2568 clstat_debug.rpath.value.ui64 += 2; 2569 #endif 2570 *rp->r_path = '.'; 2571 *(rp->r_path + 1) = '\0'; 2572 } 2573 } 2574 VFS_HOLD(vfsp); 2575 rp->r_putapage = putapage; 2576 rp->r_hashq = rhtp; 2577 rp->r_flags = RREADDIRPLUS; 2578 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2579 offsetof(rddir_cache, tree)); 2580 vn_setops(vp, vops); 2581 vp->v_data = (caddr_t)rp; 2582 vp->v_vfsp = vfsp; 2583 vp->v_type = VNON; 2584 nfs_set_vroot(vp); 2585 2586 /* 2587 * There is a race condition if someone else 2588 * alloc's the rnode while no locks are held, so we 2589 * check again and recover if found. 2590 */ 2591 rw_enter(&rhtp->r_lock, RW_WRITER); 2592 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2593 vp = RTOV(trp); 2594 nfs_set_vroot(vp); 2595 *newnode = 0; 2596 rw_exit(&rhtp->r_lock); 2597 rp_addfree(rp, cr); 2598 rw_enter(&rhtp->r_lock, RW_READER); 2599 return (vp); 2600 } 2601 rp_addhash(rp); 2602 *newnode = 1; 2603 return (vp); 2604 } 2605 2606 static void 2607 nfs_set_vroot(vnode_t *vp) 2608 { 2609 rnode_t *rp; 2610 nfs_fhandle *rootfh; 2611 2612 rp = VTOR(vp); 2613 rootfh = &rp->r_server->sv_fhandle; 2614 if (rootfh->fh_len == rp->r_fh.fh_len && 2615 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2616 if (!(vp->v_flag & VROOT)) { 2617 mutex_enter(&vp->v_lock); 2618 vp->v_flag |= VROOT; 2619 mutex_exit(&vp->v_lock); 2620 } 2621 } 2622 } 2623 2624 static void 2625 nfs_free_r_path(rnode_t *rp) 2626 { 2627 char *path; 2628 size_t len; 2629 2630 path = rp->r_path; 2631 if (path) { 2632 rp->r_path = NULL; 2633 len = strlen(path) + 1; 2634 kmem_free(path, len); 2635 #ifdef DEBUG 2636 clstat_debug.rpath.value.ui64 -= len; 2637 #endif 2638 } 2639 } 2640 2641 /* 2642 * Put an rnode on the free list. 2643 * 2644 * Rnodes which were allocated above and beyond the normal limit 2645 * are immediately freed. 2646 */ 2647 void 2648 rp_addfree(rnode_t *rp, cred_t *cr) 2649 { 2650 vnode_t *vp; 2651 struct vfs *vfsp; 2652 2653 vp = RTOV(rp); 2654 ASSERT(vp->v_count >= 1); 2655 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2656 2657 /* 2658 * If we have too many rnodes allocated and there are no 2659 * references to this rnode, or if the rnode is no longer 2660 * accessible by it does not reside in the hash queues, 2661 * or if an i/o error occurred while writing to the file, 2662 * then just free it instead of putting it on the rnode 2663 * freelist. 2664 */ 2665 vfsp = vp->v_vfsp; 2666 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || 2667 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 2668 if (rp->r_flags & RHASHED) { 2669 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2670 mutex_enter(&vp->v_lock); 2671 if (vp->v_count > 1) { 2672 vp->v_count--; 2673 mutex_exit(&vp->v_lock); 2674 rw_exit(&rp->r_hashq->r_lock); 2675 return; 2676 } 2677 mutex_exit(&vp->v_lock); 2678 rp_rmhash_locked(rp); 2679 rw_exit(&rp->r_hashq->r_lock); 2680 } 2681 2682 rinactive(rp, cr); 2683 2684 /* 2685 * Recheck the vnode reference count. We need to 2686 * make sure that another reference has not been 2687 * acquired while we were not holding v_lock. The 2688 * rnode is not in the rnode hash queues, so the 2689 * only way for a reference to have been acquired 2690 * is for a VOP_PUTPAGE because the rnode was marked 2691 * with RDIRTY or for a modified page. This 2692 * reference may have been acquired before our call 2693 * to rinactive. The i/o may have been completed, 2694 * thus allowing rinactive to complete, but the 2695 * reference to the vnode may not have been released 2696 * yet. In any case, the rnode can not be destroyed 2697 * until the other references to this vnode have been 2698 * released. The other references will take care of 2699 * either destroying the rnode or placing it on the 2700 * rnode freelist. If there are no other references, 2701 * then the rnode may be safely destroyed. 2702 */ 2703 mutex_enter(&vp->v_lock); 2704 if (vp->v_count > 1) { 2705 vp->v_count--; 2706 mutex_exit(&vp->v_lock); 2707 return; 2708 } 2709 mutex_exit(&vp->v_lock); 2710 2711 destroy_rnode(rp); 2712 return; 2713 } 2714 2715 /* 2716 * Lock the hash queue and then recheck the reference count 2717 * to ensure that no other threads have acquired a reference 2718 * to indicate that the rnode should not be placed on the 2719 * freelist. If another reference has been acquired, then 2720 * just release this one and let the other thread complete 2721 * the processing of adding this rnode to the freelist. 2722 */ 2723 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2724 2725 mutex_enter(&vp->v_lock); 2726 if (vp->v_count > 1) { 2727 vp->v_count--; 2728 mutex_exit(&vp->v_lock); 2729 rw_exit(&rp->r_hashq->r_lock); 2730 return; 2731 } 2732 mutex_exit(&vp->v_lock); 2733 2734 /* 2735 * If there is no cached data or metadata for this file, then 2736 * put the rnode on the front of the freelist so that it will 2737 * be reused before other rnodes which may have cached data or 2738 * metadata associated with them. 2739 */ 2740 mutex_enter(&rpfreelist_lock); 2741 if (rpfreelist == NULL) { 2742 rp->r_freef = rp; 2743 rp->r_freeb = rp; 2744 rpfreelist = rp; 2745 } else { 2746 rp->r_freef = rpfreelist; 2747 rp->r_freeb = rpfreelist->r_freeb; 2748 rpfreelist->r_freeb->r_freef = rp; 2749 rpfreelist->r_freeb = rp; 2750 if (!vn_has_cached_data(vp) && 2751 !HAVE_RDDIR_CACHE(rp) && 2752 rp->r_symlink.contents == NULL && 2753 rp->r_secattr == NULL && 2754 rp->r_pathconf == NULL) 2755 rpfreelist = rp; 2756 } 2757 mutex_exit(&rpfreelist_lock); 2758 2759 rw_exit(&rp->r_hashq->r_lock); 2760 } 2761 2762 /* 2763 * Remove an rnode from the free list. 2764 * 2765 * The caller must be holding rpfreelist_lock and the rnode 2766 * must be on the freelist. 2767 */ 2768 static void 2769 rp_rmfree(rnode_t *rp) 2770 { 2771 2772 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2773 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2774 2775 if (rp == rpfreelist) { 2776 rpfreelist = rp->r_freef; 2777 if (rp == rpfreelist) 2778 rpfreelist = NULL; 2779 } 2780 2781 rp->r_freeb->r_freef = rp->r_freef; 2782 rp->r_freef->r_freeb = rp->r_freeb; 2783 2784 rp->r_freef = rp->r_freeb = NULL; 2785 } 2786 2787 /* 2788 * Put a rnode in the hash table. 2789 * 2790 * The caller must be holding the exclusive hash queue lock. 2791 */ 2792 static void 2793 rp_addhash(rnode_t *rp) 2794 { 2795 2796 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2797 ASSERT(!(rp->r_flags & RHASHED)); 2798 2799 rp->r_hashf = rp->r_hashq->r_hashf; 2800 rp->r_hashq->r_hashf = rp; 2801 rp->r_hashb = (rnode_t *)rp->r_hashq; 2802 rp->r_hashf->r_hashb = rp; 2803 2804 mutex_enter(&rp->r_statelock); 2805 rp->r_flags |= RHASHED; 2806 mutex_exit(&rp->r_statelock); 2807 } 2808 2809 /* 2810 * Remove a rnode from the hash table. 2811 * 2812 * The caller must be holding the hash queue lock. 2813 */ 2814 static void 2815 rp_rmhash_locked(rnode_t *rp) 2816 { 2817 2818 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2819 ASSERT(rp->r_flags & RHASHED); 2820 2821 rp->r_hashb->r_hashf = rp->r_hashf; 2822 rp->r_hashf->r_hashb = rp->r_hashb; 2823 2824 mutex_enter(&rp->r_statelock); 2825 rp->r_flags &= ~RHASHED; 2826 mutex_exit(&rp->r_statelock); 2827 } 2828 2829 /* 2830 * Remove a rnode from the hash table. 2831 * 2832 * The caller must not be holding the hash queue lock. 2833 */ 2834 void 2835 rp_rmhash(rnode_t *rp) 2836 { 2837 2838 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2839 rp_rmhash_locked(rp); 2840 rw_exit(&rp->r_hashq->r_lock); 2841 } 2842 2843 /* 2844 * Lookup a rnode by fhandle. 2845 * 2846 * The caller must be holding the hash queue lock, either shared or exclusive. 2847 */ 2848 static rnode_t * 2849 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2850 { 2851 rnode_t *rp; 2852 vnode_t *vp; 2853 2854 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2855 2856 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2857 vp = RTOV(rp); 2858 if (vp->v_vfsp == vfsp && 2859 rp->r_fh.fh_len == fh->fh_len && 2860 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2861 /* 2862 * remove rnode from free list, if necessary. 2863 */ 2864 if (rp->r_freef != NULL) { 2865 mutex_enter(&rpfreelist_lock); 2866 /* 2867 * If the rnode is on the freelist, 2868 * then remove it and use that reference 2869 * as the new reference. Otherwise, 2870 * need to increment the reference count. 2871 */ 2872 if (rp->r_freef != NULL) { 2873 rp_rmfree(rp); 2874 mutex_exit(&rpfreelist_lock); 2875 } else { 2876 mutex_exit(&rpfreelist_lock); 2877 VN_HOLD(vp); 2878 } 2879 } else 2880 VN_HOLD(vp); 2881 return (rp); 2882 } 2883 } 2884 return (NULL); 2885 } 2886 2887 /* 2888 * Return 1 if there is a active vnode belonging to this vfs in the 2889 * rtable cache. 2890 * 2891 * Several of these checks are done without holding the usual 2892 * locks. This is safe because destroy_rtable(), rp_addfree(), 2893 * etc. will redo the necessary checks before actually destroying 2894 * any rnodes. 2895 */ 2896 int 2897 check_rtable(struct vfs *vfsp) 2898 { 2899 int index; 2900 rnode_t *rp; 2901 vnode_t *vp; 2902 2903 for (index = 0; index < rtablesize; index++) { 2904 rw_enter(&rtable[index].r_lock, RW_READER); 2905 for (rp = rtable[index].r_hashf; 2906 rp != (rnode_t *)(&rtable[index]); 2907 rp = rp->r_hashf) { 2908 vp = RTOV(rp); 2909 if (vp->v_vfsp == vfsp) { 2910 if (rp->r_freef == NULL || 2911 (vn_has_cached_data(vp) && 2912 (rp->r_flags & RDIRTY)) || 2913 rp->r_count > 0) { 2914 rw_exit(&rtable[index].r_lock); 2915 return (1); 2916 } 2917 } 2918 } 2919 rw_exit(&rtable[index].r_lock); 2920 } 2921 return (0); 2922 } 2923 2924 /* 2925 * Destroy inactive vnodes from the hash queues which belong to this 2926 * vfs. It is essential that we destroy all inactive vnodes during a 2927 * forced unmount as well as during a normal unmount. 2928 */ 2929 void 2930 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2931 { 2932 int index; 2933 rnode_t *rp; 2934 rnode_t *rlist; 2935 rnode_t *r_hashf; 2936 vnode_t *vp; 2937 2938 rlist = NULL; 2939 2940 for (index = 0; index < rtablesize; index++) { 2941 rw_enter(&rtable[index].r_lock, RW_WRITER); 2942 for (rp = rtable[index].r_hashf; 2943 rp != (rnode_t *)(&rtable[index]); 2944 rp = r_hashf) { 2945 /* save the hash pointer before destroying */ 2946 r_hashf = rp->r_hashf; 2947 vp = RTOV(rp); 2948 if (vp->v_vfsp == vfsp) { 2949 mutex_enter(&rpfreelist_lock); 2950 if (rp->r_freef != NULL) { 2951 rp_rmfree(rp); 2952 mutex_exit(&rpfreelist_lock); 2953 rp_rmhash_locked(rp); 2954 rp->r_hashf = rlist; 2955 rlist = rp; 2956 } else 2957 mutex_exit(&rpfreelist_lock); 2958 } 2959 } 2960 rw_exit(&rtable[index].r_lock); 2961 } 2962 2963 for (rp = rlist; rp != NULL; rp = rlist) { 2964 rlist = rp->r_hashf; 2965 /* 2966 * This call to rp_addfree will end up destroying the 2967 * rnode, but in a safe way with the appropriate set 2968 * of checks done. 2969 */ 2970 rp_addfree(rp, cr); 2971 } 2972 2973 } 2974 2975 /* 2976 * This routine destroys all the resources associated with the rnode 2977 * and then the rnode itself. 2978 */ 2979 static void 2980 destroy_rnode(rnode_t *rp) 2981 { 2982 vnode_t *vp; 2983 vfs_t *vfsp; 2984 2985 vp = RTOV(rp); 2986 vfsp = vp->v_vfsp; 2987 2988 ASSERT(vp->v_count == 1); 2989 ASSERT(rp->r_count == 0); 2990 ASSERT(rp->r_lmpl == NULL); 2991 ASSERT(rp->r_mapcnt == 0); 2992 ASSERT(!(rp->r_flags & RHASHED)); 2993 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2994 atomic_add_long((ulong_t *)&rnew, -1); 2995 #ifdef DEBUG 2996 clstat_debug.nrnode.value.ui64--; 2997 #endif 2998 nfs_rw_destroy(&rp->r_rwlock); 2999 nfs_rw_destroy(&rp->r_lkserlock); 3000 mutex_destroy(&rp->r_statelock); 3001 cv_destroy(&rp->r_cv); 3002 cv_destroy(&rp->r_commit.c_cv); 3003 if (rp->r_flags & RDELMAPLIST) 3004 list_destroy(&rp->r_indelmap); 3005 nfs_free_r_path(rp); 3006 avl_destroy(&rp->r_dir); 3007 vn_invalid(vp); 3008 vn_free(vp); 3009 kmem_cache_free(rnode_cache, rp); 3010 VFS_RELE(vfsp); 3011 } 3012 3013 /* 3014 * Flush all vnodes in this (or every) vfs. 3015 * Used by nfs_sync and by nfs_unmount. 3016 */ 3017 void 3018 rflush(struct vfs *vfsp, cred_t *cr) 3019 { 3020 int index; 3021 rnode_t *rp; 3022 vnode_t *vp, **vplist; 3023 long num, cnt; 3024 3025 /* 3026 * Check to see whether there is anything to do. 3027 */ 3028 num = rnew; 3029 if (num == 0) 3030 return; 3031 3032 /* 3033 * Allocate a slot for all currently active rnodes on the 3034 * supposition that they all may need flushing. 3035 */ 3036 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3037 cnt = 0; 3038 3039 /* 3040 * Walk the hash queues looking for rnodes with page 3041 * lists associated with them. Make a list of these 3042 * files. 3043 */ 3044 for (index = 0; index < rtablesize; index++) { 3045 rw_enter(&rtable[index].r_lock, RW_READER); 3046 for (rp = rtable[index].r_hashf; 3047 rp != (rnode_t *)(&rtable[index]); 3048 rp = rp->r_hashf) { 3049 vp = RTOV(rp); 3050 /* 3051 * Don't bother sync'ing a vp if it 3052 * is part of virtual swap device or 3053 * if VFS is read-only 3054 */ 3055 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3056 continue; 3057 /* 3058 * If flushing all mounted file systems or 3059 * the vnode belongs to this vfs, has pages 3060 * and is marked as either dirty or mmap'd, 3061 * hold and add this vnode to the list of 3062 * vnodes to flush. 3063 */ 3064 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3065 vn_has_cached_data(vp) && 3066 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3067 VN_HOLD(vp); 3068 vplist[cnt++] = vp; 3069 if (cnt == num) { 3070 rw_exit(&rtable[index].r_lock); 3071 goto toomany; 3072 } 3073 } 3074 } 3075 rw_exit(&rtable[index].r_lock); 3076 } 3077 toomany: 3078 3079 /* 3080 * Flush and release all of the files on the list. 3081 */ 3082 while (cnt-- > 0) { 3083 vp = vplist[cnt]; 3084 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr); 3085 VN_RELE(vp); 3086 } 3087 3088 /* 3089 * Free the space allocated to hold the list. 3090 */ 3091 kmem_free(vplist, num * sizeof (*vplist)); 3092 } 3093 3094 /* 3095 * This probably needs to be larger than or equal to 3096 * log2(sizeof (struct rnode)) due to the way that rnodes are 3097 * allocated. 3098 */ 3099 #define ACACHE_SHIFT_BITS 9 3100 3101 static int 3102 acachehash(rnode_t *rp, cred_t *cr) 3103 { 3104 3105 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3106 acachemask); 3107 } 3108 3109 #ifdef DEBUG 3110 static long nfs_access_cache_hits = 0; 3111 static long nfs_access_cache_misses = 0; 3112 #endif 3113 3114 nfs_access_type_t 3115 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3116 { 3117 vnode_t *vp; 3118 acache_t *ap; 3119 acache_hash_t *hp; 3120 nfs_access_type_t all; 3121 3122 vp = RTOV(rp); 3123 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3124 return (NFS_ACCESS_UNKNOWN); 3125 3126 if (rp->r_acache != NULL) { 3127 hp = &acache[acachehash(rp, cr)]; 3128 rw_enter(&hp->lock, RW_READER); 3129 ap = hp->next; 3130 while (ap != (acache_t *)hp) { 3131 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3132 if ((ap->known & acc) == acc) { 3133 #ifdef DEBUG 3134 nfs_access_cache_hits++; 3135 #endif 3136 if ((ap->allowed & acc) == acc) 3137 all = NFS_ACCESS_ALLOWED; 3138 else 3139 all = NFS_ACCESS_DENIED; 3140 } else { 3141 #ifdef DEBUG 3142 nfs_access_cache_misses++; 3143 #endif 3144 all = NFS_ACCESS_UNKNOWN; 3145 } 3146 rw_exit(&hp->lock); 3147 return (all); 3148 } 3149 ap = ap->next; 3150 } 3151 rw_exit(&hp->lock); 3152 } 3153 3154 #ifdef DEBUG 3155 nfs_access_cache_misses++; 3156 #endif 3157 return (NFS_ACCESS_UNKNOWN); 3158 } 3159 3160 void 3161 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3162 { 3163 acache_t *ap; 3164 acache_t *nap; 3165 acache_hash_t *hp; 3166 3167 hp = &acache[acachehash(rp, cr)]; 3168 3169 /* 3170 * Allocate now assuming that mostly an allocation will be 3171 * required. This allows the allocation to happen without 3172 * holding the hash bucket locked. 3173 */ 3174 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3175 if (nap != NULL) { 3176 nap->known = acc; 3177 nap->allowed = resacc; 3178 nap->rnode = rp; 3179 crhold(cr); 3180 nap->cred = cr; 3181 nap->hashq = hp; 3182 } 3183 3184 rw_enter(&hp->lock, RW_WRITER); 3185 3186 if (rp->r_acache != NULL) { 3187 ap = hp->next; 3188 while (ap != (acache_t *)hp) { 3189 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3190 ap->known |= acc; 3191 ap->allowed &= ~acc; 3192 ap->allowed |= resacc; 3193 rw_exit(&hp->lock); 3194 if (nap != NULL) { 3195 crfree(nap->cred); 3196 kmem_cache_free(acache_cache, nap); 3197 } 3198 return; 3199 } 3200 ap = ap->next; 3201 } 3202 } 3203 3204 if (nap != NULL) { 3205 #ifdef DEBUG 3206 clstat_debug.access.value.ui64++; 3207 #endif 3208 nap->next = hp->next; 3209 hp->next = nap; 3210 nap->next->prev = nap; 3211 nap->prev = (acache_t *)hp; 3212 3213 mutex_enter(&rp->r_statelock); 3214 nap->list = rp->r_acache; 3215 rp->r_acache = nap; 3216 mutex_exit(&rp->r_statelock); 3217 } 3218 3219 rw_exit(&hp->lock); 3220 } 3221 3222 int 3223 nfs_access_purge_rp(rnode_t *rp) 3224 { 3225 acache_t *ap; 3226 acache_t *tmpap; 3227 acache_t *rplist; 3228 3229 /* 3230 * If there aren't any cached entries, then there is nothing 3231 * to free. 3232 */ 3233 if (rp->r_acache == NULL) 3234 return (0); 3235 3236 mutex_enter(&rp->r_statelock); 3237 rplist = rp->r_acache; 3238 rp->r_acache = NULL; 3239 mutex_exit(&rp->r_statelock); 3240 3241 /* 3242 * Loop through each entry in the list pointed to in the 3243 * rnode. Remove each of these entries from the hash 3244 * queue that it is on and remove it from the list in 3245 * the rnode. 3246 */ 3247 for (ap = rplist; ap != NULL; ap = tmpap) { 3248 rw_enter(&ap->hashq->lock, RW_WRITER); 3249 ap->prev->next = ap->next; 3250 ap->next->prev = ap->prev; 3251 rw_exit(&ap->hashq->lock); 3252 3253 tmpap = ap->list; 3254 crfree(ap->cred); 3255 kmem_cache_free(acache_cache, ap); 3256 #ifdef DEBUG 3257 clstat_debug.access.value.ui64--; 3258 #endif 3259 } 3260 3261 return (1); 3262 } 3263 3264 static const char prefix[] = ".nfs"; 3265 3266 static kmutex_t newnum_lock; 3267 3268 int 3269 newnum(void) 3270 { 3271 static uint_t newnum = 0; 3272 uint_t id; 3273 3274 mutex_enter(&newnum_lock); 3275 if (newnum == 0) 3276 newnum = gethrestime_sec() & 0xffff; 3277 id = newnum++; 3278 mutex_exit(&newnum_lock); 3279 return (id); 3280 } 3281 3282 char * 3283 newname(void) 3284 { 3285 char *news; 3286 char *s; 3287 const char *p; 3288 uint_t id; 3289 3290 id = newnum(); 3291 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3292 s = news; 3293 p = prefix; 3294 while (*p != '\0') 3295 *s++ = *p++; 3296 while (id != 0) { 3297 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3298 id >>= 4; 3299 } 3300 *s = '\0'; 3301 return (news); 3302 } 3303 3304 int 3305 nfs_atoi(char *cp) 3306 { 3307 int n; 3308 3309 n = 0; 3310 while (*cp != '\0') { 3311 n = n * 10 + (*cp - '0'); 3312 cp++; 3313 } 3314 3315 return (n); 3316 } 3317 3318 /* 3319 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3320 * framework. 3321 */ 3322 static int 3323 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3324 { 3325 ksp->ks_snaptime = gethrtime(); 3326 if (rw == KSTAT_WRITE) { 3327 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3328 #ifdef DEBUG 3329 /* 3330 * Currently only the global zone can write to kstats, but we 3331 * add the check just for paranoia. 3332 */ 3333 if (INGLOBALZONE(curproc)) 3334 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3335 sizeof (clstat_debug)); 3336 #endif 3337 } else { 3338 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3339 #ifdef DEBUG 3340 /* 3341 * If we're displaying the "global" debug kstat values, we 3342 * display them as-is to all zones since in fact they apply to 3343 * the system as a whole. 3344 */ 3345 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3346 sizeof (clstat_debug)); 3347 #endif 3348 } 3349 return (0); 3350 } 3351 3352 static void * 3353 clinit_zone(zoneid_t zoneid) 3354 { 3355 kstat_t *nfs_client_kstat; 3356 struct nfs_clnt *nfscl; 3357 uint_t ndata; 3358 3359 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3360 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3361 nfscl->nfscl_chtable = NULL; 3362 nfscl->nfscl_zoneid = zoneid; 3363 3364 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3365 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3366 #ifdef DEBUG 3367 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3368 #endif 3369 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3370 "misc", KSTAT_TYPE_NAMED, ndata, 3371 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3372 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3373 nfs_client_kstat->ks_snapshot = cl_snapshot; 3374 kstat_install(nfs_client_kstat); 3375 } 3376 mutex_enter(&nfs_clnt_list_lock); 3377 list_insert_head(&nfs_clnt_list, nfscl); 3378 mutex_exit(&nfs_clnt_list_lock); 3379 return (nfscl); 3380 } 3381 3382 /*ARGSUSED*/ 3383 static void 3384 clfini_zone(zoneid_t zoneid, void *arg) 3385 { 3386 struct nfs_clnt *nfscl = arg; 3387 chhead_t *chp, *next; 3388 3389 if (nfscl == NULL) 3390 return; 3391 mutex_enter(&nfs_clnt_list_lock); 3392 list_remove(&nfs_clnt_list, nfscl); 3393 mutex_exit(&nfs_clnt_list_lock); 3394 clreclaim_zone(nfscl, 0); 3395 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3396 ASSERT(chp->ch_list == NULL); 3397 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3398 next = chp->ch_next; 3399 kmem_free(chp, sizeof (*chp)); 3400 } 3401 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3402 mutex_destroy(&nfscl->nfscl_chtable_lock); 3403 kmem_free(nfscl, sizeof (*nfscl)); 3404 } 3405 3406 /* 3407 * Called by endpnt_destructor to make sure the client handles are 3408 * cleaned up before the RPC endpoints. This becomes a no-op if 3409 * clfini_zone (above) is called first. This function is needed 3410 * (rather than relying on clfini_zone to clean up) because the ZSD 3411 * callbacks have no ordering mechanism, so we have no way to ensure 3412 * that clfini_zone is called before endpnt_destructor. 3413 */ 3414 void 3415 clcleanup_zone(zoneid_t zoneid) 3416 { 3417 struct nfs_clnt *nfscl; 3418 3419 mutex_enter(&nfs_clnt_list_lock); 3420 nfscl = list_head(&nfs_clnt_list); 3421 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3422 if (nfscl->nfscl_zoneid == zoneid) { 3423 clreclaim_zone(nfscl, 0); 3424 break; 3425 } 3426 } 3427 mutex_exit(&nfs_clnt_list_lock); 3428 } 3429 3430 int 3431 nfs_subrinit(void) 3432 { 3433 int i; 3434 ulong_t nrnode_max; 3435 3436 /* 3437 * Allocate and initialize the rnode hash queues 3438 */ 3439 if (nrnode <= 0) 3440 nrnode = ncsize; 3441 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3442 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3443 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3444 "setting nrnode to max value of %ld", nrnode_max); 3445 nrnode = nrnode_max; 3446 } 3447 3448 rtablesize = 1 << highbit(nrnode / hashlen); 3449 rtablemask = rtablesize - 1; 3450 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3451 for (i = 0; i < rtablesize; i++) { 3452 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3453 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3454 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3455 } 3456 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3457 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3458 3459 /* 3460 * Allocate and initialize the access cache 3461 */ 3462 3463 /* 3464 * Initial guess is one access cache entry per rnode unless 3465 * nacache is set to a non-zero value and then it is used to 3466 * indicate a guess at the number of access cache entries. 3467 */ 3468 if (nacache > 0) 3469 acachesize = 1 << highbit(nacache / hashlen); 3470 else 3471 acachesize = rtablesize; 3472 acachemask = acachesize - 1; 3473 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3474 for (i = 0; i < acachesize; i++) { 3475 acache[i].next = (acache_t *)&acache[i]; 3476 acache[i].prev = (acache_t *)&acache[i]; 3477 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3478 } 3479 acache_cache = kmem_cache_create("nfs_access_cache", 3480 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3481 /* 3482 * Allocate and initialize the client handle cache 3483 */ 3484 chtab_cache = kmem_cache_create("client_handle_cache", 3485 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3486 /* 3487 * Initialize the list of per-zone client handles (and associated data). 3488 * This needs to be done before we call zone_key_create(). 3489 */ 3490 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3491 offsetof(struct nfs_clnt, nfscl_node)); 3492 /* 3493 * Initialize the zone_key for per-zone client handle lists. 3494 */ 3495 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3496 /* 3497 * Initialize the various mutexes and reader/writer locks 3498 */ 3499 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3500 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3501 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3502 3503 /* 3504 * Assign unique major number for all nfs mounts 3505 */ 3506 if ((nfs_major = getudev()) == -1) { 3507 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3508 "nfs: init: can't get unique device number"); 3509 nfs_major = 0; 3510 } 3511 nfs_minor = 0; 3512 3513 if (nfs3_jukebox_delay == 0) 3514 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3515 3516 return (0); 3517 } 3518 3519 void 3520 nfs_subrfini(void) 3521 { 3522 int i; 3523 3524 /* 3525 * Deallocate the rnode hash queues 3526 */ 3527 kmem_cache_destroy(rnode_cache); 3528 3529 for (i = 0; i < rtablesize; i++) 3530 rw_destroy(&rtable[i].r_lock); 3531 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3532 3533 /* 3534 * Deallocated the access cache 3535 */ 3536 kmem_cache_destroy(acache_cache); 3537 3538 for (i = 0; i < acachesize; i++) 3539 rw_destroy(&acache[i].lock); 3540 kmem_free(acache, acachesize * sizeof (*acache)); 3541 3542 /* 3543 * Deallocate the client handle cache 3544 */ 3545 kmem_cache_destroy(chtab_cache); 3546 3547 /* 3548 * Destroy the various mutexes and reader/writer locks 3549 */ 3550 mutex_destroy(&rpfreelist_lock); 3551 mutex_destroy(&newnum_lock); 3552 mutex_destroy(&nfs_minor_lock); 3553 (void) zone_key_delete(nfsclnt_zone_key); 3554 } 3555 3556 enum nfsstat 3557 puterrno(int error) 3558 { 3559 3560 switch (error) { 3561 case EOPNOTSUPP: 3562 return (NFSERR_OPNOTSUPP); 3563 case ENAMETOOLONG: 3564 return (NFSERR_NAMETOOLONG); 3565 case ENOTEMPTY: 3566 return (NFSERR_NOTEMPTY); 3567 case EDQUOT: 3568 return (NFSERR_DQUOT); 3569 case ESTALE: 3570 return (NFSERR_STALE); 3571 case EREMOTE: 3572 return (NFSERR_REMOTE); 3573 case ENOSYS: 3574 return (NFSERR_OPNOTSUPP); 3575 case EOVERFLOW: 3576 return (NFSERR_INVAL); 3577 default: 3578 return ((enum nfsstat)error); 3579 } 3580 /* NOTREACHED */ 3581 } 3582 3583 int 3584 geterrno(enum nfsstat status) 3585 { 3586 3587 switch (status) { 3588 case NFSERR_OPNOTSUPP: 3589 return (EOPNOTSUPP); 3590 case NFSERR_NAMETOOLONG: 3591 return (ENAMETOOLONG); 3592 case NFSERR_NOTEMPTY: 3593 return (ENOTEMPTY); 3594 case NFSERR_DQUOT: 3595 return (EDQUOT); 3596 case NFSERR_STALE: 3597 return (ESTALE); 3598 case NFSERR_REMOTE: 3599 return (EREMOTE); 3600 case NFSERR_WFLUSH: 3601 return (EIO); 3602 default: 3603 return ((int)status); 3604 } 3605 /* NOTREACHED */ 3606 } 3607 3608 enum nfsstat3 3609 puterrno3(int error) 3610 { 3611 3612 #ifdef DEBUG 3613 switch (error) { 3614 case 0: 3615 return (NFS3_OK); 3616 case EPERM: 3617 return (NFS3ERR_PERM); 3618 case ENOENT: 3619 return (NFS3ERR_NOENT); 3620 case EIO: 3621 return (NFS3ERR_IO); 3622 case ENXIO: 3623 return (NFS3ERR_NXIO); 3624 case EACCES: 3625 return (NFS3ERR_ACCES); 3626 case EEXIST: 3627 return (NFS3ERR_EXIST); 3628 case EXDEV: 3629 return (NFS3ERR_XDEV); 3630 case ENODEV: 3631 return (NFS3ERR_NODEV); 3632 case ENOTDIR: 3633 return (NFS3ERR_NOTDIR); 3634 case EISDIR: 3635 return (NFS3ERR_ISDIR); 3636 case EINVAL: 3637 return (NFS3ERR_INVAL); 3638 case EFBIG: 3639 return (NFS3ERR_FBIG); 3640 case ENOSPC: 3641 return (NFS3ERR_NOSPC); 3642 case EROFS: 3643 return (NFS3ERR_ROFS); 3644 case EMLINK: 3645 return (NFS3ERR_MLINK); 3646 case ENAMETOOLONG: 3647 return (NFS3ERR_NAMETOOLONG); 3648 case ENOTEMPTY: 3649 return (NFS3ERR_NOTEMPTY); 3650 case EDQUOT: 3651 return (NFS3ERR_DQUOT); 3652 case ESTALE: 3653 return (NFS3ERR_STALE); 3654 case EREMOTE: 3655 return (NFS3ERR_REMOTE); 3656 case ENOSYS: 3657 case EOPNOTSUPP: 3658 return (NFS3ERR_NOTSUPP); 3659 case EOVERFLOW: 3660 return (NFS3ERR_INVAL); 3661 default: 3662 zcmn_err(getzoneid(), CE_WARN, 3663 "puterrno3: got error %d", error); 3664 return ((enum nfsstat3)error); 3665 } 3666 #else 3667 switch (error) { 3668 case ENAMETOOLONG: 3669 return (NFS3ERR_NAMETOOLONG); 3670 case ENOTEMPTY: 3671 return (NFS3ERR_NOTEMPTY); 3672 case EDQUOT: 3673 return (NFS3ERR_DQUOT); 3674 case ESTALE: 3675 return (NFS3ERR_STALE); 3676 case ENOSYS: 3677 case EOPNOTSUPP: 3678 return (NFS3ERR_NOTSUPP); 3679 case EREMOTE: 3680 return (NFS3ERR_REMOTE); 3681 case EOVERFLOW: 3682 return (NFS3ERR_INVAL); 3683 default: 3684 return ((enum nfsstat3)error); 3685 } 3686 #endif 3687 } 3688 3689 int 3690 geterrno3(enum nfsstat3 status) 3691 { 3692 3693 #ifdef DEBUG 3694 switch (status) { 3695 case NFS3_OK: 3696 return (0); 3697 case NFS3ERR_PERM: 3698 return (EPERM); 3699 case NFS3ERR_NOENT: 3700 return (ENOENT); 3701 case NFS3ERR_IO: 3702 return (EIO); 3703 case NFS3ERR_NXIO: 3704 return (ENXIO); 3705 case NFS3ERR_ACCES: 3706 return (EACCES); 3707 case NFS3ERR_EXIST: 3708 return (EEXIST); 3709 case NFS3ERR_XDEV: 3710 return (EXDEV); 3711 case NFS3ERR_NODEV: 3712 return (ENODEV); 3713 case NFS3ERR_NOTDIR: 3714 return (ENOTDIR); 3715 case NFS3ERR_ISDIR: 3716 return (EISDIR); 3717 case NFS3ERR_INVAL: 3718 return (EINVAL); 3719 case NFS3ERR_FBIG: 3720 return (EFBIG); 3721 case NFS3ERR_NOSPC: 3722 return (ENOSPC); 3723 case NFS3ERR_ROFS: 3724 return (EROFS); 3725 case NFS3ERR_MLINK: 3726 return (EMLINK); 3727 case NFS3ERR_NAMETOOLONG: 3728 return (ENAMETOOLONG); 3729 case NFS3ERR_NOTEMPTY: 3730 return (ENOTEMPTY); 3731 case NFS3ERR_DQUOT: 3732 return (EDQUOT); 3733 case NFS3ERR_STALE: 3734 return (ESTALE); 3735 case NFS3ERR_REMOTE: 3736 return (EREMOTE); 3737 case NFS3ERR_BADHANDLE: 3738 return (ESTALE); 3739 case NFS3ERR_NOT_SYNC: 3740 return (EINVAL); 3741 case NFS3ERR_BAD_COOKIE: 3742 return (ENOENT); 3743 case NFS3ERR_NOTSUPP: 3744 return (EOPNOTSUPP); 3745 case NFS3ERR_TOOSMALL: 3746 return (EINVAL); 3747 case NFS3ERR_SERVERFAULT: 3748 return (EIO); 3749 case NFS3ERR_BADTYPE: 3750 return (EINVAL); 3751 case NFS3ERR_JUKEBOX: 3752 return (ENXIO); 3753 default: 3754 zcmn_err(getzoneid(), CE_WARN, 3755 "geterrno3: got status %d", status); 3756 return ((int)status); 3757 } 3758 #else 3759 switch (status) { 3760 case NFS3ERR_NAMETOOLONG: 3761 return (ENAMETOOLONG); 3762 case NFS3ERR_NOTEMPTY: 3763 return (ENOTEMPTY); 3764 case NFS3ERR_DQUOT: 3765 return (EDQUOT); 3766 case NFS3ERR_STALE: 3767 case NFS3ERR_BADHANDLE: 3768 return (ESTALE); 3769 case NFS3ERR_NOTSUPP: 3770 return (EOPNOTSUPP); 3771 case NFS3ERR_REMOTE: 3772 return (EREMOTE); 3773 case NFS3ERR_NOT_SYNC: 3774 case NFS3ERR_TOOSMALL: 3775 case NFS3ERR_BADTYPE: 3776 return (EINVAL); 3777 case NFS3ERR_BAD_COOKIE: 3778 return (ENOENT); 3779 case NFS3ERR_SERVERFAULT: 3780 return (EIO); 3781 case NFS3ERR_JUKEBOX: 3782 return (ENXIO); 3783 default: 3784 return ((int)status); 3785 } 3786 #endif 3787 } 3788 3789 rddir_cache * 3790 rddir_cache_alloc(int flags) 3791 { 3792 rddir_cache *rc; 3793 3794 rc = kmem_alloc(sizeof (*rc), flags); 3795 if (rc != NULL) { 3796 rc->entries = NULL; 3797 rc->flags = RDDIR; 3798 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3799 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3800 rc->count = 1; 3801 #ifdef DEBUG 3802 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3803 #endif 3804 } 3805 return (rc); 3806 } 3807 3808 static void 3809 rddir_cache_free(rddir_cache *rc) 3810 { 3811 3812 #ifdef DEBUG 3813 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3814 #endif 3815 if (rc->entries != NULL) { 3816 #ifdef DEBUG 3817 rddir_cache_buf_free(rc->entries, rc->buflen); 3818 #else 3819 kmem_free(rc->entries, rc->buflen); 3820 #endif 3821 } 3822 cv_destroy(&rc->cv); 3823 mutex_destroy(&rc->lock); 3824 kmem_free(rc, sizeof (*rc)); 3825 } 3826 3827 void 3828 rddir_cache_hold(rddir_cache *rc) 3829 { 3830 3831 mutex_enter(&rc->lock); 3832 rc->count++; 3833 mutex_exit(&rc->lock); 3834 } 3835 3836 void 3837 rddir_cache_rele(rddir_cache *rc) 3838 { 3839 3840 mutex_enter(&rc->lock); 3841 ASSERT(rc->count > 0); 3842 if (--rc->count == 0) { 3843 mutex_exit(&rc->lock); 3844 rddir_cache_free(rc); 3845 } else 3846 mutex_exit(&rc->lock); 3847 } 3848 3849 #ifdef DEBUG 3850 char * 3851 rddir_cache_buf_alloc(size_t size, int flags) 3852 { 3853 char *rc; 3854 3855 rc = kmem_alloc(size, flags); 3856 if (rc != NULL) 3857 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3858 return (rc); 3859 } 3860 3861 void 3862 rddir_cache_buf_free(void *addr, size_t size) 3863 { 3864 3865 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3866 kmem_free(addr, size); 3867 } 3868 #endif 3869 3870 static int 3871 nfs_free_data_reclaim(rnode_t *rp) 3872 { 3873 char *contents; 3874 int size; 3875 vsecattr_t *vsp; 3876 nfs3_pathconf_info *info; 3877 int freed; 3878 cred_t *cred; 3879 3880 /* 3881 * Free any held credentials and caches which 3882 * may be associated with this rnode. 3883 */ 3884 mutex_enter(&rp->r_statelock); 3885 cred = rp->r_cred; 3886 rp->r_cred = NULL; 3887 contents = rp->r_symlink.contents; 3888 size = rp->r_symlink.size; 3889 rp->r_symlink.contents = NULL; 3890 vsp = rp->r_secattr; 3891 rp->r_secattr = NULL; 3892 info = rp->r_pathconf; 3893 rp->r_pathconf = NULL; 3894 mutex_exit(&rp->r_statelock); 3895 3896 if (cred != NULL) 3897 crfree(cred); 3898 3899 /* 3900 * Free the access cache entries. 3901 */ 3902 freed = nfs_access_purge_rp(rp); 3903 3904 if (!HAVE_RDDIR_CACHE(rp) && 3905 contents == NULL && 3906 vsp == NULL && 3907 info == NULL) 3908 return (freed); 3909 3910 /* 3911 * Free the readdir cache entries 3912 */ 3913 if (HAVE_RDDIR_CACHE(rp)) 3914 nfs_purge_rddir_cache(RTOV(rp)); 3915 3916 /* 3917 * Free the symbolic link cache. 3918 */ 3919 if (contents != NULL) { 3920 3921 kmem_free((void *)contents, size); 3922 } 3923 3924 /* 3925 * Free any cached ACL. 3926 */ 3927 if (vsp != NULL) 3928 nfs_acl_free(vsp); 3929 3930 /* 3931 * Free any cached pathconf information. 3932 */ 3933 if (info != NULL) 3934 kmem_free(info, sizeof (*info)); 3935 3936 return (1); 3937 } 3938 3939 static int 3940 nfs_active_data_reclaim(rnode_t *rp) 3941 { 3942 char *contents; 3943 int size; 3944 vsecattr_t *vsp; 3945 nfs3_pathconf_info *info; 3946 int freed; 3947 3948 /* 3949 * Free any held credentials and caches which 3950 * may be associated with this rnode. 3951 */ 3952 if (!mutex_tryenter(&rp->r_statelock)) 3953 return (0); 3954 contents = rp->r_symlink.contents; 3955 size = rp->r_symlink.size; 3956 rp->r_symlink.contents = NULL; 3957 vsp = rp->r_secattr; 3958 rp->r_secattr = NULL; 3959 info = rp->r_pathconf; 3960 rp->r_pathconf = NULL; 3961 mutex_exit(&rp->r_statelock); 3962 3963 /* 3964 * Free the access cache entries. 3965 */ 3966 freed = nfs_access_purge_rp(rp); 3967 3968 if (!HAVE_RDDIR_CACHE(rp) && 3969 contents == NULL && 3970 vsp == NULL && 3971 info == NULL) 3972 return (freed); 3973 3974 /* 3975 * Free the readdir cache entries 3976 */ 3977 if (HAVE_RDDIR_CACHE(rp)) 3978 nfs_purge_rddir_cache(RTOV(rp)); 3979 3980 /* 3981 * Free the symbolic link cache. 3982 */ 3983 if (contents != NULL) { 3984 3985 kmem_free((void *)contents, size); 3986 } 3987 3988 /* 3989 * Free any cached ACL. 3990 */ 3991 if (vsp != NULL) 3992 nfs_acl_free(vsp); 3993 3994 /* 3995 * Free any cached pathconf information. 3996 */ 3997 if (info != NULL) 3998 kmem_free(info, sizeof (*info)); 3999 4000 return (1); 4001 } 4002 4003 static int 4004 nfs_free_reclaim(void) 4005 { 4006 int freed; 4007 rnode_t *rp; 4008 4009 #ifdef DEBUG 4010 clstat_debug.f_reclaim.value.ui64++; 4011 #endif 4012 freed = 0; 4013 mutex_enter(&rpfreelist_lock); 4014 rp = rpfreelist; 4015 if (rp != NULL) { 4016 do { 4017 if (nfs_free_data_reclaim(rp)) 4018 freed = 1; 4019 } while ((rp = rp->r_freef) != rpfreelist); 4020 } 4021 mutex_exit(&rpfreelist_lock); 4022 return (freed); 4023 } 4024 4025 static int 4026 nfs_active_reclaim(void) 4027 { 4028 int freed; 4029 int index; 4030 rnode_t *rp; 4031 4032 #ifdef DEBUG 4033 clstat_debug.a_reclaim.value.ui64++; 4034 #endif 4035 freed = 0; 4036 for (index = 0; index < rtablesize; index++) { 4037 rw_enter(&rtable[index].r_lock, RW_READER); 4038 for (rp = rtable[index].r_hashf; 4039 rp != (rnode_t *)(&rtable[index]); 4040 rp = rp->r_hashf) { 4041 if (nfs_active_data_reclaim(rp)) 4042 freed = 1; 4043 } 4044 rw_exit(&rtable[index].r_lock); 4045 } 4046 return (freed); 4047 } 4048 4049 static int 4050 nfs_rnode_reclaim(void) 4051 { 4052 int freed; 4053 rnode_t *rp; 4054 vnode_t *vp; 4055 4056 #ifdef DEBUG 4057 clstat_debug.r_reclaim.value.ui64++; 4058 #endif 4059 freed = 0; 4060 mutex_enter(&rpfreelist_lock); 4061 while ((rp = rpfreelist) != NULL) { 4062 rp_rmfree(rp); 4063 mutex_exit(&rpfreelist_lock); 4064 if (rp->r_flags & RHASHED) { 4065 vp = RTOV(rp); 4066 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4067 mutex_enter(&vp->v_lock); 4068 if (vp->v_count > 1) { 4069 vp->v_count--; 4070 mutex_exit(&vp->v_lock); 4071 rw_exit(&rp->r_hashq->r_lock); 4072 mutex_enter(&rpfreelist_lock); 4073 continue; 4074 } 4075 mutex_exit(&vp->v_lock); 4076 rp_rmhash_locked(rp); 4077 rw_exit(&rp->r_hashq->r_lock); 4078 } 4079 /* 4080 * This call to rp_addfree will end up destroying the 4081 * rnode, but in a safe way with the appropriate set 4082 * of checks done. 4083 */ 4084 rp_addfree(rp, CRED()); 4085 mutex_enter(&rpfreelist_lock); 4086 } 4087 mutex_exit(&rpfreelist_lock); 4088 return (freed); 4089 } 4090 4091 /*ARGSUSED*/ 4092 static void 4093 nfs_reclaim(void *cdrarg) 4094 { 4095 4096 #ifdef DEBUG 4097 clstat_debug.reclaim.value.ui64++; 4098 #endif 4099 if (nfs_free_reclaim()) 4100 return; 4101 4102 if (nfs_active_reclaim()) 4103 return; 4104 4105 (void) nfs_rnode_reclaim(); 4106 } 4107 4108 /* 4109 * NFS client failover support 4110 * 4111 * Routines to copy filehandles 4112 */ 4113 void 4114 nfscopyfh(caddr_t fhp, vnode_t *vp) 4115 { 4116 fhandle_t *dest = (fhandle_t *)fhp; 4117 4118 if (dest != NULL) 4119 *dest = *VTOFH(vp); 4120 } 4121 4122 void 4123 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4124 { 4125 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4126 4127 if (dest != NULL) 4128 *dest = *VTOFH3(vp); 4129 } 4130 4131 /* 4132 * NFS client failover support 4133 * 4134 * failover_safe() will test various conditions to ensure that 4135 * failover is permitted for this vnode. It will be denied 4136 * if: 4137 * 1) the operation in progress does not support failover (NULL fi) 4138 * 2) there are no available replicas (NULL mi_servers->sv_next) 4139 * 3) any locks are outstanding on this file 4140 */ 4141 static int 4142 failover_safe(failinfo_t *fi) 4143 { 4144 4145 /* 4146 * Does this op permit failover? 4147 */ 4148 if (fi == NULL || fi->vp == NULL) 4149 return (0); 4150 4151 /* 4152 * Are there any alternates to failover to? 4153 */ 4154 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4155 return (0); 4156 4157 /* 4158 * Disable check; we've forced local locking 4159 * 4160 * if (flk_has_remote_locks(fi->vp)) 4161 * return (0); 4162 */ 4163 4164 /* 4165 * If we have no partial path, we can't do anything 4166 */ 4167 if (VTOR(fi->vp)->r_path == NULL) 4168 return (0); 4169 4170 return (1); 4171 } 4172 4173 #include <sys/thread.h> 4174 4175 /* 4176 * NFS client failover support 4177 * 4178 * failover_newserver() will start a search for a new server, 4179 * preferably by starting an async thread to do the work. If 4180 * someone is already doing this (recognizable by MI_BINDINPROG 4181 * being set), it will simply return and the calling thread 4182 * will queue on the mi_failover_cv condition variable. 4183 */ 4184 static void 4185 failover_newserver(mntinfo_t *mi) 4186 { 4187 /* 4188 * Check if someone else is doing this already 4189 */ 4190 mutex_enter(&mi->mi_lock); 4191 if (mi->mi_flags & MI_BINDINPROG) { 4192 mutex_exit(&mi->mi_lock); 4193 return; 4194 } 4195 mi->mi_flags |= MI_BINDINPROG; 4196 4197 /* 4198 * Need to hold the vfs struct so that it can't be released 4199 * while the failover thread is selecting a new server. 4200 */ 4201 VFS_HOLD(mi->mi_vfsp); 4202 4203 /* 4204 * Start a thread to do the real searching. 4205 */ 4206 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4207 4208 mutex_exit(&mi->mi_lock); 4209 } 4210 4211 /* 4212 * NFS client failover support 4213 * 4214 * failover_thread() will find a new server to replace the one 4215 * currently in use, wake up other threads waiting on this mount 4216 * point, and die. It will start at the head of the server list 4217 * and poll servers until it finds one with an NFS server which is 4218 * registered and responds to a NULL procedure ping. 4219 * 4220 * XXX failover_thread is unsafe within the scope of the 4221 * present model defined for cpr to suspend the system. 4222 * Specifically, over-the-wire calls made by the thread 4223 * are unsafe. The thread needs to be reevaluated in case of 4224 * future updates to the cpr suspend model. 4225 */ 4226 static void 4227 failover_thread(mntinfo_t *mi) 4228 { 4229 servinfo_t *svp = NULL; 4230 CLIENT *cl; 4231 enum clnt_stat status; 4232 struct timeval tv; 4233 int error; 4234 int oncethru = 0; 4235 callb_cpr_t cprinfo; 4236 rnode_t *rp; 4237 int index; 4238 char *srvnames; 4239 size_t srvnames_len; 4240 struct nfs_clnt *nfscl = NULL; 4241 zoneid_t zoneid = getzoneid(); 4242 4243 #ifdef DEBUG 4244 /* 4245 * This is currently only needed to access counters which exist on 4246 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4247 * on non-DEBUG kernels. 4248 */ 4249 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4250 ASSERT(nfscl != NULL); 4251 #endif 4252 4253 /* 4254 * Its safe to piggyback on the mi_lock since failover_newserver() 4255 * code guarantees that there will be only one failover thread 4256 * per mountinfo at any instance. 4257 */ 4258 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4259 "failover_thread"); 4260 4261 mutex_enter(&mi->mi_lock); 4262 while (mi->mi_readers) { 4263 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4264 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4265 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4266 } 4267 mutex_exit(&mi->mi_lock); 4268 4269 tv.tv_sec = 2; 4270 tv.tv_usec = 0; 4271 4272 /* 4273 * Ping the null NFS procedure of every server in 4274 * the list until one responds. We always start 4275 * at the head of the list and always skip the one 4276 * that is current, since it's caused us a problem. 4277 */ 4278 while (svp == NULL) { 4279 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4280 if (!oncethru && svp == mi->mi_curr_serv) 4281 continue; 4282 4283 /* 4284 * If the file system was forcibly umounted 4285 * while trying to do a failover, then just 4286 * give up on the failover. It won't matter 4287 * what the server is. 4288 */ 4289 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4290 svp = NULL; 4291 goto done; 4292 } 4293 4294 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4295 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4296 if (error) 4297 continue; 4298 4299 if (!(mi->mi_flags & MI_INT)) 4300 cl->cl_nosignal = TRUE; 4301 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4302 xdr_void, NULL, tv); 4303 if (!(mi->mi_flags & MI_INT)) 4304 cl->cl_nosignal = FALSE; 4305 AUTH_DESTROY(cl->cl_auth); 4306 CLNT_DESTROY(cl); 4307 if (status == RPC_SUCCESS) { 4308 if (svp == mi->mi_curr_serv) { 4309 #ifdef DEBUG 4310 zcmn_err(zoneid, CE_NOTE, 4311 "NFS%d: failing over: selecting original server %s", 4312 mi->mi_vers, svp->sv_hostname); 4313 #else 4314 zcmn_err(zoneid, CE_NOTE, 4315 "NFS: failing over: selecting original server %s", 4316 svp->sv_hostname); 4317 #endif 4318 } else { 4319 #ifdef DEBUG 4320 zcmn_err(zoneid, CE_NOTE, 4321 "NFS%d: failing over from %s to %s", 4322 mi->mi_vers, 4323 mi->mi_curr_serv->sv_hostname, 4324 svp->sv_hostname); 4325 #else 4326 zcmn_err(zoneid, CE_NOTE, 4327 "NFS: failing over from %s to %s", 4328 mi->mi_curr_serv->sv_hostname, 4329 svp->sv_hostname); 4330 #endif 4331 } 4332 break; 4333 } 4334 } 4335 4336 if (svp == NULL) { 4337 if (!oncethru) { 4338 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4339 #ifdef DEBUG 4340 zprintf(zoneid, 4341 "NFS%d servers %s not responding " 4342 "still trying\n", mi->mi_vers, srvnames); 4343 #else 4344 zprintf(zoneid, "NFS servers %s not responding " 4345 "still trying\n", srvnames); 4346 #endif 4347 oncethru = 1; 4348 } 4349 mutex_enter(&mi->mi_lock); 4350 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4351 mutex_exit(&mi->mi_lock); 4352 delay(hz); 4353 mutex_enter(&mi->mi_lock); 4354 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4355 mutex_exit(&mi->mi_lock); 4356 } 4357 } 4358 4359 if (oncethru) { 4360 #ifdef DEBUG 4361 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4362 #else 4363 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4364 #endif 4365 } 4366 4367 if (svp != mi->mi_curr_serv) { 4368 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4369 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4370 rw_enter(&rtable[index].r_lock, RW_WRITER); 4371 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4372 mi->mi_vfsp); 4373 if (rp != NULL) { 4374 if (rp->r_flags & RHASHED) 4375 rp_rmhash_locked(rp); 4376 rw_exit(&rtable[index].r_lock); 4377 rp->r_server = svp; 4378 rp->r_fh = svp->sv_fhandle; 4379 (void) nfs_free_data_reclaim(rp); 4380 index = rtablehash(&rp->r_fh); 4381 rp->r_hashq = &rtable[index]; 4382 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4383 vn_exists(RTOV(rp)); 4384 rp_addhash(rp); 4385 rw_exit(&rp->r_hashq->r_lock); 4386 VN_RELE(RTOV(rp)); 4387 } else 4388 rw_exit(&rtable[index].r_lock); 4389 } 4390 4391 done: 4392 if (oncethru) 4393 kmem_free(srvnames, srvnames_len); 4394 mutex_enter(&mi->mi_lock); 4395 mi->mi_flags &= ~MI_BINDINPROG; 4396 if (svp != NULL) { 4397 mi->mi_curr_serv = svp; 4398 mi->mi_failover++; 4399 #ifdef DEBUG 4400 nfscl->nfscl_stat.failover.value.ui64++; 4401 #endif 4402 } 4403 cv_broadcast(&mi->mi_failover_cv); 4404 CALLB_CPR_EXIT(&cprinfo); 4405 VFS_RELE(mi->mi_vfsp); 4406 zthread_exit(); 4407 /* NOTREACHED */ 4408 } 4409 4410 /* 4411 * NFS client failover support 4412 * 4413 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4414 * is cleared, meaning that failover is complete. Called with 4415 * mi_lock mutex held. 4416 */ 4417 static int 4418 failover_wait(mntinfo_t *mi) 4419 { 4420 k_sigset_t smask; 4421 4422 /* 4423 * If someone else is hunting for a living server, 4424 * sleep until it's done. After our sleep, we may 4425 * be bound to the right server and get off cheaply. 4426 */ 4427 while (mi->mi_flags & MI_BINDINPROG) { 4428 /* 4429 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4430 * and SIGTERM. (Preserving the existing masks). 4431 * Mask out SIGINT if mount option nointr is specified. 4432 */ 4433 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4434 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4435 /* 4436 * restore original signal mask 4437 */ 4438 sigunintr(&smask); 4439 return (EINTR); 4440 } 4441 /* 4442 * restore original signal mask 4443 */ 4444 sigunintr(&smask); 4445 } 4446 return (0); 4447 } 4448 4449 /* 4450 * NFS client failover support 4451 * 4452 * failover_remap() will do a partial pathname lookup and find the 4453 * desired vnode on the current server. The interim vnode will be 4454 * discarded after we pilfer the new filehandle. 4455 * 4456 * Side effects: 4457 * - This routine will also update the filehandle in the args structure 4458 * pointed to by the fi->fhp pointer if it is non-NULL. 4459 */ 4460 4461 static int 4462 failover_remap(failinfo_t *fi) 4463 { 4464 vnode_t *vp, *nvp, *rootvp; 4465 rnode_t *rp, *nrp; 4466 mntinfo_t *mi; 4467 int error; 4468 #ifdef DEBUG 4469 struct nfs_clnt *nfscl; 4470 4471 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4472 ASSERT(nfscl != NULL); 4473 #endif 4474 /* 4475 * Sanity check 4476 */ 4477 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4478 return (EINVAL); 4479 vp = fi->vp; 4480 rp = VTOR(vp); 4481 mi = VTOMI(vp); 4482 4483 if (!(vp->v_flag & VROOT)) { 4484 /* 4485 * Given the root fh, use the path stored in 4486 * the rnode to find the fh for the new server. 4487 */ 4488 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4489 if (error) 4490 return (error); 4491 4492 error = failover_lookup(rp->r_path, rootvp, 4493 fi->lookupproc, fi->xattrdirproc, &nvp); 4494 4495 VN_RELE(rootvp); 4496 4497 if (error) 4498 return (error); 4499 4500 /* 4501 * If we found the same rnode, we're done now 4502 */ 4503 if (nvp == vp) { 4504 /* 4505 * Failed and the new server may physically be same 4506 * OR may share a same disk subsystem. In this case 4507 * file handle for a particular file path is not going 4508 * to change, given the same filehandle lookup will 4509 * always locate the same rnode as the existing one. 4510 * All we might need to do is to update the r_server 4511 * with the current servinfo. 4512 */ 4513 if (!VALID_FH(fi)) { 4514 rp->r_server = mi->mi_curr_serv; 4515 } 4516 VN_RELE(nvp); 4517 return (0); 4518 } 4519 4520 /* 4521 * Try to make it so that no one else will find this 4522 * vnode because it is just a temporary to hold the 4523 * new file handle until that file handle can be 4524 * copied to the original vnode/rnode. 4525 */ 4526 nrp = VTOR(nvp); 4527 mutex_enter(&mi->mi_remap_lock); 4528 /* 4529 * Some other thread could have raced in here and could 4530 * have done the remap for this particular rnode before 4531 * this thread here. Check for rp->r_server and 4532 * mi->mi_curr_serv and return if they are same. 4533 */ 4534 if (VALID_FH(fi)) { 4535 mutex_exit(&mi->mi_remap_lock); 4536 VN_RELE(nvp); 4537 return (0); 4538 } 4539 4540 if (nrp->r_flags & RHASHED) 4541 rp_rmhash(nrp); 4542 4543 /* 4544 * As a heuristic check on the validity of the new 4545 * file, check that the size and type match against 4546 * that we remember from the old version. 4547 */ 4548 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4549 mutex_exit(&mi->mi_remap_lock); 4550 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 4551 "NFS replicas %s and %s: file %s not same.", 4552 rp->r_server->sv_hostname, 4553 nrp->r_server->sv_hostname, rp->r_path); 4554 VN_RELE(nvp); 4555 return (EINVAL); 4556 } 4557 4558 /* 4559 * snarf the filehandle from the new rnode 4560 * then release it, again while updating the 4561 * hash queues for the rnode. 4562 */ 4563 if (rp->r_flags & RHASHED) 4564 rp_rmhash(rp); 4565 rp->r_server = mi->mi_curr_serv; 4566 rp->r_fh = nrp->r_fh; 4567 rp->r_hashq = nrp->r_hashq; 4568 /* 4569 * Copy the attributes from the new rnode to the old 4570 * rnode. This will help to reduce unnecessary page 4571 * cache flushes. 4572 */ 4573 rp->r_attr = nrp->r_attr; 4574 rp->r_attrtime = nrp->r_attrtime; 4575 rp->r_mtime = nrp->r_mtime; 4576 (void) nfs_free_data_reclaim(rp); 4577 nfs_setswaplike(vp, &rp->r_attr); 4578 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4579 rp_addhash(rp); 4580 rw_exit(&rp->r_hashq->r_lock); 4581 mutex_exit(&mi->mi_remap_lock); 4582 VN_RELE(nvp); 4583 } 4584 4585 /* 4586 * Update successful failover remap count 4587 */ 4588 mutex_enter(&mi->mi_lock); 4589 mi->mi_remap++; 4590 mutex_exit(&mi->mi_lock); 4591 #ifdef DEBUG 4592 nfscl->nfscl_stat.remap.value.ui64++; 4593 #endif 4594 4595 /* 4596 * If we have a copied filehandle to update, do it now. 4597 */ 4598 if (fi->fhp != NULL && fi->copyproc != NULL) 4599 (*fi->copyproc)(fi->fhp, vp); 4600 4601 return (0); 4602 } 4603 4604 /* 4605 * NFS client failover support 4606 * 4607 * We want a simple pathname lookup routine to parse the pieces 4608 * of path in rp->r_path. We know that the path was a created 4609 * as rnodes were made, so we know we have only to deal with 4610 * paths that look like: 4611 * dir1/dir2/dir3/file 4612 * Any evidence of anything like .., symlinks, and ENOTDIR 4613 * are hard errors, because they mean something in this filesystem 4614 * is different from the one we came from, or has changed under 4615 * us in some way. If this is true, we want the failure. 4616 * 4617 * Extended attributes: if the filesystem is mounted with extended 4618 * attributes enabled (-o xattr), the attribute directory will be 4619 * represented in the r_path as the magic name XATTR_RPATH. So if 4620 * we see that name in the pathname, is must be because this node 4621 * is an extended attribute. Therefore, look it up that way. 4622 */ 4623 static int 4624 failover_lookup(char *path, vnode_t *root, 4625 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4626 vnode_t *, cred_t *, int), 4627 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4628 vnode_t **new) 4629 { 4630 vnode_t *dvp, *nvp; 4631 int error = EINVAL; 4632 char *s, *p, *tmppath; 4633 size_t len; 4634 mntinfo_t *mi; 4635 bool_t xattr; 4636 4637 /* Make local copy of path */ 4638 len = strlen(path) + 1; 4639 tmppath = kmem_alloc(len, KM_SLEEP); 4640 (void) strcpy(tmppath, path); 4641 s = tmppath; 4642 4643 dvp = root; 4644 VN_HOLD(dvp); 4645 mi = VTOMI(root); 4646 xattr = mi->mi_flags & MI_EXTATTR; 4647 4648 do { 4649 p = strchr(s, '/'); 4650 if (p != NULL) 4651 *p = '\0'; 4652 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4653 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4654 RFSCALL_SOFT); 4655 } else { 4656 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4657 CRED(), RFSCALL_SOFT); 4658 } 4659 if (p != NULL) 4660 *p++ = '/'; 4661 if (error) { 4662 VN_RELE(dvp); 4663 kmem_free(tmppath, len); 4664 return (error); 4665 } 4666 s = p; 4667 VN_RELE(dvp); 4668 dvp = nvp; 4669 } while (p != NULL); 4670 4671 if (nvp != NULL && new != NULL) 4672 *new = nvp; 4673 kmem_free(tmppath, len); 4674 return (0); 4675 } 4676 4677 /* 4678 * NFS client failover support 4679 * 4680 * sv_free() frees the malloc'd portion of a "servinfo_t". 4681 */ 4682 void 4683 sv_free(servinfo_t *svp) 4684 { 4685 servinfo_t *next; 4686 struct knetconfig *knconf; 4687 4688 while (svp != NULL) { 4689 next = svp->sv_next; 4690 if (svp->sv_secdata) 4691 sec_clnt_freeinfo(svp->sv_secdata); 4692 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4693 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4694 knconf = svp->sv_knconf; 4695 if (knconf != NULL) { 4696 if (knconf->knc_protofmly != NULL) 4697 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4698 if (knconf->knc_proto != NULL) 4699 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4700 kmem_free(knconf, sizeof (*knconf)); 4701 } 4702 knconf = svp->sv_origknconf; 4703 if (knconf != NULL) { 4704 if (knconf->knc_protofmly != NULL) 4705 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4706 if (knconf->knc_proto != NULL) 4707 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4708 kmem_free(knconf, sizeof (*knconf)); 4709 } 4710 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4711 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4712 mutex_destroy(&svp->sv_lock); 4713 kmem_free(svp, sizeof (*svp)); 4714 svp = next; 4715 } 4716 } 4717 4718 /* 4719 * Only can return non-zero if intr != 0. 4720 */ 4721 int 4722 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4723 { 4724 4725 mutex_enter(&l->lock); 4726 4727 /* 4728 * If this is a nested enter, then allow it. There 4729 * must be as many exits as enters through. 4730 */ 4731 if (l->owner == curthread) { 4732 /* lock is held for writing by current thread */ 4733 ASSERT(rw == RW_READER || rw == RW_WRITER); 4734 l->count--; 4735 } else if (rw == RW_READER) { 4736 /* 4737 * While there is a writer active or writers waiting, 4738 * then wait for them to finish up and move on. Then, 4739 * increment the count to indicate that a reader is 4740 * active. 4741 */ 4742 while (l->count < 0 || l->waiters > 0) { 4743 if (intr) { 4744 klwp_t *lwp = ttolwp(curthread); 4745 4746 if (lwp != NULL) 4747 lwp->lwp_nostop++; 4748 if (!cv_wait_sig(&l->cv, &l->lock)) { 4749 if (lwp != NULL) 4750 lwp->lwp_nostop--; 4751 mutex_exit(&l->lock); 4752 return (EINTR); 4753 } 4754 if (lwp != NULL) 4755 lwp->lwp_nostop--; 4756 } else 4757 cv_wait(&l->cv, &l->lock); 4758 } 4759 ASSERT(l->count < INT_MAX); 4760 #ifdef DEBUG 4761 if ((l->count % 10000) == 9999) 4762 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on" 4763 "rwlock @ %p\n", l->count, (void *)&l); 4764 #endif 4765 l->count++; 4766 } else { 4767 ASSERT(rw == RW_WRITER); 4768 /* 4769 * While there are readers active or a writer 4770 * active, then wait for all of the readers 4771 * to finish or for the writer to finish. 4772 * Then, set the owner field to curthread and 4773 * decrement count to indicate that a writer 4774 * is active. 4775 */ 4776 while (l->count > 0 || l->owner != NULL) { 4777 l->waiters++; 4778 if (intr) { 4779 klwp_t *lwp = ttolwp(curthread); 4780 4781 if (lwp != NULL) 4782 lwp->lwp_nostop++; 4783 if (!cv_wait_sig(&l->cv, &l->lock)) { 4784 if (lwp != NULL) 4785 lwp->lwp_nostop--; 4786 l->waiters--; 4787 cv_broadcast(&l->cv); 4788 mutex_exit(&l->lock); 4789 return (EINTR); 4790 } 4791 if (lwp != NULL) 4792 lwp->lwp_nostop--; 4793 } else 4794 cv_wait(&l->cv, &l->lock); 4795 l->waiters--; 4796 } 4797 l->owner = curthread; 4798 l->count--; 4799 } 4800 4801 mutex_exit(&l->lock); 4802 4803 return (0); 4804 } 4805 4806 /* 4807 * If the lock is available, obtain it and return non-zero. If there is 4808 * already a conflicting lock, return 0 immediately. 4809 */ 4810 4811 int 4812 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4813 { 4814 mutex_enter(&l->lock); 4815 4816 /* 4817 * If this is a nested enter, then allow it. There 4818 * must be as many exits as enters through. 4819 */ 4820 if (l->owner == curthread) { 4821 /* lock is held for writing by current thread */ 4822 ASSERT(rw == RW_READER || rw == RW_WRITER); 4823 l->count--; 4824 } else if (rw == RW_READER) { 4825 /* 4826 * If there is a writer active or writers waiting, deny the 4827 * lock. Otherwise, bump the count of readers. 4828 */ 4829 if (l->count < 0 || l->waiters > 0) { 4830 mutex_exit(&l->lock); 4831 return (0); 4832 } 4833 l->count++; 4834 } else { 4835 ASSERT(rw == RW_WRITER); 4836 /* 4837 * If there are readers active or a writer active, deny the 4838 * lock. Otherwise, set the owner field to curthread and 4839 * decrement count to indicate that a writer is active. 4840 */ 4841 if (l->count > 0 || l->owner != NULL) { 4842 mutex_exit(&l->lock); 4843 return (0); 4844 } 4845 l->owner = curthread; 4846 l->count--; 4847 } 4848 4849 mutex_exit(&l->lock); 4850 4851 return (1); 4852 } 4853 4854 void 4855 nfs_rw_exit(nfs_rwlock_t *l) 4856 { 4857 4858 mutex_enter(&l->lock); 4859 /* 4860 * If this is releasing a writer lock, then increment count to 4861 * indicate that there is one less writer active. If this was 4862 * the last of possibly nested writer locks, then clear the owner 4863 * field as well to indicate that there is no writer active 4864 * and wakeup any possible waiting writers or readers. 4865 * 4866 * If releasing a reader lock, then just decrement count to 4867 * indicate that there is one less reader active. If this was 4868 * the last active reader and there are writer(s) waiting, 4869 * then wake up the first. 4870 */ 4871 if (l->owner != NULL) { 4872 ASSERT(l->owner == curthread); 4873 l->count++; 4874 if (l->count == 0) { 4875 l->owner = NULL; 4876 cv_broadcast(&l->cv); 4877 } 4878 } else { 4879 ASSERT(l->count > 0); 4880 l->count--; 4881 if (l->count == 0 && l->waiters > 0) 4882 cv_broadcast(&l->cv); 4883 } 4884 mutex_exit(&l->lock); 4885 } 4886 4887 int 4888 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4889 { 4890 4891 if (rw == RW_READER) 4892 return (l->count > 0); 4893 ASSERT(rw == RW_WRITER); 4894 return (l->count < 0); 4895 } 4896 4897 /* ARGSUSED */ 4898 void 4899 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4900 { 4901 4902 l->count = 0; 4903 l->waiters = 0; 4904 l->owner = NULL; 4905 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4906 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4907 } 4908 4909 void 4910 nfs_rw_destroy(nfs_rwlock_t *l) 4911 { 4912 4913 mutex_destroy(&l->lock); 4914 cv_destroy(&l->cv); 4915 } 4916 4917 int 4918 nfs3_rddir_compar(const void *x, const void *y) 4919 { 4920 rddir_cache *a = (rddir_cache *)x; 4921 rddir_cache *b = (rddir_cache *)y; 4922 4923 if (a->nfs3_cookie == b->nfs3_cookie) { 4924 if (a->buflen == b->buflen) 4925 return (0); 4926 if (a->buflen < b->buflen) 4927 return (-1); 4928 return (1); 4929 } 4930 4931 if (a->nfs3_cookie < b->nfs3_cookie) 4932 return (-1); 4933 4934 return (1); 4935 } 4936 4937 int 4938 nfs_rddir_compar(const void *x, const void *y) 4939 { 4940 rddir_cache *a = (rddir_cache *)x; 4941 rddir_cache *b = (rddir_cache *)y; 4942 4943 if (a->nfs_cookie == b->nfs_cookie) { 4944 if (a->buflen == b->buflen) 4945 return (0); 4946 if (a->buflen < b->buflen) 4947 return (-1); 4948 return (1); 4949 } 4950 4951 if (a->nfs_cookie < b->nfs_cookie) 4952 return (-1); 4953 4954 return (1); 4955 } 4956 4957 static char * 4958 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 4959 { 4960 servinfo_t *s; 4961 char *srvnames; 4962 char *namep; 4963 size_t length; 4964 4965 /* 4966 * Calculate the length of the string required to hold all 4967 * of the server names plus either a comma or a null 4968 * character following each individual one. 4969 */ 4970 length = 0; 4971 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 4972 length += s->sv_hostnamelen; 4973 4974 srvnames = kmem_alloc(length, KM_SLEEP); 4975 4976 namep = srvnames; 4977 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 4978 (void) strcpy(namep, s->sv_hostname); 4979 namep += s->sv_hostnamelen - 1; 4980 *namep++ = ','; 4981 } 4982 *--namep = '\0'; 4983 4984 *len = length; 4985 4986 return (srvnames); 4987 } 4988 4989 /* 4990 * These two functions are temporary and designed for the upgrade-workaround 4991 * only. They cannot be used for general zone-crossing NFS client support, and 4992 * will be removed shortly. 4993 * 4994 * When the workaround is enabled, all NFS traffic is forced into the global 4995 * zone. These functions are called when the code needs to refer to the state 4996 * of the underlying network connection. They're not called when the function 4997 * needs to refer to the state of the process that invoked the system call. 4998 * (E.g., when checking whether the zone is shutting down during the mount() 4999 * call.) 5000 */ 5001 5002 struct zone * 5003 nfs_zone(void) 5004 { 5005 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5006 } 5007 5008 zoneid_t 5009 nfs_zoneid(void) 5010 { 5011 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid()); 5012 } 5013 5014 /* 5015 * nfs_mount_label_policy: 5016 * Determine whether the mount is allowed according to MAC check, 5017 * by comparing (where appropriate) label of the remote server 5018 * against the label of the zone being mounted into. 5019 * 5020 * Returns: 5021 * 0 : access allowed 5022 * -1 : read-only access allowed (i.e., read-down) 5023 * >0 : error code, such as EACCES 5024 */ 5025 int 5026 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5027 struct knetconfig *knconf, cred_t *cr) 5028 { 5029 int addr_type; 5030 void *ipaddr; 5031 bslabel_t *server_sl, *mntlabel; 5032 zone_t *mntzone = NULL; 5033 ts_label_t *zlabel; 5034 tsol_tpc_t *tp; 5035 ts_label_t *tsl = NULL; 5036 int retv; 5037 5038 /* 5039 * Get the zone's label. Each zone on a labeled system has a label. 5040 */ 5041 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5042 zlabel = mntzone->zone_slabel; 5043 ASSERT(zlabel != NULL); 5044 label_hold(zlabel); 5045 5046 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5047 addr_type = IPV4_VERSION; 5048 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5049 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5050 addr_type = IPV6_VERSION; 5051 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5052 } else { 5053 retv = 0; 5054 goto out; 5055 } 5056 5057 retv = EACCES; /* assume the worst */ 5058 5059 /* 5060 * Next, get the assigned label of the remote server. 5061 */ 5062 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5063 if (tp == NULL) 5064 goto out; /* error getting host entry */ 5065 5066 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5067 goto rel_tpc; /* invalid domain */ 5068 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5069 (tp->tpc_tp.host_type != UNLABELED)) 5070 goto rel_tpc; /* invalid hosttype */ 5071 5072 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5073 tsl = getflabel_cipso(vfsp); 5074 if (tsl == NULL) 5075 goto rel_tpc; /* error getting server lbl */ 5076 5077 server_sl = label2bslabel(tsl); 5078 } else { /* UNLABELED */ 5079 server_sl = &tp->tpc_tp.tp_def_label; 5080 } 5081 5082 mntlabel = label2bslabel(zlabel); 5083 5084 /* 5085 * Now compare labels to complete the MAC check. If the labels 5086 * are equal or if the requestor is in the global zone and has 5087 * NET_MAC_AWARE, then allow read-write access. (Except for 5088 * mounts into the global zone itself; restrict these to 5089 * read-only.) 5090 * 5091 * If the requestor is in some other zone, but his label 5092 * dominates the server, then allow read-down. 5093 * 5094 * Otherwise, access is denied. 5095 */ 5096 if (blequal(mntlabel, server_sl) || 5097 (crgetzoneid(cr) == GLOBAL_ZONEID && 5098 getpflags(NET_MAC_AWARE, cr) != 0)) { 5099 if ((mntzone == global_zone) || 5100 !blequal(mntlabel, server_sl)) 5101 retv = -1; /* read-only */ 5102 else 5103 retv = 0; /* access OK */ 5104 } else if (bldominates(mntlabel, server_sl)) { 5105 retv = -1; /* read-only */ 5106 } else { 5107 retv = EACCES; 5108 } 5109 5110 if (tsl != NULL) 5111 label_rele(tsl); 5112 5113 rel_tpc: 5114 TPC_RELE(tp); 5115 out: 5116 if (mntzone) 5117 zone_rele(mntzone); 5118 label_rele(zlabel); 5119 return (retv); 5120 } 5121 5122 boolean_t 5123 nfs_has_ctty(void) 5124 { 5125 boolean_t rv; 5126 mutex_enter(&curproc->p_splock); 5127 rv = (curproc->p_sessp->s_vp != NULL); 5128 mutex_exit(&curproc->p_splock); 5129 return (rv); 5130 } 5131