/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred_impl.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.
 * This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
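/*
 * Illustrative sketch (added commentary, not part of the original code):
 * under the lock ordering above, a lookup that revives an rnode from the
 * freelist is expected to look roughly like the following pseudo-code.
 * The helpers (rfind(), rp_rmfree()) are the ones declared later in this
 * file; the exact division of labor between them is an assumption here.
 *
 *	rw_enter(&rtable[index].r_lock, RW_READER);	// hash bucket lock
 *	rp = rfind(&rtable[index], fh, vfsp);		// may take the
 *							// freelist lock and
 *							// call rp_rmfree()
 *	rw_exit(&rtable[index].r_lock);
 *
 * The important point is that rpfreelist_lock is only ever acquired while
 * a hash bucket lock is already held, never the other way around.
 */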
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and don't correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;		/* number of allocated rnodes */
	kstat_named_t	access;		/* size of access cache */
	kstat_named_t	dirent;		/* size of readdir cache */
	kstat_named_t	dirents;	/* size of readdir buf cache */
	kstat_named_t	reclaim;	/* number of reclaims */
	kstat_named_t	clreclaim;	/* number of cl reclaims */
	kstat_named_t	f_reclaim;	/* number of free reclaims */
	kstat_named_t	a_reclaim;	/* number of active reclaims */
	kstat_named_t	r_reclaim;	/* number of rnode reclaims */
	kstat_named_t	rpath;		/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system.
 */
int nfs_disable_rddir_cache = 0;
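/*
 * For reference (added commentary, not from the original source): on
 * Solaris-derived systems a tunable like the one above is normally set
 * with a line of the following form in /etc/system and takes effect at
 * the next boot:
 *
 *	set nfs:nfs_disable_rddir_cache = 1
 *
 * The "nfs" module prefix is an assumption based on where this variable
 * lives; consult the Solaris Tunable Parameters Reference Manual for the
 * authoritative syntax.
 */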
int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *, int),
		    int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
		    vnode_t **);
static void	nfs_free_r_path(rnode_t *);
static void	nfs_set_vroot(vnode_t *);
static char	*nfs_getsrvnames(mntinfo_t *, size_t *);

/*
 * from rpcsec module (common/rpcsec)
 */
extern int	sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void	sec_clnt_freeh(AUTH *);
extern void	sec_clnt_freeinfo(struct sec_data *);

/*
 * used in mount policy
 */
extern ts_label_t *getflabel_cipso(vfs_t *);

/*
 * EIO or EINTR are not recoverable errors.
 */
#define	IS_RECOVERABLE_ERROR(error)	!((error == EINTR) || (error == EIO))
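/*
 * Overview (added commentary, not from the original source): the client
 * handle cache below is a two-level structure.  Each struct chhead, keyed
 * by the (program, version, transport dev_t, protocol family) quadruple,
 * hangs off nfscl->nfscl_chtable, and each chhead carries a singly linked
 * list (ch_list) of free struct chtab entries, every one of which owns a
 * kernel RPC CLIENT handle.  clget_impl() below pops a free entry or
 * allocates a new one; clfree_impl() pushes it back onto the front of the
 * list, so the list stays ordered from most to least recently used.
 */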
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 */
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	plistp = &nfscl->nfscl_chtable;
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			mutex_exit(&nfscl->nfscl_chtable_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable;
		nfscl->nfscl_chtable = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable_lock);
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
		error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
		    &cp->ch_client->cl_auth);
		if (error || cp->ch_client->cl_auth == NULL) {
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
	cp->ch_head = ch;

	sigintr(&smask, (int)ci->cl_flags & MI_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	auth_destroy(cp->ch_client->cl_auth);
	error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
	    &cp->ch_client->cl_auth);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}

static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down, just try once;
	 * that is, do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = NFS_ACL_PROGRAM;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle.  For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server.  If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs_clnt *nfscl)
{
	clinfo_t ci;
	int error;

	/*
	 * Set read buffer size to rsize
	 * and add room for RPC headers.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * If soft mount and server is down, just try once;
	 * that is, do not retransmit.
	 */
	if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
		ci.cl_retrans = 0;
	else
		ci.cl_retrans = mi->mi_retrans;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * clget calls sec_clnt_geth() to get an auth handle.  For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server.  If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	do {
		error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);

		if (error == 0)
			break;

		/*
		 * For forced unmount or zone shutdown, bail out, no retry.
		 */
		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			error = EIO;
			break;
		}

		/* do not retry for softmount */
		if (!(mi->mi_flags & MI_HARD))
			break;

		/* let the caller deal with the failover case */
		if (FAILOVER_MOUNT(mi))
			break;

	} while (error == ETIMEDOUT || error == ECONNRESET);

	return (error);
}

static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Timestamp this cache entry so that we know when it was last
	 * used.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Add the free client handle to the front of the list.
	 * This way, the list will be sorted in youngest to oldest
	 * order.
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */
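/*
 * Added note (not from the original source): callers are expected to
 * bracket each RPC with clget()/clfree(), as rfscall() and aclcall()
 * below do.  A freed handle is kept on its chhead's free list together
 * with its ch_freed timestamp; when the kmem subsystem signals memory
 * pressure, clreclaim() walks every zone and destroys handles that have
 * been idle for more than CL_HOLDTIME (60) seconds.
 */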
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
	 */
	mutex_enter(&nfs_clnt_list_lock);
	nfscl = list_head(&nfs_clnt_list);
	for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
		clreclaim_zone(nfscl, CL_HOLDTIME);
	mutex_exit(&nfs_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eights" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

#define	MIN_NFS_TSIZE 512	/* minimum "chunk" of NFS IO */
#define	REDUCE_NFS_TIME (hz/2)	/* rtxcur we try to keep under */
#define	INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
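/*
 * Worked example (added commentary, not from the original source): with
 * the common hz = 100, MAXTIMO is 2000 ticks (20 s).  A timeout that
 * starts at 110 ticks (1.1 s) is doubled by backoff() on each successive
 * retransmission: 110 -> 220 -> 440 -> 880 -> 1760 -> 2000, and then
 * stays pinned at MAXTIMO.  Likewise REDUCE_NFS_TIME is 50 ticks (0.5 s)
 * and INCREASE_NFS_TIME is 264, i.e. roughly 33 ticks (0.33 s) of
 * smoothed round-trip time scaled by 8; nfs_feedback() below halves the
 * transfer sizes when retransmissions push rtxcur past the former and
 * grows them additively in MIN_NFS_TSIZE (512 byte) steps while srtt
 * stays under the latter.
 */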
/*
 * Function called when rfscall notices that we have been
 * re-transmitting, or when we get a response without retransmissions.
 * Return 1 if the transfer size was adjusted down - 0 if no change.
 */
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
	int kind;
	int r = 0;

	mutex_enter(&mi->mi_lock);
	if (flag == FEEDBACK_REXMIT1) {
		if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
		    mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
			goto done;
		if (mi->mi_curread > MIN_NFS_TSIZE) {
			mi->mi_curread /= 2;
			if (mi->mi_curread < MIN_NFS_TSIZE)
				mi->mi_curread = MIN_NFS_TSIZE;
			r = 1;
		}

		if (mi->mi_curwrite > MIN_NFS_TSIZE) {
			mi->mi_curwrite /= 2;
			if (mi->mi_curwrite < MIN_NFS_TSIZE)
				mi->mi_curwrite = MIN_NFS_TSIZE;
			r = 1;
		}
	} else if (flag == FEEDBACK_OK) {
		kind = mi->mi_timer_type[which];
		if (kind == 0 ||
		    mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
			goto done;
		if (kind == 1) {
			if (mi->mi_curread >= mi->mi_tsize)
				goto done;
			mi->mi_curread += MIN_NFS_TSIZE;
			if (mi->mi_curread > mi->mi_tsize/2)
				mi->mi_curread = mi->mi_tsize;
		} else if (kind == 2) {
			if (mi->mi_curwrite >= mi->mi_stsize)
				goto done;
			mi->mi_curwrite += MIN_NFS_TSIZE;
			if (mi->mi_curwrite > mi->mi_stsize/2)
				mi->mi_curwrite = mi->mi_stsize;
		}
	}
done:
	mutex_exit(&mi->mi_lock);
	return (r);
}

#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif

int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	enum clnt_stat rpc_status;

	ASSERT(statusp != NULL);

	rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, &rpc_status, flags, fi);
	if (!rpcerror) {
		/*
		 * See crnetadjust() for comments.
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			rfs2call_hits++;
#endif
			rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, NULL, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				rfs2call_misses++;
#endif
		}
	} else if (rpc_status == RPC_PROCUNAVAIL) {
		*statusp = NFSERR_OPNOTSUPP;
		rpcerror = 0;
	}

	return (rpcerror);
}

#define	NFS3_JUKEBOX_DELAY	10 * hz

static clock_t nfs3_jukebox_delay = 0;

#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif

int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;

	user_informed = 0;
	do {
		rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, NULL, flags, fi);
		if (!rpcerror) {
			cred_t *crr;
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (ttoproc(curthread) == &p0) {
					rpcerror = EAGAIN;
					break;
				}
				if (!user_informed) {
					user_informed = 1;
					uprintf(
		"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				rfs3call_hits++;
#endif
				rpcerror = rfscall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf,
				    NULL, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					rfs3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}

#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
#define	INC_READERS(mi)		{ \
	mi->mi_readers++; \
}
#define	DEC_READERS(mi)		{ \
	mi->mi_readers--; \
	if (mi->mi_readers == 0) \
		cv_broadcast(&mi->mi_failover_cv); \
}
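/*
 * Added note (not from the original source): INC_READERS/DEC_READERS
 * implement the "poor man's interruptible rw_enter" mentioned in the
 * comments in rfscall() and aclcall() below.  Every RPC in flight counts
 * as a reader of mi_curr_serv; when the last reader drops the count to
 * zero, DEC_READERS broadcasts mi_failover_cv so that a thread switching
 * servers (presumably failover_thread()/failover_wait(), which are only
 * declared in this file) can proceed knowing no caller is still using the
 * old server's address or filehandles.
 */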
static int
rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	enum clnt_stat status;
	struct rpc_err rpcerr;
	struct timeval wait;
	int timeo;		/* in units of hz */
	int my_rsize, my_wsize;
	bool_t tryagain;
	bool_t cred_cloned = FALSE;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
#ifdef DEBUG
	char *bufp;
#endif


	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_reqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	/*
	 * In case of forced unmount or zone shutdown, return EIO.
	 */

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "rfscall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);
					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					rpcerr.re_errno = remaperr;
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);

	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0) {
		/* free the cloned TSOL cred, mirroring aclcall() below */
		if (cred_cloned)
			crfree(cr);
		return (rpcerr.re_errno);
	}

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
		    (void (*)())NULL, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}
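	/*
	 * Added note (not from the original source): for connection-oriented
	 * transports the timeout comes straight from the mount's timeo
	 * option, which is kept in tenths of a second, so mi_timeo = 11
	 * yields (11 * hz) / 10, i.e. 1.1 seconds worth of ticks.  For
	 * datagram transports CLNT_SETTIMERS() derives the timeout from the
	 * smoothed RTT estimates instead, with a floor of
	 * minimum_timeo[call type] expressed in eighths of a second
	 * (e.g. (10 * hz) >> 3, or 1.25 s).
	 */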
	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
	 */
	do {
		tryagain = FALSE;

		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			status = RPC_FAILED;
			rpcerr.re_status = RPC_FAILED;
			rpcerr.re_errno = EIO;
			break;
		}

		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize))
				(void) nfs_feedback(FEEDBACK_OK, which, mi);
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would loop;
			 * check to see if the specific error was
			 * ECONNRESET, indicating that the target did
			 * not exist at all.  If so, return with
			 * RPC_PROGUNAVAIL and ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:	/* probably RPC_TIMEDOUT */
			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif

			if (!(mi->mi_flags & MI_HARD)) {
				if (!(mi->mi_flags & MI_SEMISOFT) ||
				    (mi->mi_ss_call_type[which] == 0))
					break;
			}

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS) {
				tryagain = TRUE;
				break;
			}

			if (flags & RFSCALL_SOFT)
				break;

			/*
			 * On zone shutdown, just move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				rpcerr.re_status = RPC_FAILED;
				rpcerr.re_errno = EIO;
				break;
			}

			/*
			 * NFS client failover support
			 *
			 * If the current server just failed us, we'll
			 * start the process of finding a new server.
			 * After that, we can just retry.
			 */
			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
				if (svp == mi->mi_curr_serv)
					failover_newserver(mi);
				clfree_impl(client, ch, nfscl);
				goto failoverretry;
			}

			tryagain = TRUE;
			timeo = backoff(timeo);
			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI_PRINTED)) {
				mi->mi_flags |= MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid,
			"NFS%d server %s not responding still trying\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				zprintf(zoneid,
			"NFS server %s not responding still trying\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
			if (*douprintf && curproc->p_sessp->s_vp != NULL) {
				*douprintf = 0;
				if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
					uprintf(
			"NFS%d server %s not responding still trying\n",
					    mi->mi_vers, svp->sv_hostname);
#else
					uprintf(
			"NFS server %s not responding still trying\n",
					    svp->sv_hostname);
#endif
			}

			/*
			 * If doing dynamic adjustment of transfer
			 * size and if it's a read or write call
			 * and if the transfer size changed while
			 * retransmitting or if the feedback routine
			 * changed the transfer size,
			 * then exit rfscall so that the transfer
			 * size can be adjusted at the vnops level.
			 */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize ||
			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
				/*
				 * On read or write calls, return
				 * back to the vnode ops level if
				 * the transfer size changed.
				 */
				clfree_impl(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (ENFS_TRYAGAIN);
			}
		}
	} while (tryagain);

	if (status != RPC_SUCCESS) {
		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
			if (curproc->p_sessp->s_vp != NULL) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf("NFS%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_rfsnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_rfsnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (curproc->p_sessp->s_vp != NULL) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_rfsnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
				}
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.
		 * This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_DOWN;
			if (mi->mi_flags & MI_PRINTED) {
				mi->mi_flags &= ~MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					zprintf(zoneid, "NFS%d server %s ok\n",
					    mi->mi_vers, svp->sv_hostname);
#else
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					zprintf(zoneid, "NFS server %s ok\n",
					    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
		}

		if (*douprintf == 0) {
			if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					uprintf("NFS%d server %s ok\n",
					    mi->mi_vers, svp->sv_hostname);
#else
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					uprintf("NFS server %s ok\n",
					    svp->sv_hostname);
#endif
			*douprintf = 1;
		}
	}

	clfree_impl(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

	if (rpc_status != NULL)
		*rpc_status = rpcerr.re_status;

	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
	    rpcerr.re_errno);

	return (rpcerr.re_errno);
}
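/*
 * Added note (not from the original source): the douprintf argument used
 * by rfscall() above and aclcall() below is an in/out flag owned by the
 * caller and is expected to start out nonzero.  rfscall() clears it when
 * it prints a "server not responding" message to the caller's terminal
 * and sets it back to 1 once a call succeeds (printing "server ok" when a
 * complaint had been issued), so a caller that retries sees at most one
 * pair of messages per outage.
 */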
#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif

int
acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    enum nfsstat *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;

	rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
	    cr, douprintf, flags, fi);
	if (!rpcerror) {
		/*
		 * See comments with crnetadjust().
		 */
		if (*statusp == NFSERR_ACCES &&
		    (cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
			acl2call_hits++;
#endif
			rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
			    resp, cr, douprintf, flags, fi);
			crfree(cr);
#ifdef DEBUG
			if (*statusp == NFSERR_ACCES)
				acl2call_misses++;
#endif
		}
	}

	return (rpcerror);
}

#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif

int
acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
    nfsstat3 *statusp, int flags, failinfo_t *fi)
{
	int rpcerror;
	int user_informed;

	user_informed = 0;

	do {
		rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
		    cr, douprintf, flags, fi);
		if (!rpcerror) {
			cred_t *crr;
			if (*statusp == NFS3ERR_JUKEBOX) {
				if (!user_informed) {
					user_informed = 1;
					uprintf(
		"file temporarily unavailable on the server, retrying...\n");
				}
				delay(nfs3_jukebox_delay);
			}
			/*
			 * See crnetadjust() for comments.
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				acl3call_hits++;
#endif
				rpcerror = aclcall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					acl3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}

static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	bool_t cred_cloned = FALSE;
	enum clnt_stat status;
	struct rpc_err rpcerr;
	struct timeval wait;
	int timeo;		/* in units of hz */
#if 0 /* notyet */
	int my_rsize, my_wsize;
#endif
	bool_t tryagain;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
#ifdef DEBUG
	char *bufp;
#endif

#if 0 /* notyet */
	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);
#endif

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_aclreqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

#if 0 /* notyet */
	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;
#endif

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "aclcall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);

					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0) {
		if (cred_cloned)
			crfree(cr);
		return (rpcerr.re_errno);
	}

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
		    (void (*)()) 0, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
	 */
	do {
		tryagain = FALSE;

		if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
			status = RPC_FAILED;
			rpcerr.re_status = RPC_FAILED;
			rpcerr.re_errno = EIO;
			break;
		}

		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
#if 0 /* notyet */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize))
				(void) nfs_feedback(FEEDBACK_OK, which, mi);
#endif
			break;

		/*
		 * Unfortunately, there are servers in the world which
		 * are not coded correctly.  They are not prepared to
		 * handle RPC requests to the NFS port which are not
		 * NFS requests.  Thus, they may try to process the
		 * NFS_ACL request as if it were an NFS request.  This
		 * does not work.
		 * Generally, an error will be generated
		 * on the client because it will not be able to decode
		 * the response from the server.  However, it seems
		 * possible that the server may not be able to decode
		 * the arguments.  Thus, the criteria for deciding
		 * whether the server supports NFS_ACL or not is whether
		 * the following RPC errors are returned from CLNT_CALL.
		 */
		case RPC_CANTDECODERES:
		case RPC_PROGUNAVAIL:
		case RPC_CANTDECODEARGS:
		case RPC_PROGVERSMISMATCH:
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
			mutex_exit(&mi->mi_lock);
			break;

		/*
		 * If the server supports NFS_ACL but not the new ops
		 * for extended attributes, make sure we don't retry.
		 */
		case RPC_PROCUNAVAIL:
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_EXTATTR;
			mutex_exit(&mi->mi_lock);
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would loop;
			 * check to see if the specific error was
			 * ECONNRESET, indicating that the target did
			 * not exist at all.  If so, return with
			 * RPC_PROGUNAVAIL and ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:	/* probably RPC_TIMEDOUT */
			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif

			if (!(mi->mi_flags & MI_HARD)) {
				if (!(mi->mi_flags & MI_SEMISOFT) ||
				    (mi->mi_acl_ss_call_type[which] == 0))
					break;
			}

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS) {
				tryagain = TRUE;
				break;
			}

			if (flags & RFSCALL_SOFT)
				break;

			/*
			 * On zone shutdown, just move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				rpcerr.re_status = RPC_FAILED;
				rpcerr.re_errno = EIO;
				break;
			}

			/*
			 * NFS client failover support
			 *
			 * If the current server just failed us, we'll
			 * start the process of finding a new server.
			 * After that, we can just retry.
			 */
			if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
				if (svp == mi->mi_curr_serv)
					failover_newserver(mi);
				clfree_impl(client, ch, nfscl);
				goto failoverretry;
			}

			tryagain = TRUE;
			timeo = backoff(timeo);
			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI_PRINTED)) {
				mi->mi_flags |= MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid,
			"NFS_ACL%d server %s not responding still trying\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				zprintf(zoneid,
			"NFS server %s not responding still trying\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
			if (*douprintf && curproc->p_sessp->s_vp != NULL) {
				*douprintf = 0;
				if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
					uprintf(
			"NFS_ACL%d server %s not responding still trying\n",
					    mi->mi_vers, svp->sv_hostname);
#else
					uprintf(
			"NFS server %s not responding still trying\n",
					    svp->sv_hostname);
#endif
			}

#if 0 /* notyet */
			/*
			 * If doing dynamic adjustment of transfer
			 * size and if it's a read or write call
			 * and if the transfer size changed while
			 * retransmitting or if the feedback routine
			 * changed the transfer size,
			 * then exit rfscall so that the transfer
			 * size can be adjusted at the vnops level.
			 */
			if ((mi->mi_flags & MI_DYNAMIC) &&
			    mi->mi_acl_timer_type[which] != 0 &&
			    (mi->mi_curread != my_rsize ||
			    mi->mi_curwrite != my_wsize ||
			    nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
				/*
				 * On read or write calls, return
				 * back to the vnode ops level if
				 * the transfer size changed.
				 */
				clfree_impl(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (ENFS_TRYAGAIN);
			}
#endif
		}
	} while (tryagain);

	if (status != RPC_SUCCESS) {
		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		if (status == RPC_CANTDECODERES ||
		    status == RPC_PROGUNAVAIL ||
		    status == RPC_PROCUNAVAIL ||
		    status == RPC_CANTDECODEARGS ||
		    status == RPC_PROGVERSMISMATCH)
			CLNT_GETERR(client, &rpcerr);
		else if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_aclnames[which], bufp);
			if (curproc->p_sessp->s_vp != NULL) {
				if (!(mi->mi_flags & MI_NOPRINT)) {
					uprintf("NFS_ACL%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_aclnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_aclnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (curproc->p_sessp->s_vp != NULL) {
				if (!(mi->mi_flags & MI_NOPRINT))
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_aclnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI_DOWN;
			if (mi->mi_flags & MI_PRINTED) {
				mi->mi_flags &= ~MI_PRINTED;
				mutex_exit(&mi->mi_lock);
#ifdef DEBUG
				zprintf(zoneid, "NFS_ACL%d server %s ok\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				zprintf(zoneid, "NFS server %s ok\n",
				    svp->sv_hostname);
#endif
			} else
				mutex_exit(&mi->mi_lock);
		}

		if (*douprintf == 0) {
			if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
				uprintf("NFS_ACL%d server %s ok\n",
				    mi->mi_vers, svp->sv_hostname);
#else
				uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
			*douprintf = 1;
		}
	}

	clfree_impl(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

#if 0 /* notyet */
	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
	    rpcerr.re_errno);
#endif

	return (rpcerr.re_errno);
}

int
vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
{
	uint_t mask = vap->va_mask;

	if (!(mask & AT_MODE))
		sa->sa_mode = (uint32_t)-1;
	else
		sa->sa_mode = vap->va_mode;
	if (!(mask & AT_UID))
		sa->sa_uid = (uint32_t)-1;
	else
		sa->sa_uid = (uint32_t)vap->va_uid;
	if (!(mask & AT_GID))
		sa->sa_gid = (uint32_t)-1;
	else
		sa->sa_gid = (uint32_t)vap->va_gid;
	if (!(mask & AT_SIZE))
		sa->sa_size = (uint32_t)-1;
	else
		sa->sa_size = (uint32_t)vap->va_size;
	if (!(mask & AT_ATIME))
		sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			return (EOVERFLOW);
		}
		sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
		sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
	}
	if (!(mask & AT_MTIME))
		sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
	else {
		/* check time validity */
		if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
			return (EOVERFLOW);
		}
		sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
		sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
	}
	return (0);
}
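/*
 * Usage sketch (added commentary, not from the original source): a
 * hypothetical caller truncating a file over NFSv2 would fill in only the
 * size attribute and let vattr_to_sattr() mark everything else as
 * "don't change" with -1:
 *
 *	struct vattr va;
 *	struct nfssattr sa;
 *
 *	va.va_mask = AT_SIZE;
 *	va.va_size = 0;
 *	if (vattr_to_sattr(&va, &sa) == 0) {
 *		// sa.sa_size == 0, all other fields set to -1
 *	}
 *
 * vattr_to_sattr3() below plays the same role for the NFSv3 sattr3
 * structure, using explicit set_it/DONT_CHANGE markers instead of -1.
 */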
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2061 return (EOVERFLOW); 2062 } 2063 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2064 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2065 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2066 } 2067 return (0); 2068 } 2069 2070 void 2071 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2072 { 2073 2074 da->da_fhandle = VTOFH(dvp); 2075 da->da_name = nm; 2076 da->da_flags = 0; 2077 } 2078 2079 void 2080 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2081 { 2082 2083 da->dirp = VTOFH3(dvp); 2084 da->name = nm; 2085 } 2086 2087 int 2088 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2089 { 2090 int error; 2091 rnode_t *rp; 2092 struct vattr va; 2093 2094 va.va_mask = AT_MODE | AT_GID; 2095 error = VOP_GETATTR(dvp, &va, 0, cr); 2096 if (error) 2097 return (error); 2098 2099 /* 2100 * To determine the expected group-id of the created file: 2101 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2102 * GRPID option, and the directory's set-gid bit is clear, 2103 * then use the process's gid. 2104 * 2) Otherwise, set the group-id to the gid of the parent directory. 2105 */ 2106 rp = VTOR(dvp); 2107 mutex_enter(&rp->r_statelock); 2108 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2109 *gidp = crgetgid(cr); 2110 else 2111 *gidp = va.va_gid; 2112 mutex_exit(&rp->r_statelock); 2113 return (0); 2114 } 2115 2116 int 2117 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2118 { 2119 int error; 2120 struct vattr va; 2121 2122 va.va_mask = AT_MODE; 2123 error = VOP_GETATTR(dvp, &va, 0, cr); 2124 if (error) 2125 return (error); 2126 2127 /* 2128 * Modify the expected mode (om) so that the set-gid bit matches 2129 * that of the parent directory (dvp). 2130 */ 2131 if (va.va_mode & VSGID) 2132 *omp |= VSGID; 2133 else 2134 *omp &= ~VSGID; 2135 return (0); 2136 } 2137 2138 void 2139 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2140 { 2141 2142 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2143 if (!(vp->v_flag & VSWAPLIKE)) { 2144 mutex_enter(&vp->v_lock); 2145 vp->v_flag |= VSWAPLIKE; 2146 mutex_exit(&vp->v_lock); 2147 } 2148 } else { 2149 if (vp->v_flag & VSWAPLIKE) { 2150 mutex_enter(&vp->v_lock); 2151 vp->v_flag &= ~VSWAPLIKE; 2152 mutex_exit(&vp->v_lock); 2153 } 2154 } 2155 } 2156 2157 /* 2158 * Free the resources associated with an rnode. 2159 */ 2160 static void 2161 rinactive(rnode_t *rp, cred_t *cr) 2162 { 2163 vnode_t *vp; 2164 cred_t *cred; 2165 char *contents; 2166 int size; 2167 vsecattr_t *vsp; 2168 int error; 2169 nfs3_pathconf_info *info; 2170 2171 /* 2172 * Before freeing anything, wait until all asynchronous 2173 * activity is done on this rnode. This will allow all 2174 * asynchronous read ahead and write behind i/o's to 2175 * finish. 2176 */ 2177 mutex_enter(&rp->r_statelock); 2178 while (rp->r_count > 0) 2179 cv_wait(&rp->r_cv, &rp->r_statelock); 2180 mutex_exit(&rp->r_statelock); 2181 2182 /* 2183 * Flush and invalidate all pages associated with the vnode. 
2184 */ 2185 vp = RTOV(rp); 2186 if (vn_has_cached_data(vp)) { 2187 ASSERT(vp->v_type != VCHR); 2188 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2189 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr); 2190 if (error && (error == ENOSPC || error == EDQUOT)) { 2191 mutex_enter(&rp->r_statelock); 2192 if (!rp->r_error) 2193 rp->r_error = error; 2194 mutex_exit(&rp->r_statelock); 2195 } 2196 } 2197 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2198 } 2199 2200 /* 2201 * Free any held credentials and caches which may be associated 2202 * with this rnode. 2203 */ 2204 mutex_enter(&rp->r_statelock); 2205 cred = rp->r_cred; 2206 rp->r_cred = NULL; 2207 contents = rp->r_symlink.contents; 2208 size = rp->r_symlink.size; 2209 rp->r_symlink.contents = NULL; 2210 vsp = rp->r_secattr; 2211 rp->r_secattr = NULL; 2212 info = rp->r_pathconf; 2213 rp->r_pathconf = NULL; 2214 mutex_exit(&rp->r_statelock); 2215 2216 /* 2217 * Free the held credential. 2218 */ 2219 if (cred != NULL) 2220 crfree(cred); 2221 2222 /* 2223 * Free the access cache entries. 2224 */ 2225 (void) nfs_access_purge_rp(rp); 2226 2227 /* 2228 * Free the readdir cache entries. 2229 */ 2230 if (HAVE_RDDIR_CACHE(rp)) 2231 nfs_purge_rddir_cache(vp); 2232 2233 /* 2234 * Free the symbolic link cache. 2235 */ 2236 if (contents != NULL) { 2237 2238 kmem_free((void *)contents, size); 2239 } 2240 2241 /* 2242 * Free any cached ACL. 2243 */ 2244 if (vsp != NULL) 2245 nfs_acl_free(vsp); 2246 2247 /* 2248 * Free any cached pathconf information. 2249 */ 2250 if (info != NULL) 2251 kmem_free(info, sizeof (*info)); 2252 } 2253 2254 /* 2255 * Return a vnode for the given NFS Version 2 file handle. 2256 * If no rnode exists for this fhandle, create one and put it 2257 * into the hash queues. If the rnode for this fhandle 2258 * already exists, return it. 2259 * 2260 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2261 */ 2262 vnode_t * 2263 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2264 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2265 { 2266 int newnode; 2267 int index; 2268 vnode_t *vp; 2269 nfs_fhandle nfh; 2270 vattr_t va; 2271 2272 nfh.fh_len = NFS_FHSIZE; 2273 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2274 2275 index = rtablehash(&nfh); 2276 rw_enter(&rtable[index].r_lock, RW_READER); 2277 2278 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2279 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2280 2281 if (attr != NULL) { 2282 if (!newnode) { 2283 rw_exit(&rtable[index].r_lock); 2284 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2285 } else { 2286 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2287 vp->v_type = VBAD; 2288 else 2289 vp->v_type = n2v_type(attr); 2290 /* 2291 * A translation here seems to be necessary 2292 * because this function can be called 2293 * with `attr' that has come from the wire, 2294 * and been operated on by vattr_to_nattr(). 2295 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2296 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2297 * ->makenfsnode(). 2298 */ 2299 if ((attr->na_rdev & 0xffff0000) == 0) 2300 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2301 else 2302 vp->v_rdev = expldev(n2v_rdev(attr)); 2303 nfs_attrcache(vp, attr, t); 2304 rw_exit(&rtable[index].r_lock); 2305 } 2306 } else { 2307 if (newnode) { 2308 PURGE_ATTRCACHE(vp); 2309 } 2310 rw_exit(&rtable[index].r_lock); 2311 } 2312 2313 return (vp); 2314 } 2315 2316 /* 2317 * Return a vnode for the given NFS Version 3 file handle. 
2318 * If no rnode exists for this fhandle, create one and put it 2319 * into the hash queues. If the rnode for this fhandle 2320 * already exists, return it. 2321 * 2322 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2323 */ 2324 vnode_t * 2325 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2326 cred_t *cr, char *dnm, char *nm) 2327 { 2328 int newnode; 2329 int index; 2330 vnode_t *vp; 2331 2332 index = rtablehash((nfs_fhandle *)fh); 2333 rw_enter(&rtable[index].r_lock, RW_READER); 2334 2335 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2336 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2337 dnm, nm); 2338 2339 if (vap == NULL) { 2340 if (newnode) { 2341 PURGE_ATTRCACHE(vp); 2342 } 2343 rw_exit(&rtable[index].r_lock); 2344 return (vp); 2345 } 2346 2347 if (!newnode) { 2348 rw_exit(&rtable[index].r_lock); 2349 nfs_attr_cache(vp, vap, t, cr); 2350 } else { 2351 rnode_t *rp = VTOR(vp); 2352 2353 vp->v_type = vap->va_type; 2354 vp->v_rdev = vap->va_rdev; 2355 2356 mutex_enter(&rp->r_statelock); 2357 if (rp->r_mtime <= t) 2358 nfs_attrcache_va(vp, vap); 2359 mutex_exit(&rp->r_statelock); 2360 rw_exit(&rtable[index].r_lock); 2361 } 2362 2363 return (vp); 2364 } 2365 2366 vnode_t * 2367 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2368 cred_t *cr, char *dnm, char *nm) 2369 { 2370 int newnode; 2371 int index; 2372 vnode_t *vp; 2373 vattr_t va; 2374 2375 index = rtablehash((nfs_fhandle *)fh); 2376 rw_enter(&rtable[index].r_lock, RW_READER); 2377 2378 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2379 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2380 dnm, nm); 2381 2382 if (attr == NULL) { 2383 if (newnode) { 2384 PURGE_ATTRCACHE(vp); 2385 } 2386 rw_exit(&rtable[index].r_lock); 2387 return (vp); 2388 } 2389 2390 if (!newnode) { 2391 rw_exit(&rtable[index].r_lock); 2392 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2393 } else { 2394 if (attr->type < NF3REG || attr->type > NF3FIFO) 2395 vp->v_type = VBAD; 2396 else 2397 vp->v_type = nf3_to_vt[attr->type]; 2398 vp->v_rdev = makedevice(attr->rdev.specdata1, 2399 attr->rdev.specdata2); 2400 nfs3_attrcache(vp, attr, t); 2401 rw_exit(&rtable[index].r_lock); 2402 } 2403 2404 return (vp); 2405 } 2406 2407 /* 2408 * Read this comment before making changes to rtablehash()! 2409 * This is a hash function in which seemingly obvious and harmless 2410 * changes can cause escalations costing million dollars! 2411 * Know what you are doing. 2412 * 2413 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2414 * algorithm is currently detailed here: 2415 * 2416 * http://burtleburtle.net/bob/hash/doobs.html 2417 * 2418 * Of course, the above link may not be valid by the time you are reading 2419 * this, but suffice it to say that the one-at-a-time algorithm works well in 2420 * almost all cases. If you are changing the algorithm be sure to verify that 2421 * the hash algorithm still provides even distribution in all cases and with 2422 * any server returning filehandles in whatever order (sequential or random). 
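 *
 * For reference, the mixing steps coded below are (an illustrative
 * sketch; the function itself also masks the result with rtablemask):
 *
 *	hash = 0;
 *	for (i = 0; i < len; i++) {
 *		hash += key[i];
 *		hash += (hash << 10);
 *		hash ^= (hash >> 6);
 *	}
 *	hash += (hash << 3);
 *	hash ^= (hash >> 11);
 *	hash += (hash << 15);
 *
 * One way to evaluate a proposed change is to replay a large sample of
 * captured filehandles through a user-level copy of this function and
 * compare the resulting bucket occupancy against a uniform spread.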
2423 */ 2424 static int 2425 rtablehash(nfs_fhandle *fh) 2426 { 2427 ulong_t hash, len, i; 2428 char *key; 2429 2430 key = fh->fh_buf; 2431 len = (ulong_t)fh->fh_len; 2432 for (hash = 0, i = 0; i < len; i++) { 2433 hash += key[i]; 2434 hash += (hash << 10); 2435 hash ^= (hash >> 6); 2436 } 2437 hash += (hash << 3); 2438 hash ^= (hash >> 11); 2439 hash += (hash << 15); 2440 return (hash & rtablemask); 2441 } 2442 2443 static vnode_t * 2444 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2445 struct vnodeops *vops, 2446 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2447 int (*compar)(const void *, const void *), 2448 int *newnode, cred_t *cr, char *dnm, char *nm) 2449 { 2450 rnode_t *rp; 2451 rnode_t *trp; 2452 vnode_t *vp; 2453 mntinfo_t *mi; 2454 2455 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2456 2457 mi = VFTOMI(vfsp); 2458 start: 2459 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2460 vp = RTOV(rp); 2461 nfs_set_vroot(vp); 2462 *newnode = 0; 2463 return (vp); 2464 } 2465 rw_exit(&rhtp->r_lock); 2466 2467 mutex_enter(&rpfreelist_lock); 2468 if (rpfreelist != NULL && rnew >= nrnode) { 2469 rp = rpfreelist; 2470 rp_rmfree(rp); 2471 mutex_exit(&rpfreelist_lock); 2472 2473 vp = RTOV(rp); 2474 2475 if (rp->r_flags & RHASHED) { 2476 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2477 mutex_enter(&vp->v_lock); 2478 if (vp->v_count > 1) { 2479 vp->v_count--; 2480 mutex_exit(&vp->v_lock); 2481 rw_exit(&rp->r_hashq->r_lock); 2482 rw_enter(&rhtp->r_lock, RW_READER); 2483 goto start; 2484 } 2485 mutex_exit(&vp->v_lock); 2486 rp_rmhash_locked(rp); 2487 rw_exit(&rp->r_hashq->r_lock); 2488 } 2489 2490 rinactive(rp, cr); 2491 2492 mutex_enter(&vp->v_lock); 2493 if (vp->v_count > 1) { 2494 vp->v_count--; 2495 mutex_exit(&vp->v_lock); 2496 rw_enter(&rhtp->r_lock, RW_READER); 2497 goto start; 2498 } 2499 mutex_exit(&vp->v_lock); 2500 vn_invalid(vp); 2501 /* 2502 * destroy old locks before bzero'ing and 2503 * recreating the locks below. 2504 */ 2505 nfs_rw_destroy(&rp->r_rwlock); 2506 nfs_rw_destroy(&rp->r_lkserlock); 2507 mutex_destroy(&rp->r_statelock); 2508 cv_destroy(&rp->r_cv); 2509 cv_destroy(&rp->r_commit.c_cv); 2510 nfs_free_r_path(rp); 2511 avl_destroy(&rp->r_dir); 2512 /* 2513 * Make sure that if rnode is recycled then 2514 * VFS count is decremented properly before 2515 * reuse. 
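 *
 * The recycled vnode still carries the VFS_HOLD() taken when it was
 * bound to its previous filesystem; that hold is dropped here, and a
 * fresh VFS_HOLD(vfsp) is taken further below once the rnode has been
 * reinitialized for the new mount.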
2516 */ 2517 VFS_RELE(vp->v_vfsp); 2518 vn_reinit(vp); 2519 } else { 2520 vnode_t *new_vp; 2521 2522 mutex_exit(&rpfreelist_lock); 2523 2524 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2525 new_vp = vn_alloc(KM_SLEEP); 2526 2527 atomic_add_long((ulong_t *)&rnew, 1); 2528 #ifdef DEBUG 2529 clstat_debug.nrnode.value.ui64++; 2530 #endif 2531 vp = new_vp; 2532 } 2533 2534 bzero(rp, sizeof (*rp)); 2535 rp->r_vnode = vp; 2536 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2537 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2538 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2539 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2540 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2541 rp->r_fh.fh_len = fh->fh_len; 2542 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2543 rp->r_server = mi->mi_curr_serv; 2544 if (FAILOVER_MOUNT(mi)) { 2545 /* 2546 * If replicated servers, stash pathnames 2547 */ 2548 if (dnm != NULL && nm != NULL) { 2549 char *s, *p; 2550 uint_t len; 2551 2552 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2553 rp->r_path = kmem_alloc(len, KM_SLEEP); 2554 #ifdef DEBUG 2555 clstat_debug.rpath.value.ui64 += len; 2556 #endif 2557 s = rp->r_path; 2558 for (p = dnm; *p; p++) 2559 *s++ = *p; 2560 *s++ = '/'; 2561 for (p = nm; *p; p++) 2562 *s++ = *p; 2563 *s = '\0'; 2564 } else { 2565 /* special case for root */ 2566 rp->r_path = kmem_alloc(2, KM_SLEEP); 2567 #ifdef DEBUG 2568 clstat_debug.rpath.value.ui64 += 2; 2569 #endif 2570 *rp->r_path = '.'; 2571 *(rp->r_path + 1) = '\0'; 2572 } 2573 } 2574 VFS_HOLD(vfsp); 2575 rp->r_putapage = putapage; 2576 rp->r_hashq = rhtp; 2577 rp->r_flags = RREADDIRPLUS; 2578 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2579 offsetof(rddir_cache, tree)); 2580 vn_setops(vp, vops); 2581 vp->v_data = (caddr_t)rp; 2582 vp->v_vfsp = vfsp; 2583 vp->v_type = VNON; 2584 nfs_set_vroot(vp); 2585 2586 /* 2587 * There is a race condition if someone else 2588 * alloc's the rnode while no locks are held, so we 2589 * check again and recover if found. 2590 */ 2591 rw_enter(&rhtp->r_lock, RW_WRITER); 2592 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2593 vp = RTOV(trp); 2594 nfs_set_vroot(vp); 2595 *newnode = 0; 2596 rw_exit(&rhtp->r_lock); 2597 rp_addfree(rp, cr); 2598 rw_enter(&rhtp->r_lock, RW_READER); 2599 return (vp); 2600 } 2601 rp_addhash(rp); 2602 *newnode = 1; 2603 return (vp); 2604 } 2605 2606 static void 2607 nfs_set_vroot(vnode_t *vp) 2608 { 2609 rnode_t *rp; 2610 nfs_fhandle *rootfh; 2611 2612 rp = VTOR(vp); 2613 rootfh = &rp->r_server->sv_fhandle; 2614 if (rootfh->fh_len == rp->r_fh.fh_len && 2615 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2616 if (!(vp->v_flag & VROOT)) { 2617 mutex_enter(&vp->v_lock); 2618 vp->v_flag |= VROOT; 2619 mutex_exit(&vp->v_lock); 2620 } 2621 } 2622 } 2623 2624 static void 2625 nfs_free_r_path(rnode_t *rp) 2626 { 2627 char *path; 2628 size_t len; 2629 2630 path = rp->r_path; 2631 if (path) { 2632 rp->r_path = NULL; 2633 len = strlen(path) + 1; 2634 kmem_free(path, len); 2635 #ifdef DEBUG 2636 clstat_debug.rpath.value.ui64 -= len; 2637 #endif 2638 } 2639 } 2640 2641 /* 2642 * Put an rnode on the free list. 2643 * 2644 * Rnodes which were allocated above and beyond the normal limit 2645 * are immediately freed. 
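 *
 * In outline (a simplification of the checks below):
 *
 *	if ((too many rnodes || !RHASHED || r_error || vfs unmounted) &&
 *	    r_count == 0)
 *		destroy the rnode now;
 *	else
 *		link it onto the freelist: at the front if it caches no
 *		pages, readdir entries, symlink text, ACL or pathconf
 *		data (cheapest to reuse), otherwise at the back.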
2646 */ 2647 void 2648 rp_addfree(rnode_t *rp, cred_t *cr) 2649 { 2650 vnode_t *vp; 2651 struct vfs *vfsp; 2652 2653 vp = RTOV(rp); 2654 ASSERT(vp->v_count >= 1); 2655 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2656 2657 /* 2658 * If we have too many rnodes allocated and there are no 2659 * references to this rnode, or if the rnode is no longer 2660 * accessible because it does not reside in the hash queues, 2661 * or if an i/o error occurred while writing to the file, 2662 * then just free it instead of putting it on the rnode 2663 * freelist. 2664 */ 2665 vfsp = vp->v_vfsp; 2666 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || 2667 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 2668 if (rp->r_flags & RHASHED) { 2669 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2670 mutex_enter(&vp->v_lock); 2671 if (vp->v_count > 1) { 2672 vp->v_count--; 2673 mutex_exit(&vp->v_lock); 2674 rw_exit(&rp->r_hashq->r_lock); 2675 return; 2676 } 2677 mutex_exit(&vp->v_lock); 2678 rp_rmhash_locked(rp); 2679 rw_exit(&rp->r_hashq->r_lock); 2680 } 2681 2682 rinactive(rp, cr); 2683 2684 /* 2685 * Recheck the vnode reference count. We need to 2686 * make sure that another reference has not been 2687 * acquired while we were not holding v_lock. The 2688 * rnode is not in the rnode hash queues, so the 2689 * only way for a reference to have been acquired 2690 * is for a VOP_PUTPAGE because the rnode was marked 2691 * with RDIRTY or for a modified page. This 2692 * reference may have been acquired before our call 2693 * to rinactive. The i/o may have been completed, 2694 * thus allowing rinactive to complete, but the 2695 * reference to the vnode may not have been released 2696 * yet. In any case, the rnode can not be destroyed 2697 * until the other references to this vnode have been 2698 * released. The other references will take care of 2699 * either destroying the rnode or placing it on the 2700 * rnode freelist. If there are no other references, 2701 * then the rnode may be safely destroyed. 2702 */ 2703 mutex_enter(&vp->v_lock); 2704 if (vp->v_count > 1) { 2705 vp->v_count--; 2706 mutex_exit(&vp->v_lock); 2707 return; 2708 } 2709 mutex_exit(&vp->v_lock); 2710 2711 destroy_rnode(rp); 2712 return; 2713 } 2714 2715 /* 2716 * Lock the hash queue and then recheck the reference count 2717 * to ensure that no other threads have acquired a reference 2718 * which would indicate that the rnode should not be placed on the 2719 * freelist. If another reference has been acquired, then 2720 * just release this one and let the other thread complete 2721 * the processing of adding this rnode to the freelist. 2722 */ 2723 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2724 2725 mutex_enter(&vp->v_lock); 2726 if (vp->v_count > 1) { 2727 vp->v_count--; 2728 mutex_exit(&vp->v_lock); 2729 rw_exit(&rp->r_hashq->r_lock); 2730 return; 2731 } 2732 mutex_exit(&vp->v_lock); 2733 2734 /* 2735 * If there is no cached data or metadata for this file, then 2736 * put the rnode on the front of the freelist so that it will 2737 * be reused before other rnodes which may have cached data or 2738 * metadata associated with them.
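 *
 * The freelist is a circular, doubly linked list with rpfreelist
 * pointing at the head (the next rnode to be reclaimed).  The code
 * below is a standard tail insert:
 *
 *	rp->r_freef = rpfreelist;		new tail -> head
 *	rp->r_freeb = rpfreelist->r_freeb;	new tail -> old tail
 *	rpfreelist->r_freeb->r_freef = rp;	old tail -> new tail
 *	rpfreelist->r_freeb = rp;		head -> new tail
 *
 * and "front of the freelist" is the same insert followed by
 * rpfreelist = rp.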
2739 */ 2740 mutex_enter(&rpfreelist_lock); 2741 if (rpfreelist == NULL) { 2742 rp->r_freef = rp; 2743 rp->r_freeb = rp; 2744 rpfreelist = rp; 2745 } else { 2746 rp->r_freef = rpfreelist; 2747 rp->r_freeb = rpfreelist->r_freeb; 2748 rpfreelist->r_freeb->r_freef = rp; 2749 rpfreelist->r_freeb = rp; 2750 if (!vn_has_cached_data(vp) && 2751 !HAVE_RDDIR_CACHE(rp) && 2752 rp->r_symlink.contents == NULL && 2753 rp->r_secattr == NULL && 2754 rp->r_pathconf == NULL) 2755 rpfreelist = rp; 2756 } 2757 mutex_exit(&rpfreelist_lock); 2758 2759 rw_exit(&rp->r_hashq->r_lock); 2760 } 2761 2762 /* 2763 * Remove an rnode from the free list. 2764 * 2765 * The caller must be holding rpfreelist_lock and the rnode 2766 * must be on the freelist. 2767 */ 2768 static void 2769 rp_rmfree(rnode_t *rp) 2770 { 2771 2772 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2773 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2774 2775 if (rp == rpfreelist) { 2776 rpfreelist = rp->r_freef; 2777 if (rp == rpfreelist) 2778 rpfreelist = NULL; 2779 } 2780 2781 rp->r_freeb->r_freef = rp->r_freef; 2782 rp->r_freef->r_freeb = rp->r_freeb; 2783 2784 rp->r_freef = rp->r_freeb = NULL; 2785 } 2786 2787 /* 2788 * Put a rnode in the hash table. 2789 * 2790 * The caller must be holding the exclusive hash queue lock. 2791 */ 2792 static void 2793 rp_addhash(rnode_t *rp) 2794 { 2795 2796 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2797 ASSERT(!(rp->r_flags & RHASHED)); 2798 2799 rp->r_hashf = rp->r_hashq->r_hashf; 2800 rp->r_hashq->r_hashf = rp; 2801 rp->r_hashb = (rnode_t *)rp->r_hashq; 2802 rp->r_hashf->r_hashb = rp; 2803 2804 mutex_enter(&rp->r_statelock); 2805 rp->r_flags |= RHASHED; 2806 mutex_exit(&rp->r_statelock); 2807 } 2808 2809 /* 2810 * Remove a rnode from the hash table. 2811 * 2812 * The caller must be holding the hash queue lock. 2813 */ 2814 static void 2815 rp_rmhash_locked(rnode_t *rp) 2816 { 2817 2818 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2819 ASSERT(rp->r_flags & RHASHED); 2820 2821 rp->r_hashb->r_hashf = rp->r_hashf; 2822 rp->r_hashf->r_hashb = rp->r_hashb; 2823 2824 mutex_enter(&rp->r_statelock); 2825 rp->r_flags &= ~RHASHED; 2826 mutex_exit(&rp->r_statelock); 2827 } 2828 2829 /* 2830 * Remove a rnode from the hash table. 2831 * 2832 * The caller must not be holding the hash queue lock. 2833 */ 2834 void 2835 rp_rmhash(rnode_t *rp) 2836 { 2837 2838 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2839 rp_rmhash_locked(rp); 2840 rw_exit(&rp->r_hashq->r_lock); 2841 } 2842 2843 /* 2844 * Lookup a rnode by fhandle. 2845 * 2846 * The caller must be holding the hash queue lock, either shared or exclusive. 2847 */ 2848 static rnode_t * 2849 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2850 { 2851 rnode_t *rp; 2852 vnode_t *vp; 2853 2854 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2855 2856 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2857 vp = RTOV(rp); 2858 if (vp->v_vfsp == vfsp && 2859 rp->r_fh.fh_len == fh->fh_len && 2860 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2861 /* 2862 * remove rnode from free list, if necessary. 2863 */ 2864 if (rp->r_freef != NULL) { 2865 mutex_enter(&rpfreelist_lock); 2866 /* 2867 * If the rnode is on the freelist, 2868 * then remove it and use that reference 2869 * as the new reference. Otherwise, 2870 * need to increment the reference count. 
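 *
 * Note that r_freef was first tested above without holding
 * rpfreelist_lock; it is tested again here under the lock because
 * another thread may have removed the rnode from the freelist (and
 * claimed its reference) in the meantime.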
2871 */ 2872 if (rp->r_freef != NULL) { 2873 rp_rmfree(rp); 2874 mutex_exit(&rpfreelist_lock); 2875 } else { 2876 mutex_exit(&rpfreelist_lock); 2877 VN_HOLD(vp); 2878 } 2879 } else 2880 VN_HOLD(vp); 2881 return (rp); 2882 } 2883 } 2884 return (NULL); 2885 } 2886 2887 /* 2888 * Return 1 if there is a active vnode belonging to this vfs in the 2889 * rtable cache. 2890 * 2891 * Several of these checks are done without holding the usual 2892 * locks. This is safe because destroy_rtable(), rp_addfree(), 2893 * etc. will redo the necessary checks before actually destroying 2894 * any rnodes. 2895 */ 2896 int 2897 check_rtable(struct vfs *vfsp) 2898 { 2899 int index; 2900 rnode_t *rp; 2901 vnode_t *vp; 2902 2903 for (index = 0; index < rtablesize; index++) { 2904 rw_enter(&rtable[index].r_lock, RW_READER); 2905 for (rp = rtable[index].r_hashf; 2906 rp != (rnode_t *)(&rtable[index]); 2907 rp = rp->r_hashf) { 2908 vp = RTOV(rp); 2909 if (vp->v_vfsp == vfsp) { 2910 if (rp->r_freef == NULL || 2911 (vn_has_cached_data(vp) && 2912 (rp->r_flags & RDIRTY)) || 2913 rp->r_count > 0) { 2914 rw_exit(&rtable[index].r_lock); 2915 return (1); 2916 } 2917 } 2918 } 2919 rw_exit(&rtable[index].r_lock); 2920 } 2921 return (0); 2922 } 2923 2924 /* 2925 * Destroy inactive vnodes from the hash queues which belong to this 2926 * vfs. It is essential that we destroy all inactive vnodes during a 2927 * forced unmount as well as during a normal unmount. 2928 */ 2929 void 2930 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2931 { 2932 int index; 2933 rnode_t *rp; 2934 rnode_t *rlist; 2935 rnode_t *r_hashf; 2936 vnode_t *vp; 2937 2938 rlist = NULL; 2939 2940 for (index = 0; index < rtablesize; index++) { 2941 rw_enter(&rtable[index].r_lock, RW_WRITER); 2942 for (rp = rtable[index].r_hashf; 2943 rp != (rnode_t *)(&rtable[index]); 2944 rp = r_hashf) { 2945 /* save the hash pointer before destroying */ 2946 r_hashf = rp->r_hashf; 2947 vp = RTOV(rp); 2948 if (vp->v_vfsp == vfsp) { 2949 mutex_enter(&rpfreelist_lock); 2950 if (rp->r_freef != NULL) { 2951 rp_rmfree(rp); 2952 mutex_exit(&rpfreelist_lock); 2953 rp_rmhash_locked(rp); 2954 rp->r_hashf = rlist; 2955 rlist = rp; 2956 } else 2957 mutex_exit(&rpfreelist_lock); 2958 } 2959 } 2960 rw_exit(&rtable[index].r_lock); 2961 } 2962 2963 for (rp = rlist; rp != NULL; rp = rlist) { 2964 rlist = rp->r_hashf; 2965 /* 2966 * This call to rp_addfree will end up destroying the 2967 * rnode, but in a safe way with the appropriate set 2968 * of checks done. 2969 */ 2970 rp_addfree(rp, cr); 2971 } 2972 2973 } 2974 2975 /* 2976 * This routine destroys all the resources associated with the rnode 2977 * and then the rnode itself. 
2978 */ 2979 static void 2980 destroy_rnode(rnode_t *rp) 2981 { 2982 vnode_t *vp; 2983 vfs_t *vfsp; 2984 2985 vp = RTOV(rp); 2986 vfsp = vp->v_vfsp; 2987 2988 ASSERT(vp->v_count == 1); 2989 ASSERT(rp->r_count == 0); 2990 ASSERT(rp->r_lmpl == NULL); 2991 ASSERT(rp->r_mapcnt == 0); 2992 ASSERT(!(rp->r_flags & RHASHED)); 2993 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2994 atomic_add_long((ulong_t *)&rnew, -1); 2995 #ifdef DEBUG 2996 clstat_debug.nrnode.value.ui64--; 2997 #endif 2998 nfs_rw_destroy(&rp->r_rwlock); 2999 nfs_rw_destroy(&rp->r_lkserlock); 3000 mutex_destroy(&rp->r_statelock); 3001 cv_destroy(&rp->r_cv); 3002 cv_destroy(&rp->r_commit.c_cv); 3003 if (rp->r_flags & RDELMAPLIST) 3004 list_destroy(&rp->r_indelmap); 3005 nfs_free_r_path(rp); 3006 avl_destroy(&rp->r_dir); 3007 vn_invalid(vp); 3008 vn_free(vp); 3009 kmem_cache_free(rnode_cache, rp); 3010 VFS_RELE(vfsp); 3011 } 3012 3013 /* 3014 * Flush all vnodes in this (or every) vfs. 3015 * Used by nfs_sync and by nfs_unmount. 3016 */ 3017 void 3018 rflush(struct vfs *vfsp, cred_t *cr) 3019 { 3020 int index; 3021 rnode_t *rp; 3022 vnode_t *vp, **vplist; 3023 long num, cnt; 3024 3025 /* 3026 * Check to see whether there is anything to do. 3027 */ 3028 num = rnew; 3029 if (num == 0) 3030 return; 3031 3032 /* 3033 * Allocate a slot for all currently active rnodes on the 3034 * supposition that they all may need flushing. 3035 */ 3036 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3037 cnt = 0; 3038 3039 /* 3040 * Walk the hash queues looking for rnodes with page 3041 * lists associated with them. Make a list of these 3042 * files. 3043 */ 3044 for (index = 0; index < rtablesize; index++) { 3045 rw_enter(&rtable[index].r_lock, RW_READER); 3046 for (rp = rtable[index].r_hashf; 3047 rp != (rnode_t *)(&rtable[index]); 3048 rp = rp->r_hashf) { 3049 vp = RTOV(rp); 3050 /* 3051 * Don't bother sync'ing a vp if it 3052 * is part of virtual swap device or 3053 * if VFS is read-only 3054 */ 3055 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3056 continue; 3057 /* 3058 * If flushing all mounted file systems or 3059 * the vnode belongs to this vfs, has pages 3060 * and is marked as either dirty or mmap'd, 3061 * hold and add this vnode to the list of 3062 * vnodes to flush. 3063 */ 3064 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3065 vn_has_cached_data(vp) && 3066 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3067 VN_HOLD(vp); 3068 vplist[cnt++] = vp; 3069 if (cnt == num) { 3070 rw_exit(&rtable[index].r_lock); 3071 goto toomany; 3072 } 3073 } 3074 } 3075 rw_exit(&rtable[index].r_lock); 3076 } 3077 toomany: 3078 3079 /* 3080 * Flush and release all of the files on the list. 3081 */ 3082 while (cnt-- > 0) { 3083 vp = vplist[cnt]; 3084 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr); 3085 VN_RELE(vp); 3086 } 3087 3088 /* 3089 * Free the space allocated to hold the list. 3090 */ 3091 kmem_free(vplist, num * sizeof (*vplist)); 3092 } 3093 3094 /* 3095 * This probably needs to be larger than or equal to 3096 * log2(sizeof (struct rnode)) due to the way that rnodes are 3097 * allocated. 
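 *
 * The idea is that rnodes come out of a kmem cache, so the low-order
 * bits of an rnode pointer are mostly a function of object size and
 * alignment and carry little information.  Shifting them away first
 * lets
 *
 *	(((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & acachemask
 *
 * hash on the pointer bits that actually differ between rnodes, with
 * the uid folded in so that different credentials for the same file
 * tend to land in different buckets.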
3098 */ 3099 #define ACACHE_SHIFT_BITS 9 3100 3101 static int 3102 acachehash(rnode_t *rp, cred_t *cr) 3103 { 3104 3105 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3106 acachemask); 3107 } 3108 3109 #ifdef DEBUG 3110 static long nfs_access_cache_hits = 0; 3111 static long nfs_access_cache_misses = 0; 3112 #endif 3113 3114 nfs_access_type_t 3115 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3116 { 3117 vnode_t *vp; 3118 acache_t *ap; 3119 acache_hash_t *hp; 3120 nfs_access_type_t all; 3121 3122 vp = RTOV(rp); 3123 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3124 return (NFS_ACCESS_UNKNOWN); 3125 3126 if (rp->r_acache != NULL) { 3127 hp = &acache[acachehash(rp, cr)]; 3128 rw_enter(&hp->lock, RW_READER); 3129 ap = hp->next; 3130 while (ap != (acache_t *)hp) { 3131 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3132 if ((ap->known & acc) == acc) { 3133 #ifdef DEBUG 3134 nfs_access_cache_hits++; 3135 #endif 3136 if ((ap->allowed & acc) == acc) 3137 all = NFS_ACCESS_ALLOWED; 3138 else 3139 all = NFS_ACCESS_DENIED; 3140 } else { 3141 #ifdef DEBUG 3142 nfs_access_cache_misses++; 3143 #endif 3144 all = NFS_ACCESS_UNKNOWN; 3145 } 3146 rw_exit(&hp->lock); 3147 return (all); 3148 } 3149 ap = ap->next; 3150 } 3151 rw_exit(&hp->lock); 3152 } 3153 3154 #ifdef DEBUG 3155 nfs_access_cache_misses++; 3156 #endif 3157 return (NFS_ACCESS_UNKNOWN); 3158 } 3159 3160 void 3161 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3162 { 3163 acache_t *ap; 3164 acache_t *nap; 3165 acache_hash_t *hp; 3166 3167 hp = &acache[acachehash(rp, cr)]; 3168 3169 /* 3170 * Allocate now assuming that mostly an allocation will be 3171 * required. This allows the allocation to happen without 3172 * holding the hash bucket locked. 3173 */ 3174 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3175 if (nap != NULL) { 3176 nap->known = acc; 3177 nap->allowed = resacc; 3178 nap->rnode = rp; 3179 crhold(cr); 3180 nap->cred = cr; 3181 nap->hashq = hp; 3182 } 3183 3184 rw_enter(&hp->lock, RW_WRITER); 3185 3186 if (rp->r_acache != NULL) { 3187 ap = hp->next; 3188 while (ap != (acache_t *)hp) { 3189 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3190 ap->known |= acc; 3191 ap->allowed &= ~acc; 3192 ap->allowed |= resacc; 3193 rw_exit(&hp->lock); 3194 if (nap != NULL) { 3195 crfree(nap->cred); 3196 kmem_cache_free(acache_cache, nap); 3197 } 3198 return; 3199 } 3200 ap = ap->next; 3201 } 3202 } 3203 3204 if (nap != NULL) { 3205 #ifdef DEBUG 3206 clstat_debug.access.value.ui64++; 3207 #endif 3208 nap->next = hp->next; 3209 hp->next = nap; 3210 nap->next->prev = nap; 3211 nap->prev = (acache_t *)hp; 3212 3213 mutex_enter(&rp->r_statelock); 3214 nap->list = rp->r_acache; 3215 rp->r_acache = nap; 3216 mutex_exit(&rp->r_statelock); 3217 } 3218 3219 rw_exit(&hp->lock); 3220 } 3221 3222 int 3223 nfs_access_purge_rp(rnode_t *rp) 3224 { 3225 acache_t *ap; 3226 acache_t *tmpap; 3227 acache_t *rplist; 3228 3229 /* 3230 * If there aren't any cached entries, then there is nothing 3231 * to free. 3232 */ 3233 if (rp->r_acache == NULL) 3234 return (0); 3235 3236 mutex_enter(&rp->r_statelock); 3237 rplist = rp->r_acache; 3238 rp->r_acache = NULL; 3239 mutex_exit(&rp->r_statelock); 3240 3241 /* 3242 * Loop through each entry in the list pointed to in the 3243 * rnode. Remove each of these entries from the hash 3244 * queue that it is on and remove it from the list in 3245 * the rnode. 
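 *
 * Each acache entry is linked twice: into a hash bucket through
 * next/prev (protected by that bucket's rwlock) and into the
 * per-rnode list through ap->list (protected by r_statelock).  The
 * per-rnode list was detached in one step above, so only the hash
 * unlinking remains to be done here.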
3246 */ 3247 for (ap = rplist; ap != NULL; ap = tmpap) { 3248 rw_enter(&ap->hashq->lock, RW_WRITER); 3249 ap->prev->next = ap->next; 3250 ap->next->prev = ap->prev; 3251 rw_exit(&ap->hashq->lock); 3252 3253 tmpap = ap->list; 3254 crfree(ap->cred); 3255 kmem_cache_free(acache_cache, ap); 3256 #ifdef DEBUG 3257 clstat_debug.access.value.ui64--; 3258 #endif 3259 } 3260 3261 return (1); 3262 } 3263 3264 static const char prefix[] = ".nfs"; 3265 3266 static kmutex_t newnum_lock; 3267 3268 int 3269 newnum(void) 3270 { 3271 static uint_t newnum = 0; 3272 uint_t id; 3273 3274 mutex_enter(&newnum_lock); 3275 if (newnum == 0) 3276 newnum = gethrestime_sec() & 0xffff; 3277 id = newnum++; 3278 mutex_exit(&newnum_lock); 3279 return (id); 3280 } 3281 3282 char * 3283 newname(void) 3284 { 3285 char *news; 3286 char *s; 3287 const char *p; 3288 uint_t id; 3289 3290 id = newnum(); 3291 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3292 s = news; 3293 p = prefix; 3294 while (*p != '\0') 3295 *s++ = *p++; 3296 while (id != 0) { 3297 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3298 id >>= 4; 3299 } 3300 *s = '\0'; 3301 return (news); 3302 } 3303 3304 int 3305 nfs_atoi(char *cp) 3306 { 3307 int n; 3308 3309 n = 0; 3310 while (*cp != '\0') { 3311 n = n * 10 + (*cp - '0'); 3312 cp++; 3313 } 3314 3315 return (n); 3316 } 3317 3318 /* 3319 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3320 * framework. 3321 */ 3322 static int 3323 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3324 { 3325 ksp->ks_snaptime = gethrtime(); 3326 if (rw == KSTAT_WRITE) { 3327 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3328 #ifdef DEBUG 3329 /* 3330 * Currently only the global zone can write to kstats, but we 3331 * add the check just for paranoia. 3332 */ 3333 if (INGLOBALZONE(curproc)) 3334 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3335 sizeof (clstat_debug)); 3336 #endif 3337 } else { 3338 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3339 #ifdef DEBUG 3340 /* 3341 * If we're displaying the "global" debug kstat values, we 3342 * display them as-is to all zones since in fact they apply to 3343 * the system as a whole. 
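 *
 * On DEBUG kernels the snapshot buffer is therefore laid out as the
 * per-zone counters (sizeof (clstat_tmpl) bytes) followed immediately
 * by the system-wide clstat_debug counters, which is also why
 * clinit_zone() sizes ndata from both templates.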
3344 */ 3345 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3346 sizeof (clstat_debug)); 3347 #endif 3348 } 3349 return (0); 3350 } 3351 3352 static void * 3353 clinit_zone(zoneid_t zoneid) 3354 { 3355 kstat_t *nfs_client_kstat; 3356 struct nfs_clnt *nfscl; 3357 uint_t ndata; 3358 3359 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3360 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3361 nfscl->nfscl_chtable = NULL; 3362 nfscl->nfscl_zoneid = zoneid; 3363 3364 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3365 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3366 #ifdef DEBUG 3367 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3368 #endif 3369 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3370 "misc", KSTAT_TYPE_NAMED, ndata, 3371 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3372 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3373 nfs_client_kstat->ks_snapshot = cl_snapshot; 3374 kstat_install(nfs_client_kstat); 3375 } 3376 mutex_enter(&nfs_clnt_list_lock); 3377 list_insert_head(&nfs_clnt_list, nfscl); 3378 mutex_exit(&nfs_clnt_list_lock); 3379 return (nfscl); 3380 } 3381 3382 /*ARGSUSED*/ 3383 static void 3384 clfini_zone(zoneid_t zoneid, void *arg) 3385 { 3386 struct nfs_clnt *nfscl = arg; 3387 chhead_t *chp, *next; 3388 3389 if (nfscl == NULL) 3390 return; 3391 mutex_enter(&nfs_clnt_list_lock); 3392 list_remove(&nfs_clnt_list, nfscl); 3393 mutex_exit(&nfs_clnt_list_lock); 3394 clreclaim_zone(nfscl, 0); 3395 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3396 ASSERT(chp->ch_list == NULL); 3397 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3398 next = chp->ch_next; 3399 kmem_free(chp, sizeof (*chp)); 3400 } 3401 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3402 mutex_destroy(&nfscl->nfscl_chtable_lock); 3403 kmem_free(nfscl, sizeof (*nfscl)); 3404 } 3405 3406 /* 3407 * Called by endpnt_destructor to make sure the client handles are 3408 * cleaned up before the RPC endpoints. This becomes a no-op if 3409 * clfini_zone (above) is called first. This function is needed 3410 * (rather than relying on clfini_zone to clean up) because the ZSD 3411 * callbacks have no ordering mechanism, so we have no way to ensure 3412 * that clfini_zone is called before endpnt_destructor. 
3413 */ 3414 void 3415 clcleanup_zone(zoneid_t zoneid) 3416 { 3417 struct nfs_clnt *nfscl; 3418 3419 mutex_enter(&nfs_clnt_list_lock); 3420 nfscl = list_head(&nfs_clnt_list); 3421 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3422 if (nfscl->nfscl_zoneid == zoneid) { 3423 clreclaim_zone(nfscl, 0); 3424 break; 3425 } 3426 } 3427 mutex_exit(&nfs_clnt_list_lock); 3428 } 3429 3430 int 3431 nfs_subrinit(void) 3432 { 3433 int i; 3434 ulong_t nrnode_max; 3435 3436 /* 3437 * Allocate and initialize the rnode hash queues 3438 */ 3439 if (nrnode <= 0) 3440 nrnode = ncsize; 3441 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3442 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3443 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3444 "setting nrnode to max value of %ld", nrnode_max); 3445 nrnode = nrnode_max; 3446 } 3447 3448 rtablesize = 1 << highbit(nrnode / hashlen); 3449 rtablemask = rtablesize - 1; 3450 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3451 for (i = 0; i < rtablesize; i++) { 3452 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3453 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3454 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3455 } 3456 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3457 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3458 3459 /* 3460 * Allocate and initialize the access cache 3461 */ 3462 3463 /* 3464 * Initial guess is one access cache entry per rnode unless 3465 * nacache is set to a non-zero value and then it is used to 3466 * indicate a guess at the number of access cache entries. 3467 */ 3468 if (nacache > 0) 3469 acachesize = 1 << highbit(nacache / hashlen); 3470 else 3471 acachesize = rtablesize; 3472 acachemask = acachesize - 1; 3473 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3474 for (i = 0; i < acachesize; i++) { 3475 acache[i].next = (acache_t *)&acache[i]; 3476 acache[i].prev = (acache_t *)&acache[i]; 3477 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3478 } 3479 acache_cache = kmem_cache_create("nfs_access_cache", 3480 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3481 /* 3482 * Allocate and initialize the client handle cache 3483 */ 3484 chtab_cache = kmem_cache_create("client_handle_cache", 3485 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, 3486 NULL, 0); 3487 /* 3488 * Initialize the list of per-zone client handles (and associated data). 3489 * This needs to be done before we call zone_key_create(). 3490 */ 3491 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3492 offsetof(struct nfs_clnt, nfscl_node)); 3493 /* 3494 * Initialize the zone_key for per-zone client handle lists. 
3495 */ 3496 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3497 /* 3498 * Initialize the various mutexes and reader/writer locks 3499 */ 3500 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3501 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3502 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3503 3504 /* 3505 * Assign unique major number for all nfs mounts 3506 */ 3507 if ((nfs_major = getudev()) == -1) { 3508 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3509 "nfs: init: can't get unique device number"); 3510 nfs_major = 0; 3511 } 3512 nfs_minor = 0; 3513 3514 if (nfs3_jukebox_delay == 0) 3515 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3516 3517 return (0); 3518 } 3519 3520 void 3521 nfs_subrfini(void) 3522 { 3523 int i; 3524 3525 /* 3526 * Deallocate the rnode hash queues 3527 */ 3528 kmem_cache_destroy(rnode_cache); 3529 3530 for (i = 0; i < rtablesize; i++) 3531 rw_destroy(&rtable[i].r_lock); 3532 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3533 3534 /* 3535 * Deallocated the access cache 3536 */ 3537 kmem_cache_destroy(acache_cache); 3538 3539 for (i = 0; i < acachesize; i++) 3540 rw_destroy(&acache[i].lock); 3541 kmem_free(acache, acachesize * sizeof (*acache)); 3542 3543 /* 3544 * Deallocate the client handle cache 3545 */ 3546 kmem_cache_destroy(chtab_cache); 3547 3548 /* 3549 * Destroy the various mutexes and reader/writer locks 3550 */ 3551 mutex_destroy(&rpfreelist_lock); 3552 mutex_destroy(&newnum_lock); 3553 mutex_destroy(&nfs_minor_lock); 3554 (void) zone_key_delete(nfsclnt_zone_key); 3555 } 3556 3557 enum nfsstat 3558 puterrno(int error) 3559 { 3560 3561 switch (error) { 3562 case EOPNOTSUPP: 3563 return (NFSERR_OPNOTSUPP); 3564 case ENAMETOOLONG: 3565 return (NFSERR_NAMETOOLONG); 3566 case ENOTEMPTY: 3567 return (NFSERR_NOTEMPTY); 3568 case EDQUOT: 3569 return (NFSERR_DQUOT); 3570 case ESTALE: 3571 return (NFSERR_STALE); 3572 case EREMOTE: 3573 return (NFSERR_REMOTE); 3574 case ENOSYS: 3575 return (NFSERR_OPNOTSUPP); 3576 case EOVERFLOW: 3577 return (NFSERR_INVAL); 3578 default: 3579 return ((enum nfsstat)error); 3580 } 3581 /* NOTREACHED */ 3582 } 3583 3584 int 3585 geterrno(enum nfsstat status) 3586 { 3587 3588 switch (status) { 3589 case NFSERR_OPNOTSUPP: 3590 return (EOPNOTSUPP); 3591 case NFSERR_NAMETOOLONG: 3592 return (ENAMETOOLONG); 3593 case NFSERR_NOTEMPTY: 3594 return (ENOTEMPTY); 3595 case NFSERR_DQUOT: 3596 return (EDQUOT); 3597 case NFSERR_STALE: 3598 return (ESTALE); 3599 case NFSERR_REMOTE: 3600 return (EREMOTE); 3601 case NFSERR_WFLUSH: 3602 return (EIO); 3603 default: 3604 return ((int)status); 3605 } 3606 /* NOTREACHED */ 3607 } 3608 3609 enum nfsstat3 3610 puterrno3(int error) 3611 { 3612 3613 #ifdef DEBUG 3614 switch (error) { 3615 case 0: 3616 return (NFS3_OK); 3617 case EPERM: 3618 return (NFS3ERR_PERM); 3619 case ENOENT: 3620 return (NFS3ERR_NOENT); 3621 case EIO: 3622 return (NFS3ERR_IO); 3623 case ENXIO: 3624 return (NFS3ERR_NXIO); 3625 case EACCES: 3626 return (NFS3ERR_ACCES); 3627 case EEXIST: 3628 return (NFS3ERR_EXIST); 3629 case EXDEV: 3630 return (NFS3ERR_XDEV); 3631 case ENODEV: 3632 return (NFS3ERR_NODEV); 3633 case ENOTDIR: 3634 return (NFS3ERR_NOTDIR); 3635 case EISDIR: 3636 return (NFS3ERR_ISDIR); 3637 case EINVAL: 3638 return (NFS3ERR_INVAL); 3639 case EFBIG: 3640 return (NFS3ERR_FBIG); 3641 case ENOSPC: 3642 return (NFS3ERR_NOSPC); 3643 case EROFS: 3644 return (NFS3ERR_ROFS); 3645 case EMLINK: 3646 return (NFS3ERR_MLINK); 3647 case ENAMETOOLONG: 3648 return (NFS3ERR_NAMETOOLONG); 3649 case 
ENOTEMPTY: 3650 return (NFS3ERR_NOTEMPTY); 3651 case EDQUOT: 3652 return (NFS3ERR_DQUOT); 3653 case ESTALE: 3654 return (NFS3ERR_STALE); 3655 case EREMOTE: 3656 return (NFS3ERR_REMOTE); 3657 case EOPNOTSUPP: 3658 return (NFS3ERR_NOTSUPP); 3659 case EOVERFLOW: 3660 return (NFS3ERR_INVAL); 3661 default: 3662 zcmn_err(getzoneid(), CE_WARN, 3663 "puterrno3: got error %d", error); 3664 return ((enum nfsstat3)error); 3665 } 3666 #else 3667 switch (error) { 3668 case ENAMETOOLONG: 3669 return (NFS3ERR_NAMETOOLONG); 3670 case ENOTEMPTY: 3671 return (NFS3ERR_NOTEMPTY); 3672 case EDQUOT: 3673 return (NFS3ERR_DQUOT); 3674 case ESTALE: 3675 return (NFS3ERR_STALE); 3676 case EOPNOTSUPP: 3677 return (NFS3ERR_NOTSUPP); 3678 case EREMOTE: 3679 return (NFS3ERR_REMOTE); 3680 case EOVERFLOW: 3681 return (NFS3ERR_INVAL); 3682 default: 3683 return ((enum nfsstat3)error); 3684 } 3685 #endif 3686 } 3687 3688 int 3689 geterrno3(enum nfsstat3 status) 3690 { 3691 3692 #ifdef DEBUG 3693 switch (status) { 3694 case NFS3_OK: 3695 return (0); 3696 case NFS3ERR_PERM: 3697 return (EPERM); 3698 case NFS3ERR_NOENT: 3699 return (ENOENT); 3700 case NFS3ERR_IO: 3701 return (EIO); 3702 case NFS3ERR_NXIO: 3703 return (ENXIO); 3704 case NFS3ERR_ACCES: 3705 return (EACCES); 3706 case NFS3ERR_EXIST: 3707 return (EEXIST); 3708 case NFS3ERR_XDEV: 3709 return (EXDEV); 3710 case NFS3ERR_NODEV: 3711 return (ENODEV); 3712 case NFS3ERR_NOTDIR: 3713 return (ENOTDIR); 3714 case NFS3ERR_ISDIR: 3715 return (EISDIR); 3716 case NFS3ERR_INVAL: 3717 return (EINVAL); 3718 case NFS3ERR_FBIG: 3719 return (EFBIG); 3720 case NFS3ERR_NOSPC: 3721 return (ENOSPC); 3722 case NFS3ERR_ROFS: 3723 return (EROFS); 3724 case NFS3ERR_MLINK: 3725 return (EMLINK); 3726 case NFS3ERR_NAMETOOLONG: 3727 return (ENAMETOOLONG); 3728 case NFS3ERR_NOTEMPTY: 3729 return (ENOTEMPTY); 3730 case NFS3ERR_DQUOT: 3731 return (EDQUOT); 3732 case NFS3ERR_STALE: 3733 return (ESTALE); 3734 case NFS3ERR_REMOTE: 3735 return (EREMOTE); 3736 case NFS3ERR_BADHANDLE: 3737 return (ESTALE); 3738 case NFS3ERR_NOT_SYNC: 3739 return (EINVAL); 3740 case NFS3ERR_BAD_COOKIE: 3741 return (ENOENT); 3742 case NFS3ERR_NOTSUPP: 3743 return (EOPNOTSUPP); 3744 case NFS3ERR_TOOSMALL: 3745 return (EINVAL); 3746 case NFS3ERR_SERVERFAULT: 3747 return (EIO); 3748 case NFS3ERR_BADTYPE: 3749 return (EINVAL); 3750 case NFS3ERR_JUKEBOX: 3751 return (ENXIO); 3752 default: 3753 zcmn_err(getzoneid(), CE_WARN, 3754 "geterrno3: got status %d", status); 3755 return ((int)status); 3756 } 3757 #else 3758 switch (status) { 3759 case NFS3ERR_NAMETOOLONG: 3760 return (ENAMETOOLONG); 3761 case NFS3ERR_NOTEMPTY: 3762 return (ENOTEMPTY); 3763 case NFS3ERR_DQUOT: 3764 return (EDQUOT); 3765 case NFS3ERR_STALE: 3766 case NFS3ERR_BADHANDLE: 3767 return (ESTALE); 3768 case NFS3ERR_NOTSUPP: 3769 return (EOPNOTSUPP); 3770 case NFS3ERR_REMOTE: 3771 return (EREMOTE); 3772 case NFS3ERR_NOT_SYNC: 3773 case NFS3ERR_TOOSMALL: 3774 case NFS3ERR_BADTYPE: 3775 return (EINVAL); 3776 case NFS3ERR_BAD_COOKIE: 3777 return (ENOENT); 3778 case NFS3ERR_SERVERFAULT: 3779 return (EIO); 3780 case NFS3ERR_JUKEBOX: 3781 return (ENXIO); 3782 default: 3783 return ((int)status); 3784 } 3785 #endif 3786 } 3787 3788 rddir_cache * 3789 rddir_cache_alloc(int flags) 3790 { 3791 rddir_cache *rc; 3792 3793 rc = kmem_alloc(sizeof (*rc), flags); 3794 if (rc != NULL) { 3795 rc->entries = NULL; 3796 rc->flags = RDDIR; 3797 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3798 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3799 rc->count = 1; 3800 #ifdef DEBUG 3801 
atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3802 #endif 3803 } 3804 return (rc); 3805 } 3806 3807 static void 3808 rddir_cache_free(rddir_cache *rc) 3809 { 3810 3811 #ifdef DEBUG 3812 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3813 #endif 3814 if (rc->entries != NULL) { 3815 #ifdef DEBUG 3816 rddir_cache_buf_free(rc->entries, rc->buflen); 3817 #else 3818 kmem_free(rc->entries, rc->buflen); 3819 #endif 3820 } 3821 cv_destroy(&rc->cv); 3822 mutex_destroy(&rc->lock); 3823 kmem_free(rc, sizeof (*rc)); 3824 } 3825 3826 void 3827 rddir_cache_hold(rddir_cache *rc) 3828 { 3829 3830 mutex_enter(&rc->lock); 3831 rc->count++; 3832 mutex_exit(&rc->lock); 3833 } 3834 3835 void 3836 rddir_cache_rele(rddir_cache *rc) 3837 { 3838 3839 mutex_enter(&rc->lock); 3840 ASSERT(rc->count > 0); 3841 if (--rc->count == 0) { 3842 mutex_exit(&rc->lock); 3843 rddir_cache_free(rc); 3844 } else 3845 mutex_exit(&rc->lock); 3846 } 3847 3848 #ifdef DEBUG 3849 char * 3850 rddir_cache_buf_alloc(size_t size, int flags) 3851 { 3852 char *rc; 3853 3854 rc = kmem_alloc(size, flags); 3855 if (rc != NULL) 3856 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3857 return (rc); 3858 } 3859 3860 void 3861 rddir_cache_buf_free(void *addr, size_t size) 3862 { 3863 3864 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3865 kmem_free(addr, size); 3866 } 3867 #endif 3868 3869 static int 3870 nfs_free_data_reclaim(rnode_t *rp) 3871 { 3872 char *contents; 3873 int size; 3874 vsecattr_t *vsp; 3875 nfs3_pathconf_info *info; 3876 int freed; 3877 cred_t *cred; 3878 3879 /* 3880 * Free any held credentials and caches which 3881 * may be associated with this rnode. 3882 */ 3883 mutex_enter(&rp->r_statelock); 3884 cred = rp->r_cred; 3885 rp->r_cred = NULL; 3886 contents = rp->r_symlink.contents; 3887 size = rp->r_symlink.size; 3888 rp->r_symlink.contents = NULL; 3889 vsp = rp->r_secattr; 3890 rp->r_secattr = NULL; 3891 info = rp->r_pathconf; 3892 rp->r_pathconf = NULL; 3893 mutex_exit(&rp->r_statelock); 3894 3895 if (cred != NULL) 3896 crfree(cred); 3897 3898 /* 3899 * Free the access cache entries. 3900 */ 3901 freed = nfs_access_purge_rp(rp); 3902 3903 if (!HAVE_RDDIR_CACHE(rp) && 3904 contents == NULL && 3905 vsp == NULL && 3906 info == NULL) 3907 return (freed); 3908 3909 /* 3910 * Free the readdir cache entries 3911 */ 3912 if (HAVE_RDDIR_CACHE(rp)) 3913 nfs_purge_rddir_cache(RTOV(rp)); 3914 3915 /* 3916 * Free the symbolic link cache. 3917 */ 3918 if (contents != NULL) { 3919 3920 kmem_free((void *)contents, size); 3921 } 3922 3923 /* 3924 * Free any cached ACL. 3925 */ 3926 if (vsp != NULL) 3927 nfs_acl_free(vsp); 3928 3929 /* 3930 * Free any cached pathconf information. 3931 */ 3932 if (info != NULL) 3933 kmem_free(info, sizeof (*info)); 3934 3935 return (1); 3936 } 3937 3938 static int 3939 nfs_active_data_reclaim(rnode_t *rp) 3940 { 3941 char *contents; 3942 int size; 3943 vsecattr_t *vsp; 3944 nfs3_pathconf_info *info; 3945 int freed; 3946 3947 /* 3948 * Free any held credentials and caches which 3949 * may be associated with this rnode. 3950 */ 3951 if (!mutex_tryenter(&rp->r_statelock)) 3952 return (0); 3953 contents = rp->r_symlink.contents; 3954 size = rp->r_symlink.size; 3955 rp->r_symlink.contents = NULL; 3956 vsp = rp->r_secattr; 3957 rp->r_secattr = NULL; 3958 info = rp->r_pathconf; 3959 rp->r_pathconf = NULL; 3960 mutex_exit(&rp->r_statelock); 3961 3962 /* 3963 * Free the access cache entries. 
3964 */ 3965 freed = nfs_access_purge_rp(rp); 3966 3967 if (!HAVE_RDDIR_CACHE(rp) && 3968 contents == NULL && 3969 vsp == NULL && 3970 info == NULL) 3971 return (freed); 3972 3973 /* 3974 * Free the readdir cache entries 3975 */ 3976 if (HAVE_RDDIR_CACHE(rp)) 3977 nfs_purge_rddir_cache(RTOV(rp)); 3978 3979 /* 3980 * Free the symbolic link cache. 3981 */ 3982 if (contents != NULL) { 3983 3984 kmem_free((void *)contents, size); 3985 } 3986 3987 /* 3988 * Free any cached ACL. 3989 */ 3990 if (vsp != NULL) 3991 nfs_acl_free(vsp); 3992 3993 /* 3994 * Free any cached pathconf information. 3995 */ 3996 if (info != NULL) 3997 kmem_free(info, sizeof (*info)); 3998 3999 return (1); 4000 } 4001 4002 static int 4003 nfs_free_reclaim(void) 4004 { 4005 int freed; 4006 rnode_t *rp; 4007 4008 #ifdef DEBUG 4009 clstat_debug.f_reclaim.value.ui64++; 4010 #endif 4011 freed = 0; 4012 mutex_enter(&rpfreelist_lock); 4013 rp = rpfreelist; 4014 if (rp != NULL) { 4015 do { 4016 if (nfs_free_data_reclaim(rp)) 4017 freed = 1; 4018 } while ((rp = rp->r_freef) != rpfreelist); 4019 } 4020 mutex_exit(&rpfreelist_lock); 4021 return (freed); 4022 } 4023 4024 static int 4025 nfs_active_reclaim(void) 4026 { 4027 int freed; 4028 int index; 4029 rnode_t *rp; 4030 4031 #ifdef DEBUG 4032 clstat_debug.a_reclaim.value.ui64++; 4033 #endif 4034 freed = 0; 4035 for (index = 0; index < rtablesize; index++) { 4036 rw_enter(&rtable[index].r_lock, RW_READER); 4037 for (rp = rtable[index].r_hashf; 4038 rp != (rnode_t *)(&rtable[index]); 4039 rp = rp->r_hashf) { 4040 if (nfs_active_data_reclaim(rp)) 4041 freed = 1; 4042 } 4043 rw_exit(&rtable[index].r_lock); 4044 } 4045 return (freed); 4046 } 4047 4048 static int 4049 nfs_rnode_reclaim(void) 4050 { 4051 int freed; 4052 rnode_t *rp; 4053 vnode_t *vp; 4054 4055 #ifdef DEBUG 4056 clstat_debug.r_reclaim.value.ui64++; 4057 #endif 4058 freed = 0; 4059 mutex_enter(&rpfreelist_lock); 4060 while ((rp = rpfreelist) != NULL) { 4061 rp_rmfree(rp); 4062 mutex_exit(&rpfreelist_lock); 4063 if (rp->r_flags & RHASHED) { 4064 vp = RTOV(rp); 4065 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4066 mutex_enter(&vp->v_lock); 4067 if (vp->v_count > 1) { 4068 vp->v_count--; 4069 mutex_exit(&vp->v_lock); 4070 rw_exit(&rp->r_hashq->r_lock); 4071 mutex_enter(&rpfreelist_lock); 4072 continue; 4073 } 4074 mutex_exit(&vp->v_lock); 4075 rp_rmhash_locked(rp); 4076 rw_exit(&rp->r_hashq->r_lock); 4077 } 4078 /* 4079 * This call to rp_addfree will end up destroying the 4080 * rnode, but in a safe way with the appropriate set 4081 * of checks done. 
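 *
 * At this point the rnode has already been taken off both the
 * freelist and its hash queue, so rp_addfree() will see !RHASHED and,
 * provided r_count is still zero and no new vnode reference has
 * appeared, go straight to destroy_rnode().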
4082 */ 4083 rp_addfree(rp, CRED()); 4084 mutex_enter(&rpfreelist_lock); 4085 } 4086 mutex_exit(&rpfreelist_lock); 4087 return (freed); 4088 } 4089 4090 /*ARGSUSED*/ 4091 static void 4092 nfs_reclaim(void *cdrarg) 4093 { 4094 4095 #ifdef DEBUG 4096 clstat_debug.reclaim.value.ui64++; 4097 #endif 4098 if (nfs_free_reclaim()) 4099 return; 4100 4101 if (nfs_active_reclaim()) 4102 return; 4103 4104 (void) nfs_rnode_reclaim(); 4105 } 4106 4107 /* 4108 * NFS client failover support 4109 * 4110 * Routines to copy filehandles 4111 */ 4112 void 4113 nfscopyfh(caddr_t fhp, vnode_t *vp) 4114 { 4115 fhandle_t *dest = (fhandle_t *)fhp; 4116 4117 if (dest != NULL) 4118 *dest = *VTOFH(vp); 4119 } 4120 4121 void 4122 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4123 { 4124 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4125 4126 if (dest != NULL) 4127 *dest = *VTOFH3(vp); 4128 } 4129 4130 /* 4131 * NFS client failover support 4132 * 4133 * failover_safe() will test various conditions to ensure that 4134 * failover is permitted for this vnode. It will be denied 4135 * if: 4136 * 1) the operation in progress does not support failover (NULL fi) 4137 * 2) there are no available replicas (NULL mi_servers->sv_next) 4138 * 3) any locks are outstanding on this file 4139 */ 4140 static int 4141 failover_safe(failinfo_t *fi) 4142 { 4143 4144 /* 4145 * Does this op permit failover? 4146 */ 4147 if (fi == NULL || fi->vp == NULL) 4148 return (0); 4149 4150 /* 4151 * Are there any alternates to failover to? 4152 */ 4153 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4154 return (0); 4155 4156 /* 4157 * Disable check; we've forced local locking 4158 * 4159 * if (flk_has_remote_locks(fi->vp)) 4160 * return (0); 4161 */ 4162 4163 /* 4164 * If we have no partial path, we can't do anything 4165 */ 4166 if (VTOR(fi->vp)->r_path == NULL) 4167 return (0); 4168 4169 return (1); 4170 } 4171 4172 #include <sys/thread.h> 4173 4174 /* 4175 * NFS client failover support 4176 * 4177 * failover_newserver() will start a search for a new server, 4178 * preferably by starting an async thread to do the work. If 4179 * someone is already doing this (recognizable by MI_BINDINPROG 4180 * being set), it will simply return and the calling thread 4181 * will queue on the mi_failover_cv condition variable. 4182 */ 4183 static void 4184 failover_newserver(mntinfo_t *mi) 4185 { 4186 /* 4187 * Check if someone else is doing this already 4188 */ 4189 mutex_enter(&mi->mi_lock); 4190 if (mi->mi_flags & MI_BINDINPROG) { 4191 mutex_exit(&mi->mi_lock); 4192 return; 4193 } 4194 mi->mi_flags |= MI_BINDINPROG; 4195 4196 /* 4197 * Need to hold the vfs struct so that it can't be released 4198 * while the failover thread is selecting a new server. 4199 */ 4200 VFS_HOLD(mi->mi_vfsp); 4201 4202 /* 4203 * Start a thread to do the real searching. 4204 */ 4205 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4206 4207 mutex_exit(&mi->mi_lock); 4208 } 4209 4210 /* 4211 * NFS client failover support 4212 * 4213 * failover_thread() will find a new server to replace the one 4214 * currently in use, wake up other threads waiting on this mount 4215 * point, and die. It will start at the head of the server list 4216 * and poll servers until it finds one with an NFS server which is 4217 * registered and responds to a NULL procedure ping. 4218 * 4219 * XXX failover_thread is unsafe within the scope of the 4220 * present model defined for cpr to suspend the system. 4221 * Specifically, over-the-wire calls made by the thread 4222 * are unsafe. 
The thread needs to be reevaluated in case of 4223 * future updates to the cpr suspend model. 4224 */ 4225 static void 4226 failover_thread(mntinfo_t *mi) 4227 { 4228 servinfo_t *svp = NULL; 4229 CLIENT *cl; 4230 enum clnt_stat status; 4231 struct timeval tv; 4232 int error; 4233 int oncethru = 0; 4234 callb_cpr_t cprinfo; 4235 rnode_t *rp; 4236 int index; 4237 char *srvnames; 4238 size_t srvnames_len; 4239 struct nfs_clnt *nfscl = NULL; 4240 zoneid_t zoneid = getzoneid(); 4241 4242 #ifdef DEBUG 4243 /* 4244 * This is currently only needed to access counters which exist on 4245 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4246 * on non-DEBUG kernels. 4247 */ 4248 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4249 ASSERT(nfscl != NULL); 4250 #endif 4251 4252 /* 4253 * Its safe to piggyback on the mi_lock since failover_newserver() 4254 * code guarantees that there will be only one failover thread 4255 * per mountinfo at any instance. 4256 */ 4257 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4258 "failover_thread"); 4259 4260 mutex_enter(&mi->mi_lock); 4261 while (mi->mi_readers) { 4262 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4263 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4264 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4265 } 4266 mutex_exit(&mi->mi_lock); 4267 4268 tv.tv_sec = 2; 4269 tv.tv_usec = 0; 4270 4271 /* 4272 * Ping the null NFS procedure of every server in 4273 * the list until one responds. We always start 4274 * at the head of the list and always skip the one 4275 * that is current, since it's caused us a problem. 4276 */ 4277 while (svp == NULL) { 4278 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4279 if (!oncethru && svp == mi->mi_curr_serv) 4280 continue; 4281 4282 /* 4283 * If the file system was forcibly umounted 4284 * while trying to do a failover, then just 4285 * give up on the failover. It won't matter 4286 * what the server is. 
4287 */ 4288 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4289 svp = NULL; 4290 goto done; 4291 } 4292 4293 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4294 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4295 if (error) 4296 continue; 4297 4298 if (!(mi->mi_flags & MI_INT)) 4299 cl->cl_nosignal = TRUE; 4300 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4301 xdr_void, NULL, tv); 4302 if (!(mi->mi_flags & MI_INT)) 4303 cl->cl_nosignal = FALSE; 4304 AUTH_DESTROY(cl->cl_auth); 4305 CLNT_DESTROY(cl); 4306 if (status == RPC_SUCCESS) { 4307 if (svp == mi->mi_curr_serv) { 4308 #ifdef DEBUG 4309 zcmn_err(zoneid, CE_NOTE, 4310 "NFS%d: failing over: selecting original server %s", 4311 mi->mi_vers, svp->sv_hostname); 4312 #else 4313 zcmn_err(zoneid, CE_NOTE, 4314 "NFS: failing over: selecting original server %s", 4315 svp->sv_hostname); 4316 #endif 4317 } else { 4318 #ifdef DEBUG 4319 zcmn_err(zoneid, CE_NOTE, 4320 "NFS%d: failing over from %s to %s", 4321 mi->mi_vers, 4322 mi->mi_curr_serv->sv_hostname, 4323 svp->sv_hostname); 4324 #else 4325 zcmn_err(zoneid, CE_NOTE, 4326 "NFS: failing over from %s to %s", 4327 mi->mi_curr_serv->sv_hostname, 4328 svp->sv_hostname); 4329 #endif 4330 } 4331 break; 4332 } 4333 } 4334 4335 if (svp == NULL) { 4336 if (!oncethru) { 4337 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4338 #ifdef DEBUG 4339 zprintf(zoneid, 4340 "NFS%d servers %s not responding " 4341 "still trying\n", mi->mi_vers, srvnames); 4342 #else 4343 zprintf(zoneid, "NFS servers %s not responding " 4344 "still trying\n", srvnames); 4345 #endif 4346 oncethru = 1; 4347 } 4348 mutex_enter(&mi->mi_lock); 4349 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4350 mutex_exit(&mi->mi_lock); 4351 delay(hz); 4352 mutex_enter(&mi->mi_lock); 4353 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4354 mutex_exit(&mi->mi_lock); 4355 } 4356 } 4357 4358 if (oncethru) { 4359 #ifdef DEBUG 4360 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4361 #else 4362 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4363 #endif 4364 } 4365 4366 if (svp != mi->mi_curr_serv) { 4367 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4368 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4369 rw_enter(&rtable[index].r_lock, RW_WRITER); 4370 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4371 mi->mi_vfsp); 4372 if (rp != NULL) { 4373 if (rp->r_flags & RHASHED) 4374 rp_rmhash_locked(rp); 4375 rw_exit(&rtable[index].r_lock); 4376 rp->r_server = svp; 4377 rp->r_fh = svp->sv_fhandle; 4378 (void) nfs_free_data_reclaim(rp); 4379 index = rtablehash(&rp->r_fh); 4380 rp->r_hashq = &rtable[index]; 4381 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4382 vn_exists(RTOV(rp)); 4383 rp_addhash(rp); 4384 rw_exit(&rp->r_hashq->r_lock); 4385 VN_RELE(RTOV(rp)); 4386 } else 4387 rw_exit(&rtable[index].r_lock); 4388 } 4389 4390 done: 4391 if (oncethru) 4392 kmem_free(srvnames, srvnames_len); 4393 mutex_enter(&mi->mi_lock); 4394 mi->mi_flags &= ~MI_BINDINPROG; 4395 if (svp != NULL) { 4396 mi->mi_curr_serv = svp; 4397 mi->mi_failover++; 4398 #ifdef DEBUG 4399 nfscl->nfscl_stat.failover.value.ui64++; 4400 #endif 4401 } 4402 cv_broadcast(&mi->mi_failover_cv); 4403 CALLB_CPR_EXIT(&cprinfo); 4404 VFS_RELE(mi->mi_vfsp); 4405 zthread_exit(); 4406 /* NOTREACHED */ 4407 } 4408 4409 /* 4410 * NFS client failover support 4411 * 4412 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4413 * is cleared, meaning that failover is complete. Called with 4414 * mi_lock mutex held. 
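 *
 * A caller would typically use it along these lines (a hypothetical
 * sketch, not lifted from an actual caller):
 *
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);
 *	mutex_exit(&mi->mi_lock);
 *	if (error)
 *		return (error);		EINTR: interrupted by a signal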
4415 */ 4416 static int 4417 failover_wait(mntinfo_t *mi) 4418 { 4419 k_sigset_t smask; 4420 4421 /* 4422 * If someone else is hunting for a living server, 4423 * sleep until it's done. After our sleep, we may 4424 * be bound to the right server and get off cheaply. 4425 */ 4426 while (mi->mi_flags & MI_BINDINPROG) { 4427 /* 4428 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT, 4429 * and SIGTERM, preserving the existing signal masks. 4430 * Also mask out SIGINT if the mount option nointr is specified. 4431 */ 4432 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4433 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4434 /* 4435 * restore original signal mask 4436 */ 4437 sigunintr(&smask); 4438 return (EINTR); 4439 } 4440 /* 4441 * restore original signal mask 4442 */ 4443 sigunintr(&smask); 4444 } 4445 return (0); 4446 } 4447 4448 /* 4449 * NFS client failover support 4450 * 4451 * failover_remap() will do a partial pathname lookup and find the 4452 * desired vnode on the current server. The interim vnode will be 4453 * discarded after we pilfer the new filehandle. 4454 * 4455 * Side effects: 4456 * - This routine will also update the filehandle in the args structure 4457 * pointed to by the fi->fhp pointer if it is non-NULL. 4458 */ 4459 4460 static int 4461 failover_remap(failinfo_t *fi) 4462 { 4463 vnode_t *vp, *nvp, *rootvp; 4464 rnode_t *rp, *nrp; 4465 mntinfo_t *mi; 4466 int error; 4467 #ifdef DEBUG 4468 struct nfs_clnt *nfscl; 4469 4470 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4471 ASSERT(nfscl != NULL); 4472 #endif 4473 /* 4474 * Sanity check 4475 */ 4476 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4477 return (EINVAL); 4478 vp = fi->vp; 4479 rp = VTOR(vp); 4480 mi = VTOMI(vp); 4481 4482 if (!(vp->v_flag & VROOT)) { 4483 /* 4484 * Given the root fh, use the path stored in 4485 * the rnode to find the fh for the new server. 4486 */ 4487 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4488 if (error) 4489 return (error); 4490 4491 error = failover_lookup(rp->r_path, rootvp, 4492 fi->lookupproc, fi->xattrdirproc, &nvp); 4493 4494 VN_RELE(rootvp); 4495 4496 if (error) 4497 return (error); 4498 4499 /* 4500 * If we found the same rnode, we're done now 4501 */ 4502 if (nvp == vp) { 4503 /* 4504 * Failover has occurred, but the new server may physically be 4505 * the same machine or may share the same disk subsystem. In this case 4506 * the file handle for a particular file path is not going 4507 * to change, since a lookup with the same filehandle will 4508 * always locate the same rnode as the existing one. 4509 * All we might need to do is update r_server 4510 * with the current servinfo. 4511 */ 4512 if (!VALID_FH(fi)) { 4513 rp->r_server = mi->mi_curr_serv; 4514 } 4515 VN_RELE(nvp); 4516 return (0); 4517 } 4518 4519 /* 4520 * Try to make it so that no one else will find this 4521 * vnode because it is just a temporary to hold the 4522 * new file handle until that file handle can be 4523 * copied to the original vnode/rnode. 4524 */ 4525 nrp = VTOR(nvp); 4526 mutex_enter(&mi->mi_remap_lock); 4527 /* 4528 * Some other thread could have raced in here and could 4529 * have done the remap for this particular rnode before 4530 * this thread got here. Check rp->r_server against 4531 * mi->mi_curr_serv and return if they are the same.
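 *
 * (Interpretive note: VALID_FH(fi) is taken to succeed once the rnode
 * behind fi->vp already refers to mi->mi_curr_serv, i.e. the remap has
 * been done; this reading follows from the unlocked use of the same
 * check earlier in this function, not from the macro's definition.)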
4532 */ 4533 if (VALID_FH(fi)) { 4534 mutex_exit(&mi->mi_remap_lock); 4535 VN_RELE(nvp); 4536 return (0); 4537 } 4538 4539 if (nrp->r_flags & RHASHED) 4540 rp_rmhash(nrp); 4541 4542 /* 4543 * As a heuristic check on the validity of the new 4544 * file, check that the size and type match against 4545 * what we remember from the old version. 4546 */ 4547 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4548 mutex_exit(&mi->mi_remap_lock); 4549 zcmn_err(mi->mi_zone->zone_id, CE_WARN, 4550 "NFS replicas %s and %s: file %s not same.", 4551 rp->r_server->sv_hostname, 4552 nrp->r_server->sv_hostname, rp->r_path); 4553 VN_RELE(nvp); 4554 return (EINVAL); 4555 } 4556 4557 /* 4558 * snarf the filehandle from the new rnode 4559 * then release it, again while updating the 4560 * hash queues for the rnode. 4561 */ 4562 if (rp->r_flags & RHASHED) 4563 rp_rmhash(rp); 4564 rp->r_server = mi->mi_curr_serv; 4565 rp->r_fh = nrp->r_fh; 4566 rp->r_hashq = nrp->r_hashq; 4567 /* 4568 * Copy the attributes from the new rnode to the old 4569 * rnode. This will help to reduce unnecessary page 4570 * cache flushes. 4571 */ 4572 rp->r_attr = nrp->r_attr; 4573 rp->r_attrtime = nrp->r_attrtime; 4574 rp->r_mtime = nrp->r_mtime; 4575 (void) nfs_free_data_reclaim(rp); 4576 nfs_setswaplike(vp, &rp->r_attr); 4577 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4578 rp_addhash(rp); 4579 rw_exit(&rp->r_hashq->r_lock); 4580 mutex_exit(&mi->mi_remap_lock); 4581 VN_RELE(nvp); 4582 } 4583 4584 /* 4585 * Update successful failover remap count 4586 */ 4587 mutex_enter(&mi->mi_lock); 4588 mi->mi_remap++; 4589 mutex_exit(&mi->mi_lock); 4590 #ifdef DEBUG 4591 nfscl->nfscl_stat.remap.value.ui64++; 4592 #endif 4593 4594 /* 4595 * If we have a copied filehandle to update, do it now. 4596 */ 4597 if (fi->fhp != NULL && fi->copyproc != NULL) 4598 (*fi->copyproc)(fi->fhp, vp); 4599 4600 return (0); 4601 } 4602 4603 /* 4604 * NFS client failover support 4605 * 4606 * We want a simple pathname lookup routine to parse the pieces 4607 * of path in rp->r_path. We know that the path was created 4608 * as rnodes were made, so we know we have only to deal with 4609 * paths that look like: 4610 * dir1/dir2/dir3/file 4611 * Any evidence of anything like .., symlinks, and ENOTDIR 4612 * is a hard error, because it means something in this filesystem 4613 * is different from the one we came from, or has changed under 4614 * us in some way. If this is true, we want the failure. 4615 * 4616 * Extended attributes: if the filesystem is mounted with extended 4617 * attributes enabled (-o xattr), the attribute directory will be 4618 * represented in the r_path as the magic name XATTR_RPATH. So if 4619 * we see that name in the pathname, it must be because this node 4620 * is an extended attribute. Therefore, look it up that way.
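 *
 * As a concrete example of the walk performed below: for an r_path of
 * "dir1/dir2/file", the loop nul-terminates each component in a
 * private copy of the path and issues three lookups,
 *
 *      (*lookupproc)(root, "dir1", &nvp, ...)
 *      (*lookupproc)(<vnode of dir1>, "dir2", &nvp, ...)
 *      (*lookupproc)(<vnode of dir2>, "file", &nvp, ...)
 *
 * releasing each intermediate vnode as it goes; the vnode returned by
 * the final lookup is handed back through *new.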
4621 */ 4622 static int 4623 failover_lookup(char *path, vnode_t *root, 4624 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4625 vnode_t *, cred_t *, int), 4626 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4627 vnode_t **new) 4628 { 4629 vnode_t *dvp, *nvp; 4630 int error = EINVAL; 4631 char *s, *p, *tmppath; 4632 size_t len; 4633 mntinfo_t *mi; 4634 bool_t xattr; 4635 4636 /* Make local copy of path */ 4637 len = strlen(path) + 1; 4638 tmppath = kmem_alloc(len, KM_SLEEP); 4639 (void) strcpy(tmppath, path); 4640 s = tmppath; 4641 4642 dvp = root; 4643 VN_HOLD(dvp); 4644 mi = VTOMI(root); 4645 xattr = mi->mi_flags & MI_EXTATTR; 4646 4647 do { 4648 p = strchr(s, '/'); 4649 if (p != NULL) 4650 *p = '\0'; 4651 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4652 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4653 RFSCALL_SOFT); 4654 } else { 4655 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4656 CRED(), RFSCALL_SOFT); 4657 } 4658 if (p != NULL) 4659 *p++ = '/'; 4660 if (error) { 4661 VN_RELE(dvp); 4662 kmem_free(tmppath, len); 4663 return (error); 4664 } 4665 s = p; 4666 VN_RELE(dvp); 4667 dvp = nvp; 4668 } while (p != NULL); 4669 4670 if (nvp != NULL && new != NULL) 4671 *new = nvp; 4672 kmem_free(tmppath, len); 4673 return (0); 4674 } 4675 4676 /* 4677 * NFS client failover support 4678 * 4679 * sv_free() frees the malloc'd portion of a "servinfo_t". 4680 */ 4681 void 4682 sv_free(servinfo_t *svp) 4683 { 4684 servinfo_t *next; 4685 struct knetconfig *knconf; 4686 4687 while (svp != NULL) { 4688 next = svp->sv_next; 4689 if (svp->sv_secdata) 4690 sec_clnt_freeinfo(svp->sv_secdata); 4691 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4692 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4693 knconf = svp->sv_knconf; 4694 if (knconf != NULL) { 4695 if (knconf->knc_protofmly != NULL) 4696 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4697 if (knconf->knc_proto != NULL) 4698 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4699 kmem_free(knconf, sizeof (*knconf)); 4700 } 4701 knconf = svp->sv_origknconf; 4702 if (knconf != NULL) { 4703 if (knconf->knc_protofmly != NULL) 4704 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4705 if (knconf->knc_proto != NULL) 4706 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4707 kmem_free(knconf, sizeof (*knconf)); 4708 } 4709 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4710 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4711 mutex_destroy(&svp->sv_lock); 4712 kmem_free(svp, sizeof (*svp)); 4713 svp = next; 4714 } 4715 } 4716 4717 /* 4718 * Only can return non-zero if intr != 0. 4719 */ 4720 int 4721 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4722 { 4723 4724 mutex_enter(&l->lock); 4725 4726 /* 4727 * If this is a nested enter, then allow it. There 4728 * must be as many exits as enters through. 4729 */ 4730 if (l->owner == curthread) { 4731 /* lock is held for writing by current thread */ 4732 ASSERT(rw == RW_READER || rw == RW_WRITER); 4733 l->count--; 4734 } else if (rw == RW_READER) { 4735 /* 4736 * While there is a writer active or writers waiting, 4737 * then wait for them to finish up and move on. Then, 4738 * increment the count to indicate that a reader is 4739 * active. 
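 *
 * To summarize the state encoding used by nfs_rwlock_t in this file:
 *      count > 0       number of active readers
 *      count == 0      lock is idle
 *      count < 0       held for writing; the magnitude is the writer's
 *                      recursion depth and owner is the writing thread
 *      waiters         number of threads waiting to get the write lock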
4740 */ 4741 while (l->count < 0 || l->waiters > 0) { 4742 if (intr) { 4743 klwp_t *lwp = ttolwp(curthread); 4744 4745 if (lwp != NULL) 4746 lwp->lwp_nostop++; 4747 if (!cv_wait_sig(&l->cv, &l->lock)) { 4748 if (lwp != NULL) 4749 lwp->lwp_nostop--; 4750 mutex_exit(&l->lock); 4751 return (EINTR); 4752 } 4753 if (lwp != NULL) 4754 lwp->lwp_nostop--; 4755 } else 4756 cv_wait(&l->cv, &l->lock); 4757 } 4758 ASSERT(l->count < INT_MAX); 4759 #ifdef DEBUG 4760 if ((l->count % 10000) == 9999) 4761 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on " 4762 "rwlock @ %p\n", l->count, (void *)l); 4763 #endif 4764 l->count++; 4765 } else { 4766 ASSERT(rw == RW_WRITER); 4767 /* 4768 * While there are readers active or a writer 4769 * active, then wait for all of the readers 4770 * to finish or for the writer to finish. 4771 * Then, set the owner field to curthread and 4772 * decrement count to indicate that a writer 4773 * is active. 4774 */ 4775 while (l->count > 0 || l->owner != NULL) { 4776 l->waiters++; 4777 if (intr) { 4778 klwp_t *lwp = ttolwp(curthread); 4779 4780 if (lwp != NULL) 4781 lwp->lwp_nostop++; 4782 if (!cv_wait_sig(&l->cv, &l->lock)) { 4783 if (lwp != NULL) 4784 lwp->lwp_nostop--; 4785 l->waiters--; 4786 cv_broadcast(&l->cv); 4787 mutex_exit(&l->lock); 4788 return (EINTR); 4789 } 4790 if (lwp != NULL) 4791 lwp->lwp_nostop--; 4792 } else 4793 cv_wait(&l->cv, &l->lock); 4794 l->waiters--; 4795 } 4796 l->owner = curthread; 4797 l->count--; 4798 } 4799 4800 mutex_exit(&l->lock); 4801 4802 return (0); 4803 } 4804 4805 /* 4806 * If the lock is available, obtain it and return non-zero. If there is 4807 * already a conflicting lock, return 0 immediately. 4808 */ 4809 4810 int 4811 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4812 { 4813 mutex_enter(&l->lock); 4814 4815 /* 4816 * If this is a nested enter, then allow it. There 4817 * must be as many exits as enters through. 4818 */ 4819 if (l->owner == curthread) { 4820 /* lock is held for writing by current thread */ 4821 ASSERT(rw == RW_READER || rw == RW_WRITER); 4822 l->count--; 4823 } else if (rw == RW_READER) { 4824 /* 4825 * If there is a writer active or writers waiting, deny the 4826 * lock. Otherwise, bump the count of readers. 4827 */ 4828 if (l->count < 0 || l->waiters > 0) { 4829 mutex_exit(&l->lock); 4830 return (0); 4831 } 4832 l->count++; 4833 } else { 4834 ASSERT(rw == RW_WRITER); 4835 /* 4836 * If there are readers active or a writer active, deny the 4837 * lock. Otherwise, set the owner field to curthread and 4838 * decrement count to indicate that a writer is active. 4839 */ 4840 if (l->count > 0 || l->owner != NULL) { 4841 mutex_exit(&l->lock); 4842 return (0); 4843 } 4844 l->owner = curthread; 4845 l->count--; 4846 } 4847 4848 mutex_exit(&l->lock); 4849 4850 return (1); 4851 } 4852 4853 void 4854 nfs_rw_exit(nfs_rwlock_t *l) 4855 { 4856 4857 mutex_enter(&l->lock); 4858 /* 4859 * If this is releasing a writer lock, then increment count to 4860 * indicate that there is one less writer active. If this was 4861 * the last of possibly nested writer locks, then clear the owner 4862 * field as well to indicate that there is no writer active 4863 * and wakeup any possible waiting writers or readers. 4864 * 4865 * If releasing a reader lock, then just decrement count to 4866 * indicate that there is one less reader active. If this was 4867 * the last active reader and there are writer(s) waiting, 4868 * then wake up the first.
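 *
 * A typical enter/exit pairing looks like the following sketch (call
 * sites vary; INTR(vp) is assumed to be the usual way a caller derives
 * the intr argument from the mount's MI_INT flag):
 *
 *      if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp)))
 *              return (EINTR);
 *      ...
 *      nfs_rw_exit(&rp->r_rwlock);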
4869 */ 4870 if (l->owner != NULL) { 4871 ASSERT(l->owner == curthread); 4872 l->count++; 4873 if (l->count == 0) { 4874 l->owner = NULL; 4875 cv_broadcast(&l->cv); 4876 } 4877 } else { 4878 ASSERT(l->count > 0); 4879 l->count--; 4880 if (l->count == 0 && l->waiters > 0) 4881 cv_broadcast(&l->cv); 4882 } 4883 mutex_exit(&l->lock); 4884 } 4885 4886 int 4887 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4888 { 4889 4890 if (rw == RW_READER) 4891 return (l->count > 0); 4892 ASSERT(rw == RW_WRITER); 4893 return (l->count < 0); 4894 } 4895 4896 /* ARGSUSED */ 4897 void 4898 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4899 { 4900 4901 l->count = 0; 4902 l->waiters = 0; 4903 l->owner = NULL; 4904 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4905 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4906 } 4907 4908 void 4909 nfs_rw_destroy(nfs_rwlock_t *l) 4910 { 4911 4912 mutex_destroy(&l->lock); 4913 cv_destroy(&l->cv); 4914 } 4915 4916 int 4917 nfs3_rddir_compar(const void *x, const void *y) 4918 { 4919 rddir_cache *a = (rddir_cache *)x; 4920 rddir_cache *b = (rddir_cache *)y; 4921 4922 if (a->nfs3_cookie == b->nfs3_cookie) { 4923 if (a->buflen == b->buflen) 4924 return (0); 4925 if (a->buflen < b->buflen) 4926 return (-1); 4927 return (1); 4928 } 4929 4930 if (a->nfs3_cookie < b->nfs3_cookie) 4931 return (-1); 4932 4933 return (1); 4934 } 4935 4936 int 4937 nfs_rddir_compar(const void *x, const void *y) 4938 { 4939 rddir_cache *a = (rddir_cache *)x; 4940 rddir_cache *b = (rddir_cache *)y; 4941 4942 if (a->nfs_cookie == b->nfs_cookie) { 4943 if (a->buflen == b->buflen) 4944 return (0); 4945 if (a->buflen < b->buflen) 4946 return (-1); 4947 return (1); 4948 } 4949 4950 if (a->nfs_cookie < b->nfs_cookie) 4951 return (-1); 4952 4953 return (1); 4954 } 4955 4956 static char * 4957 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 4958 { 4959 servinfo_t *s; 4960 char *srvnames; 4961 char *namep; 4962 size_t length; 4963 4964 /* 4965 * Calculate the length of the string required to hold all 4966 * of the server names plus either a comma or a null 4967 * character following each individual one. 4968 */ 4969 length = 0; 4970 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 4971 length += s->sv_hostnamelen; 4972 4973 srvnames = kmem_alloc(length, KM_SLEEP); 4974 4975 namep = srvnames; 4976 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 4977 (void) strcpy(namep, s->sv_hostname); 4978 namep += s->sv_hostnamelen - 1; 4979 *namep++ = ','; 4980 } 4981 *--namep = '\0'; 4982 4983 *len = length; 4984 4985 return (srvnames); 4986 } 4987 4988 /* 4989 * These two functions are temporary and designed for the upgrade-workaround 4990 * only. They cannot be used for general zone-crossing NFS client support, and 4991 * will be removed shortly. 4992 * 4993 * When the workaround is enabled, all NFS traffic is forced into the global 4994 * zone. These functions are called when the code needs to refer to the state 4995 * of the underlying network connection. They're not called when the function 4996 * needs to refer to the state of the process that invoked the system call. 4997 * (E.g., when checking whether the zone is shutting down during the mount() 4998 * call.) 4999 */ 5000 5001 struct zone * 5002 nfs_zone(void) 5003 { 5004 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5005 } 5006 5007 zoneid_t 5008 nfs_zoneid(void) 5009 { 5010 return (nfs_global_client_only != 0 ? 
GLOBAL_ZONEID : getzoneid()); 5011 } 5012 5013 /* 5014 * nfs_mount_label_policy: 5015 * Determine whether the mount is allowed according to MAC check, 5016 * by comparing (where appropriate) label of the remote server 5017 * against the label of the zone being mounted into. 5018 * 5019 * Returns: 5020 * 0 : access allowed 5021 * -1 : read-only access allowed (i.e., read-down) 5022 * >0 : error code, such as EACCES 5023 */ 5024 int 5025 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5026 struct knetconfig *knconf, cred_t *cr) 5027 { 5028 int addr_type; 5029 void *ipaddr; 5030 bslabel_t *server_sl, *mntlabel; 5031 zone_t *mntzone = NULL; 5032 ts_label_t *zlabel; 5033 tsol_tpc_t *tp; 5034 ts_label_t *tsl = NULL; 5035 int retv; 5036 5037 /* 5038 * Get the zone's label. Each zone on a labeled system has a label. 5039 */ 5040 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5041 zlabel = mntzone->zone_slabel; 5042 ASSERT(zlabel != NULL); 5043 label_hold(zlabel); 5044 5045 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5046 addr_type = IPV4_VERSION; 5047 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5048 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5049 addr_type = IPV6_VERSION; 5050 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5051 } else { 5052 retv = 0; 5053 goto out; 5054 } 5055 5056 retv = EACCES; /* assume the worst */ 5057 5058 /* 5059 * Next, get the assigned label of the remote server. 5060 */ 5061 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5062 if (tp == NULL) 5063 goto out; /* error getting host entry */ 5064 5065 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5066 goto rel_tpc; /* invalid domain */ 5067 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5068 (tp->tpc_tp.host_type != UNLABELED)) 5069 goto rel_tpc; /* invalid hosttype */ 5070 5071 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5072 tsl = getflabel_cipso(vfsp); 5073 if (tsl == NULL) 5074 goto rel_tpc; /* error getting server lbl */ 5075 5076 server_sl = label2bslabel(tsl); 5077 } else { /* UNLABELED */ 5078 server_sl = &tp->tpc_tp.tp_def_label; 5079 } 5080 5081 mntlabel = label2bslabel(zlabel); 5082 5083 /* 5084 * Now compare labels to complete the MAC check. If the labels 5085 * are equal or if the requestor is in the global zone and has 5086 * NET_MAC_AWARE, then allow read-write access. (Except for 5087 * mounts into the global zone itself; restrict these to 5088 * read-only.) 5089 * 5090 * If the requestor is in some other zone, but his label 5091 * dominates the server, then allow read-down. 5092 * 5093 * Otherwise, access is denied. 5094 */ 5095 if (blequal(mntlabel, server_sl) || 5096 (crgetzoneid(cr) == GLOBAL_ZONEID && 5097 getpflags(NET_MAC_AWARE, cr) != 0)) { 5098 if ((mntzone == global_zone) || 5099 !blequal(mntlabel, server_sl)) 5100 retv = -1; /* read-only */ 5101 else 5102 retv = 0; /* access OK */ 5103 } else if (bldominates(mntlabel, server_sl)) { 5104 retv = -1; /* read-only */ 5105 } else { 5106 retv = EACCES; 5107 } 5108 5109 if (tsl != NULL) 5110 label_rele(tsl); 5111 5112 rel_tpc: 5113 TPC_RELE(tp); 5114 out: 5115 if (mntzone) 5116 zone_rele(mntzone); 5117 label_rele(zlabel); 5118 return (retv); 5119 } 5120
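/*
 * Illustrative sketch (not code in this file): a mount-time caller is
 * expected to act on nfs_mount_label_policy()'s three-way return value
 * along the following lines; the actual call site may differ.
 *
 *      error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *          svp->sv_knconf, cr);
 *      if (error > 0)
 *              return (error);
 *      if (error == -1)
 *              vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 */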