/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 *	All rights reserved.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred_impl.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock for the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information about the file that
 * can be reused.  The exclusive lock for the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode, using it, and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.
 * This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference, or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test whether the rnode is on the freelist or not.
 * The hash queue lock might be held shared, so it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist in the meantime, because the thread putting
 * the rnode on the freelist must hold the exclusive lock for the hash
 * queue, while the thread doing the lookup holds either a shared or
 * exclusive lock on that hash queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 *
 * An illustrative sketch of taking these locks in order appears below,
 * just after the client statistics definitions.
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and don't correspond to any one particular zone.
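 */

/*
 * Illustrative sketch (not code used by this file) of the
 * hash-bucket -> freelist lock ordering described at the top of this
 * file.  rp_rmfree() is the real helper declared further below; the
 * helper name here and the r_freef freelist-linkage test are assumptions
 * shown only to make the ordering concrete.
 */
#if 0	/* illustrative sketch only; not compiled */
static void
rnode_reuse_sketch(rhashq_t *rhtp, rnode_t *rp)
{
	rw_enter(&rhtp->r_lock, RW_READER);	/* 1: hash bucket lock */
	mutex_enter(&rpfreelist_lock);		/* 2: then the freelist lock */
	if (rp->r_freef != NULL)		/* still on the freelist? */
		rp_rmfree(rp);			/* reuse the freelist's hold */
	else
		VN_HOLD(RTOV(rp));		/* otherwise take a new hold */
	mutex_exit(&rpfreelist_lock);
	rw_exit(&rhtp->r_lock);
}
#endif

/*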
187 */ 188 #ifdef DEBUG 189 static struct clstat_debug { 190 kstat_named_t nrnode; /* number of allocated rnodes */ 191 kstat_named_t access; /* size of access cache */ 192 kstat_named_t dirent; /* size of readdir cache */ 193 kstat_named_t dirents; /* size of readdir buf cache */ 194 kstat_named_t reclaim; /* number of reclaims */ 195 kstat_named_t clreclaim; /* number of cl reclaims */ 196 kstat_named_t f_reclaim; /* number of free reclaims */ 197 kstat_named_t a_reclaim; /* number of active reclaims */ 198 kstat_named_t r_reclaim; /* number of rnode reclaims */ 199 kstat_named_t rpath; /* bytes used to store rpaths */ 200 } clstat_debug = { 201 { "nrnode", KSTAT_DATA_UINT64 }, 202 { "access", KSTAT_DATA_UINT64 }, 203 { "dirent", KSTAT_DATA_UINT64 }, 204 { "dirents", KSTAT_DATA_UINT64 }, 205 { "reclaim", KSTAT_DATA_UINT64 }, 206 { "clreclaim", KSTAT_DATA_UINT64 }, 207 { "f_reclaim", KSTAT_DATA_UINT64 }, 208 { "a_reclaim", KSTAT_DATA_UINT64 }, 209 { "r_reclaim", KSTAT_DATA_UINT64 }, 210 { "r_path", KSTAT_DATA_UINT64 }, 211 }; 212 #endif /* DEBUG */ 213 214 /* 215 * We keep a global list of per-zone client data, so we can clean up all zones 216 * if we get low on memory. 217 */ 218 static list_t nfs_clnt_list; 219 static kmutex_t nfs_clnt_list_lock; 220 static zone_key_t nfsclnt_zone_key; 221 222 static struct kmem_cache *chtab_cache; 223 224 /* 225 * Some servers do not properly update the attributes of the 226 * directory when changes are made. To allow interoperability 227 * with these broken servers, the nfs_disable_rddir_cache 228 * parameter must be set in /etc/system 229 */ 230 int nfs_disable_rddir_cache = 0; 231 232 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 233 struct chtab **); 234 void clfree(CLIENT *, struct chtab *); 235 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 236 struct chtab **, struct nfs_clnt *); 237 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 238 struct chtab **, struct nfs_clnt *); 239 static void clreclaim(void *); 240 static int nfs_feedback(int, int, mntinfo_t *); 241 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 242 caddr_t, cred_t *, int *, enum clnt_stat *, int, 243 failinfo_t *); 244 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 245 caddr_t, cred_t *, int *, int, failinfo_t *); 246 static void rinactive(rnode_t *, cred_t *); 247 static int rtablehash(nfs_fhandle *); 248 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 249 struct vnodeops *, 250 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 251 cred_t *), 252 int (*)(const void *, const void *), int *, cred_t *, 253 char *, char *); 254 static void rp_rmfree(rnode_t *); 255 static void rp_addhash(rnode_t *); 256 static void rp_rmhash_locked(rnode_t *); 257 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 258 static void destroy_rnode(rnode_t *); 259 static void rddir_cache_free(rddir_cache *); 260 static int nfs_free_data_reclaim(rnode_t *); 261 static int nfs_active_data_reclaim(rnode_t *); 262 static int nfs_free_reclaim(void); 263 static int nfs_active_reclaim(void); 264 static int nfs_rnode_reclaim(void); 265 static void nfs_reclaim(void *); 266 static int failover_safe(failinfo_t *); 267 static void failover_newserver(mntinfo_t *mi); 268 static void failover_thread(mntinfo_t *mi); 269 static int failover_wait(mntinfo_t *); 270 static int failover_remap(failinfo_t *); 271 static int failover_lookup(char *, vnode_t *, 272 int (*)(vnode_t *, 
char *, vnode_t **, 273 struct pathname *, int, vnode_t *, cred_t *, int), 274 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 275 vnode_t **); 276 static void nfs_free_r_path(rnode_t *); 277 static void nfs_set_vroot(vnode_t *); 278 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 279 280 /* 281 * from rpcsec module (common/rpcsec) 282 */ 283 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 284 extern void sec_clnt_freeh(AUTH *); 285 extern void sec_clnt_freeinfo(struct sec_data *); 286 287 /* 288 * used in mount policy 289 */ 290 extern ts_label_t *getflabel_cipso(vfs_t *); 291 292 /* 293 * EIO or EINTR are not recoverable errors. 294 */ 295 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 296 297 /* 298 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 299 */ 300 static int 301 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 302 struct chtab **chp, struct nfs_clnt *nfscl) 303 { 304 struct chhead *ch, *newch; 305 struct chhead **plistp; 306 struct chtab *cp; 307 int error; 308 k_sigset_t smask; 309 310 if (newcl == NULL || chp == NULL || ci == NULL) 311 return (EINVAL); 312 313 *newcl = NULL; 314 *chp = NULL; 315 316 /* 317 * Find an unused handle or create one 318 */ 319 newch = NULL; 320 nfscl->nfscl_stat.clgets.value.ui64++; 321 top: 322 /* 323 * Find the correct entry in the cache to check for free 324 * client handles. The search is based on the RPC program 325 * number, program version number, dev_t for the transport 326 * device, and the protocol family. 327 */ 328 mutex_enter(&nfscl->nfscl_chtable_lock); 329 plistp = &nfscl->nfscl_chtable; 330 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 331 if (ch->ch_prog == ci->cl_prog && 332 ch->ch_vers == ci->cl_vers && 333 ch->ch_dev == svp->sv_knconf->knc_rdev && 334 (strcmp(ch->ch_protofmly, 335 svp->sv_knconf->knc_protofmly) == 0)) 336 break; 337 plistp = &ch->ch_next; 338 } 339 340 /* 341 * If we didn't find a cache entry for this quadruple, then 342 * create one. If we don't have one already preallocated, 343 * then drop the cache lock, create one, and then start over. 344 * If we did have a preallocated entry, then just add it to 345 * the front of the list. 346 */ 347 if (ch == NULL) { 348 if (newch == NULL) { 349 mutex_exit(&nfscl->nfscl_chtable_lock); 350 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 351 newch->ch_timesused = 0; 352 newch->ch_prog = ci->cl_prog; 353 newch->ch_vers = ci->cl_vers; 354 newch->ch_dev = svp->sv_knconf->knc_rdev; 355 newch->ch_protofmly = kmem_alloc( 356 strlen(svp->sv_knconf->knc_protofmly) + 1, 357 KM_SLEEP); 358 (void) strcpy(newch->ch_protofmly, 359 svp->sv_knconf->knc_protofmly); 360 newch->ch_list = NULL; 361 goto top; 362 } 363 ch = newch; 364 newch = NULL; 365 ch->ch_next = nfscl->nfscl_chtable; 366 nfscl->nfscl_chtable = ch; 367 /* 368 * We found a cache entry, but if it isn't on the front of the 369 * list, then move it to the front of the list to try to take 370 * advantage of locality of operations. 371 */ 372 } else if (ch != nfscl->nfscl_chtable) { 373 *plistp = ch->ch_next; 374 ch->ch_next = nfscl->nfscl_chtable; 375 nfscl->nfscl_chtable = ch; 376 } 377 378 /* 379 * If there was a free client handle cached, then remove it 380 * from the list, init it, and use it. 
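 */

/*
 * The chtable search above is keyed on the (RPC program, version,
 * transport rdev, protocol family) quadruple.  A hypothetical helper
 * showing just that match test, condensed from the loop above:
 */
#if 0	/* illustrative sketch only; not compiled */
static int
chhead_matches(struct chhead *ch, clinfo_t *ci, servinfo_t *svp)
{
	return (ch->ch_prog == ci->cl_prog &&
	    ch->ch_vers == ci->cl_vers &&
	    ch->ch_dev == svp->sv_knconf->knc_rdev &&
	    strcmp(ch->ch_protofmly, svp->sv_knconf->knc_protofmly) == 0);
}
#endif

/*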
381 */ 382 if (ch->ch_list != NULL) { 383 cp = ch->ch_list; 384 ch->ch_list = cp->ch_list; 385 mutex_exit(&nfscl->nfscl_chtable_lock); 386 if (newch != NULL) { 387 kmem_free(newch->ch_protofmly, 388 strlen(newch->ch_protofmly) + 1); 389 kmem_free(newch, sizeof (*newch)); 390 } 391 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 392 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 393 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 394 &cp->ch_client->cl_auth); 395 if (error || cp->ch_client->cl_auth == NULL) { 396 CLNT_DESTROY(cp->ch_client); 397 kmem_cache_free(chtab_cache, cp); 398 return ((error != 0) ? error : EINTR); 399 } 400 ch->ch_timesused++; 401 *newcl = cp->ch_client; 402 *chp = cp; 403 return (0); 404 } 405 406 /* 407 * There weren't any free client handles which fit, so allocate 408 * a new one and use that. 409 */ 410 #ifdef DEBUG 411 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 412 #endif 413 mutex_exit(&nfscl->nfscl_chtable_lock); 414 415 nfscl->nfscl_stat.cltoomany.value.ui64++; 416 if (newch != NULL) { 417 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 418 kmem_free(newch, sizeof (*newch)); 419 } 420 421 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 422 cp->ch_head = ch; 423 424 sigintr(&smask, (int)ci->cl_flags & MI_INT); 425 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 426 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 427 sigunintr(&smask); 428 429 if (error != 0) { 430 kmem_cache_free(chtab_cache, cp); 431 #ifdef DEBUG 432 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 433 #endif 434 /* 435 * Warning is unnecessary if error is EINTR. 436 */ 437 if (error != EINTR) { 438 nfs_cmn_err(error, CE_WARN, 439 "clget: couldn't create handle: %m\n"); 440 } 441 return (error); 442 } 443 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 444 auth_destroy(cp->ch_client->cl_auth); 445 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 446 &cp->ch_client->cl_auth); 447 if (error || cp->ch_client->cl_auth == NULL) { 448 CLNT_DESTROY(cp->ch_client); 449 kmem_cache_free(chtab_cache, cp); 450 #ifdef DEBUG 451 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 452 #endif 453 return ((error != 0) ? error : EINTR); 454 } 455 ch->ch_timesused++; 456 *newcl = cp->ch_client; 457 ASSERT(cp->ch_client->cl_nosignal == FALSE); 458 *chp = cp; 459 return (0); 460 } 461 462 int 463 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 464 struct chtab **chp) 465 { 466 struct nfs_clnt *nfscl; 467 468 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 469 ASSERT(nfscl != NULL); 470 471 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 472 } 473 474 static int 475 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 476 struct chtab **chp, struct nfs_clnt *nfscl) 477 { 478 clinfo_t ci; 479 int error; 480 481 /* 482 * Set read buffer size to rsize 483 * and add room for RPC headers. 484 */ 485 ci.cl_readsize = mi->mi_tsize; 486 if (ci.cl_readsize != 0) 487 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 488 489 /* 490 * If soft mount and server is down just try once. 491 * meaning: do not retransmit. 492 */ 493 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 494 ci.cl_retrans = 0; 495 else 496 ci.cl_retrans = mi->mi_retrans; 497 498 ci.cl_prog = NFS_ACL_PROGRAM; 499 ci.cl_vers = mi->mi_vers; 500 ci.cl_flags = mi->mi_flags; 501 502 /* 503 * clget calls sec_clnt_geth() to get an auth handle. 
For RPCSEC_GSS 504 * security flavor, the client tries to establish a security context 505 * by contacting the server. If the connection is timed out or reset, 506 * e.g. server reboot, we will try again. 507 */ 508 do { 509 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 510 511 if (error == 0) 512 break; 513 514 /* 515 * For forced unmount or zone shutdown, bail out, no retry. 516 */ 517 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 518 error = EIO; 519 break; 520 } 521 522 /* do not retry for softmount */ 523 if (!(mi->mi_flags & MI_HARD)) 524 break; 525 526 /* let the caller deal with the failover case */ 527 if (FAILOVER_MOUNT(mi)) 528 break; 529 530 } while (error == ETIMEDOUT || error == ECONNRESET); 531 532 return (error); 533 } 534 535 static int 536 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 537 struct chtab **chp, struct nfs_clnt *nfscl) 538 { 539 clinfo_t ci; 540 int error; 541 542 /* 543 * Set read buffer size to rsize 544 * and add room for RPC headers. 545 */ 546 ci.cl_readsize = mi->mi_tsize; 547 if (ci.cl_readsize != 0) 548 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 549 550 /* 551 * If soft mount and server is down just try once. 552 * meaning: do not retransmit. 553 */ 554 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 555 ci.cl_retrans = 0; 556 else 557 ci.cl_retrans = mi->mi_retrans; 558 559 ci.cl_prog = mi->mi_prog; 560 ci.cl_vers = mi->mi_vers; 561 ci.cl_flags = mi->mi_flags; 562 563 /* 564 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 565 * security flavor, the client tries to establish a security context 566 * by contacting the server. If the connection is timed out or reset, 567 * e.g. server reboot, we will try again. 568 */ 569 do { 570 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 571 572 if (error == 0) 573 break; 574 575 /* 576 * For forced unmount or zone shutdown, bail out, no retry. 577 */ 578 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 579 error = EIO; 580 break; 581 } 582 583 /* do not retry for softmount */ 584 if (!(mi->mi_flags & MI_HARD)) 585 break; 586 587 /* let the caller deal with the failover case */ 588 if (FAILOVER_MOUNT(mi)) 589 break; 590 591 } while (error == ETIMEDOUT || error == ECONNRESET); 592 593 return (error); 594 } 595 596 static void 597 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 598 { 599 if (cl->cl_auth != NULL) { 600 sec_clnt_freeh(cl->cl_auth); 601 cl->cl_auth = NULL; 602 } 603 604 /* 605 * Timestamp this cache entry so that we know when it was last 606 * used. 607 */ 608 cp->ch_freed = gethrestime_sec(); 609 610 /* 611 * Add the free client handle to the front of the list. 612 * This way, the list will be sorted in youngest to oldest 613 * order. 
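 */

/*
 * Because freed handles are pushed onto the front of ch_list, each list
 * is ordered youngest to oldest.  A reclaim pass can therefore stop at
 * the first entry that has aged past the hold time; every entry after it
 * is older still.  Hypothetical helper illustrating the cut-off test
 * that clreclaim_zone() below relies on:
 */
#if 0	/* illustrative sketch only; not compiled */
static struct chtab *
chlist_first_expired(struct chhead *ch, uint_t holdtime)
{
	struct chtab *cp;

	for (cp = ch->ch_list; cp != NULL; cp = cp->ch_list) {
		if (cp->ch_freed + holdtime <= gethrestime_sec())
			return (cp);	/* this entry and all later ones */
	}
	return (NULL);			/* nothing old enough to reclaim */
}
#endif

/*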
614 */ 615 mutex_enter(&nfscl->nfscl_chtable_lock); 616 cp->ch_list = cp->ch_head->ch_list; 617 cp->ch_head->ch_list = cp; 618 mutex_exit(&nfscl->nfscl_chtable_lock); 619 } 620 621 void 622 clfree(CLIENT *cl, struct chtab *cp) 623 { 624 struct nfs_clnt *nfscl; 625 626 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 627 ASSERT(nfscl != NULL); 628 629 clfree_impl(cl, cp, nfscl); 630 } 631 632 #define CL_HOLDTIME 60 /* time to hold client handles */ 633 634 static void 635 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 636 { 637 struct chhead *ch; 638 struct chtab *cp; /* list of objects that can be reclaimed */ 639 struct chtab *cpe; 640 struct chtab *cpl; 641 struct chtab **cpp; 642 #ifdef DEBUG 643 int n = 0; 644 #endif 645 646 /* 647 * Need to reclaim some memory, so step through the cache 648 * looking through the lists for entries which can be freed. 649 */ 650 cp = NULL; 651 652 mutex_enter(&nfscl->nfscl_chtable_lock); 653 654 /* 655 * Here we step through each non-NULL quadruple and start to 656 * construct the reclaim list pointed to by cp. Note that 657 * cp will contain all eligible chtab entries. When this traversal 658 * completes, chtab entries from the last quadruple will be at the 659 * front of cp and entries from previously inspected quadruples have 660 * been appended to the rear of cp. 661 */ 662 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 663 if (ch->ch_list == NULL) 664 continue; 665 /* 666 * Search each list for entries older then 667 * cl_holdtime seconds. The lists are maintained 668 * in youngest to oldest order so that when the 669 * first entry is found which is old enough, then 670 * all of the rest of the entries on the list will 671 * be old enough as well. 672 */ 673 cpl = ch->ch_list; 674 cpp = &ch->ch_list; 675 while (cpl != NULL && 676 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 677 cpp = &cpl->ch_list; 678 cpl = cpl->ch_list; 679 } 680 if (cpl != NULL) { 681 *cpp = NULL; 682 if (cp != NULL) { 683 cpe = cpl; 684 while (cpe->ch_list != NULL) 685 cpe = cpe->ch_list; 686 cpe->ch_list = cp; 687 } 688 cp = cpl; 689 } 690 } 691 692 mutex_exit(&nfscl->nfscl_chtable_lock); 693 694 /* 695 * If cp is empty, then there is nothing to reclaim here. 696 */ 697 if (cp == NULL) 698 return; 699 700 /* 701 * Step through the list of entries to free, destroying each client 702 * handle and kmem_free'ing the memory for each entry. 703 */ 704 while (cp != NULL) { 705 #ifdef DEBUG 706 n++; 707 #endif 708 CLNT_DESTROY(cp->ch_client); 709 cpl = cp->ch_list; 710 kmem_cache_free(chtab_cache, cp); 711 cp = cpl; 712 } 713 714 #ifdef DEBUG 715 /* 716 * Update clalloc so that nfsstat shows the current number 717 * of allocated client handles. 718 */ 719 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 720 #endif 721 } 722 723 /* ARGSUSED */ 724 static void 725 clreclaim(void *all) 726 { 727 struct nfs_clnt *nfscl; 728 729 #ifdef DEBUG 730 clstat_debug.clreclaim.value.ui64++; 731 #endif 732 /* 733 * The system is low on memory; go through and try to reclaim some from 734 * every zone on the system. 
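 */

/*
 * clreclaim() runs when the kmem subsystem is short on memory; it is
 * assumed to be registered as the reclaim callback of the chtab kmem
 * cache by the client initialization code (not part of this excerpt),
 * roughly as sketched here.  The cache name is illustrative.
 */
#if 0	/* illustrative sketch only; not compiled */
chtab_cache = kmem_cache_create("client_handle_cache",
    sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
#endif

/*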
735 */ 736 mutex_enter(&nfs_clnt_list_lock); 737 nfscl = list_head(&nfs_clnt_list); 738 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 739 clreclaim_zone(nfscl, CL_HOLDTIME); 740 mutex_exit(&nfs_clnt_list_lock); 741 } 742 743 /* 744 * Minimum time-out values indexed by call type 745 * These units are in "eights" of a second to avoid multiplies 746 */ 747 static unsigned int minimum_timeo[] = { 748 6, 7, 10 749 }; 750 751 /* 752 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 753 */ 754 #define MAXTIMO (20*hz) 755 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 756 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 757 758 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 759 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 760 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 761 762 /* 763 * Function called when rfscall notices that we have been 764 * re-transmitting, or when we get a response without retransmissions. 765 * Return 1 if the transfer size was adjusted down - 0 if no change. 766 */ 767 static int 768 nfs_feedback(int flag, int which, mntinfo_t *mi) 769 { 770 int kind; 771 int r = 0; 772 773 mutex_enter(&mi->mi_lock); 774 if (flag == FEEDBACK_REXMIT1) { 775 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 776 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 777 goto done; 778 if (mi->mi_curread > MIN_NFS_TSIZE) { 779 mi->mi_curread /= 2; 780 if (mi->mi_curread < MIN_NFS_TSIZE) 781 mi->mi_curread = MIN_NFS_TSIZE; 782 r = 1; 783 } 784 785 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 786 mi->mi_curwrite /= 2; 787 if (mi->mi_curwrite < MIN_NFS_TSIZE) 788 mi->mi_curwrite = MIN_NFS_TSIZE; 789 r = 1; 790 } 791 } else if (flag == FEEDBACK_OK) { 792 kind = mi->mi_timer_type[which]; 793 if (kind == 0 || 794 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 795 goto done; 796 if (kind == 1) { 797 if (mi->mi_curread >= mi->mi_tsize) 798 goto done; 799 mi->mi_curread += MIN_NFS_TSIZE; 800 if (mi->mi_curread > mi->mi_tsize/2) 801 mi->mi_curread = mi->mi_tsize; 802 } else if (kind == 2) { 803 if (mi->mi_curwrite >= mi->mi_stsize) 804 goto done; 805 mi->mi_curwrite += MIN_NFS_TSIZE; 806 if (mi->mi_curwrite > mi->mi_stsize/2) 807 mi->mi_curwrite = mi->mi_stsize; 808 } 809 } 810 done: 811 mutex_exit(&mi->mi_lock); 812 return (r); 813 } 814 815 #ifdef DEBUG 816 static int rfs2call_hits = 0; 817 static int rfs2call_misses = 0; 818 #endif 819 820 int 821 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 822 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 823 enum nfsstat *statusp, int flags, failinfo_t *fi) 824 { 825 int rpcerror; 826 enum clnt_stat rpc_status; 827 828 ASSERT(statusp != NULL); 829 830 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 831 cr, douprintf, &rpc_status, flags, fi); 832 if (!rpcerror) { 833 /* 834 * See crnetadjust() for comments. 
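 */

/*
 * Worked example of the FEEDBACK_REXMIT1 path in nfs_feedback() above:
 * every retransmission report halves the current transfer size, but
 * never below MIN_NFS_TSIZE.  Starting from 32K the sequence is
 * 32768 -> 16384 -> 8192 -> 4096 -> 2048 -> 1024 -> 512, and it then
 * stays at 512.  The helper name is hypothetical.
 */
#if 0	/* illustrative sketch only; not compiled */
static int
halve_xfer_size(int cur)
{
	if (cur > MIN_NFS_TSIZE) {
		cur /= 2;
		if (cur < MIN_NFS_TSIZE)
			cur = MIN_NFS_TSIZE;
	}
	return (cur);
}
#endif

/*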
835 */ 836 if (*statusp == NFSERR_ACCES && 837 (cr = crnetadjust(cr)) != NULL) { 838 #ifdef DEBUG 839 rfs2call_hits++; 840 #endif 841 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 842 resp, cr, douprintf, NULL, flags, fi); 843 crfree(cr); 844 #ifdef DEBUG 845 if (*statusp == NFSERR_ACCES) 846 rfs2call_misses++; 847 #endif 848 } 849 } else if (rpc_status == RPC_PROCUNAVAIL) { 850 *statusp = NFSERR_OPNOTSUPP; 851 rpcerror = 0; 852 } 853 854 return (rpcerror); 855 } 856 857 #define NFS3_JUKEBOX_DELAY 10 * hz 858 859 static clock_t nfs3_jukebox_delay = 0; 860 861 #ifdef DEBUG 862 static int rfs3call_hits = 0; 863 static int rfs3call_misses = 0; 864 #endif 865 866 int 867 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 868 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 869 nfsstat3 *statusp, int flags, failinfo_t *fi) 870 { 871 int rpcerror; 872 int user_informed; 873 874 user_informed = 0; 875 do { 876 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 877 cr, douprintf, NULL, flags, fi); 878 if (!rpcerror) { 879 cred_t *crr; 880 if (*statusp == NFS3ERR_JUKEBOX) { 881 if (ttoproc(curthread) == &p0) { 882 rpcerror = EAGAIN; 883 break; 884 } 885 if (!user_informed) { 886 user_informed = 1; 887 uprintf( 888 "file temporarily unavailable on the server, retrying...\n"); 889 } 890 delay(nfs3_jukebox_delay); 891 } 892 /* 893 * See crnetadjust() for comments. 894 */ 895 else if (*statusp == NFS3ERR_ACCES && 896 (crr = crnetadjust(cr)) != NULL) { 897 #ifdef DEBUG 898 rfs3call_hits++; 899 #endif 900 rpcerror = rfscall(mi, which, xdrargs, argsp, 901 xdrres, resp, crr, douprintf, 902 NULL, flags, fi); 903 904 crfree(crr); 905 #ifdef DEBUG 906 if (*statusp == NFS3ERR_ACCES) 907 rfs3call_misses++; 908 #endif 909 } 910 } 911 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 912 913 return (rpcerror); 914 } 915 916 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 917 #define INC_READERS(mi) { \ 918 mi->mi_readers++; \ 919 } 920 #define DEC_READERS(mi) { \ 921 mi->mi_readers--; \ 922 if (mi->mi_readers == 0) \ 923 cv_broadcast(&mi->mi_failover_cv); \ 924 } 925 926 static int 927 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 928 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 929 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 930 { 931 CLIENT *client; 932 struct chtab *ch; 933 cred_t *cr = icr; 934 enum clnt_stat status; 935 struct rpc_err rpcerr; 936 struct timeval wait; 937 int timeo; /* in units of hz */ 938 int my_rsize, my_wsize; 939 bool_t tryagain; 940 bool_t cred_cloned = FALSE; 941 k_sigset_t smask; 942 servinfo_t *svp; 943 struct nfs_clnt *nfscl; 944 zoneid_t zoneid = getzoneid(); 945 #ifdef DEBUG 946 char *bufp; 947 #endif 948 949 950 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 951 "rfscall_start:which %d mi %p", which, mi); 952 953 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 954 ASSERT(nfscl != NULL); 955 956 nfscl->nfscl_stat.calls.value.ui64++; 957 mi->mi_reqs[which].value.ui64++; 958 959 rpcerr.re_status = RPC_SUCCESS; 960 961 /* 962 * In case of forced unmount or zone shutdown, return EIO. 963 */ 964 965 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 966 rpcerr.re_status = RPC_FAILED; 967 rpcerr.re_errno = EIO; 968 return (rpcerr.re_errno); 969 } 970 971 /* 972 * Remember the transfer sizes in case 973 * nfs_feedback changes them underneath us. 
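 */

/*
 * The INC_READERS/DEC_READERS pair above serves as a reader count that
 * the failover machinery can wait out before switching servers.  The
 * waiting side lives in the failover code, which is not part of this
 * excerpt; it is assumed to look roughly like this hypothetical sketch,
 * draining readers under mi_lock before changing mi_curr_serv.
 */
#if 0	/* illustrative sketch only; not compiled */
static void
wait_for_rfscall_readers(mntinfo_t *mi)
{
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers > 0)
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
	mutex_exit(&mi->mi_lock);
}
#endif

/*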
974 */ 975 my_rsize = mi->mi_curread; 976 my_wsize = mi->mi_curwrite; 977 978 /* 979 * NFS client failover support 980 * 981 * If this rnode is not in sync with the current server (VALID_FH), 982 * we'd like to do a remap to get in sync. We can be interrupted 983 * in failover_remap(), and if so we'll bail. Otherwise, we'll 984 * use the best info we have to try the RPC. Part of that is 985 * unconditionally updating the filehandle copy kept for V3. 986 * 987 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 988 * rw_enter(); we're trying to keep the current server from being 989 * changed on us until we're done with the remapping and have a 990 * matching client handle. We don't want to sending a filehandle 991 * to the wrong host. 992 */ 993 failoverretry: 994 if (FAILOVER_MOUNT(mi)) { 995 mutex_enter(&mi->mi_lock); 996 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 997 if (failover_wait(mi)) { 998 mutex_exit(&mi->mi_lock); 999 return (EINTR); 1000 } 1001 } 1002 INC_READERS(mi); 1003 mutex_exit(&mi->mi_lock); 1004 if (fi) { 1005 if (!VALID_FH(fi) && 1006 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1007 int remaperr; 1008 1009 svp = mi->mi_curr_serv; 1010 remaperr = failover_remap(fi); 1011 if (remaperr != 0) { 1012 #ifdef DEBUG 1013 if (remaperr != EINTR) 1014 nfs_cmn_err(remaperr, CE_WARN, 1015 "rfscall couldn't failover: %m"); 1016 #endif 1017 mutex_enter(&mi->mi_lock); 1018 DEC_READERS(mi); 1019 mutex_exit(&mi->mi_lock); 1020 /* 1021 * If failover_remap returns ETIMEDOUT 1022 * and the filesystem is hard mounted 1023 * we have to retry the call with a new 1024 * server. 1025 */ 1026 if ((mi->mi_flags & MI_HARD) && 1027 IS_RECOVERABLE_ERROR(remaperr)) { 1028 if (svp == mi->mi_curr_serv) 1029 failover_newserver(mi); 1030 rpcerr.re_status = RPC_SUCCESS; 1031 goto failoverretry; 1032 } 1033 rpcerr.re_errno = remaperr; 1034 return (remaperr); 1035 } 1036 } 1037 if (fi->fhp && fi->copyproc) 1038 (*fi->copyproc)(fi->fhp, fi->vp); 1039 } 1040 } 1041 1042 /* For TSOL, use a new cred which has net_mac_aware flag */ 1043 if (!cred_cloned && is_system_labeled()) { 1044 cred_cloned = TRUE; 1045 cr = crdup(icr); 1046 (void) setpflags(NET_MAC_AWARE, 1, cr); 1047 } 1048 1049 /* 1050 * clget() calls clnt_tli_kinit() which clears the xid, so we 1051 * are guaranteed to reprocess the retry as a new request. 1052 */ 1053 svp = mi->mi_curr_serv; 1054 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1055 1056 if (FAILOVER_MOUNT(mi)) { 1057 mutex_enter(&mi->mi_lock); 1058 DEC_READERS(mi); 1059 mutex_exit(&mi->mi_lock); 1060 1061 if ((rpcerr.re_errno == ETIMEDOUT || 1062 rpcerr.re_errno == ECONNRESET) && 1063 failover_safe(fi)) { 1064 if (svp == mi->mi_curr_serv) 1065 failover_newserver(mi); 1066 goto failoverretry; 1067 } 1068 } 1069 if (rpcerr.re_errno != 0) 1070 return (rpcerr.re_errno); 1071 1072 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1073 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1074 timeo = (mi->mi_timeo * hz) / 10; 1075 } else { 1076 mutex_enter(&mi->mi_lock); 1077 timeo = CLNT_SETTIMERS(client, 1078 &(mi->mi_timers[mi->mi_timer_type[which]]), 1079 &(mi->mi_timers[NFS_CALLTYPES]), 1080 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1081 (void (*)())NULL, (caddr_t)mi, 0); 1082 mutex_exit(&mi->mi_lock); 1083 } 1084 1085 /* 1086 * If hard mounted fs, retry call forever unless hard error occurs. 
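 */

/*
 * Worked example of the connection-oriented timeout computed above:
 * mi_timeo is kept in tenths of a second, so with hz = 100 and
 * mi_timeo = 300 (30 seconds), timeo = (300 * 100) / 10 = 3000 ticks,
 * which TICK_TO_TIMEVAL() in the loop below turns into { 30, 0 }.
 * The numbers are illustrative; only the unit handling matters.
 */
#if 0	/* illustrative sketch only; not compiled */
{
	int ex_timeo = (300 * hz) / 10;		/* 3000 ticks at hz == 100 */
	struct timeval ex_wait;

	TICK_TO_TIMEVAL(ex_timeo, &ex_wait);	/* ex_wait == { 30, 0 } */
}
#endif

/*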
1087 */ 1088 do { 1089 tryagain = FALSE; 1090 1091 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1092 status = RPC_FAILED; 1093 rpcerr.re_status = RPC_FAILED; 1094 rpcerr.re_errno = EIO; 1095 break; 1096 } 1097 1098 TICK_TO_TIMEVAL(timeo, &wait); 1099 1100 /* 1101 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1102 * and SIGTERM. (Preserving the existing masks). 1103 * Mask out SIGINT if mount option nointr is specified. 1104 */ 1105 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1106 if (!(mi->mi_flags & MI_INT)) 1107 client->cl_nosignal = TRUE; 1108 1109 /* 1110 * If there is a current signal, then don't bother 1111 * even trying to send out the request because we 1112 * won't be able to block waiting for the response. 1113 * Simply assume RPC_INTR and get on with it. 1114 */ 1115 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1116 status = RPC_INTR; 1117 else { 1118 status = CLNT_CALL(client, which, xdrargs, argsp, 1119 xdrres, resp, wait); 1120 } 1121 1122 if (!(mi->mi_flags & MI_INT)) 1123 client->cl_nosignal = FALSE; 1124 /* 1125 * restore original signal mask 1126 */ 1127 sigunintr(&smask); 1128 1129 switch (status) { 1130 case RPC_SUCCESS: 1131 if ((mi->mi_flags & MI_DYNAMIC) && 1132 mi->mi_timer_type[which] != 0 && 1133 (mi->mi_curread != my_rsize || 1134 mi->mi_curwrite != my_wsize)) 1135 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1136 break; 1137 1138 case RPC_INTR: 1139 /* 1140 * There is no way to recover from this error, 1141 * even if mount option nointr is specified. 1142 * SIGKILL, for example, cannot be blocked. 1143 */ 1144 rpcerr.re_status = RPC_INTR; 1145 rpcerr.re_errno = EINTR; 1146 break; 1147 1148 case RPC_UDERROR: 1149 /* 1150 * If the NFS server is local (vold) and 1151 * it goes away then we get RPC_UDERROR. 1152 * This is a retryable error, so we would 1153 * loop, so check to see if the specific 1154 * error was ECONNRESET, indicating that 1155 * target did not exist at all. If so, 1156 * return with RPC_PROGUNAVAIL and 1157 * ECONNRESET to indicate why. 1158 */ 1159 CLNT_GETERR(client, &rpcerr); 1160 if (rpcerr.re_errno == ECONNRESET) { 1161 rpcerr.re_status = RPC_PROGUNAVAIL; 1162 rpcerr.re_errno = ECONNRESET; 1163 break; 1164 } 1165 /*FALLTHROUGH*/ 1166 1167 default: /* probably RPC_TIMEDOUT */ 1168 if (IS_UNRECOVERABLE_RPC(status)) 1169 break; 1170 1171 /* 1172 * increment server not responding count 1173 */ 1174 mutex_enter(&mi->mi_lock); 1175 mi->mi_noresponse++; 1176 mutex_exit(&mi->mi_lock); 1177 #ifdef DEBUG 1178 nfscl->nfscl_stat.noresponse.value.ui64++; 1179 #endif 1180 1181 if (!(mi->mi_flags & MI_HARD)) { 1182 if (!(mi->mi_flags & MI_SEMISOFT) || 1183 (mi->mi_ss_call_type[which] == 0)) 1184 break; 1185 } 1186 1187 /* 1188 * The call is in progress (over COTS). 1189 * Try the CLNT_CALL again, but don't 1190 * print a noisy error message. 1191 */ 1192 if (status == RPC_INPROGRESS) { 1193 tryagain = TRUE; 1194 break; 1195 } 1196 1197 if (flags & RFSCALL_SOFT) 1198 break; 1199 1200 /* 1201 * On zone shutdown, just move on. 1202 */ 1203 if (zone_status_get(curproc->p_zone) >= 1204 ZONE_IS_SHUTTING_DOWN) { 1205 rpcerr.re_status = RPC_FAILED; 1206 rpcerr.re_errno = EIO; 1207 break; 1208 } 1209 1210 /* 1211 * NFS client failover support 1212 * 1213 * If the current server just failed us, we'll 1214 * start the process of finding a new server. 1215 * After that, we can just retry. 
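 */

/*
 * The retransmit backoff applied a few lines below doubles the timeout
 * each pass but never exceeds MAXTIMO (20 seconds worth of ticks).
 * Worked example with hz = 100 (so MAXTIMO == 2000), starting from
 * 150 ticks:
 */
#if 0	/* illustrative sketch only; not compiled */
{
	int t = 150;

	t = backoff(t);		/* 300 */
	t = backoff(t);		/* 600 */
	t = backoff(t);		/* 1200 */
	t = backoff(t);		/* 2400 would exceed MAXTIMO: clamped to 2000 */
	t = backoff(t);		/* stays at 2000 from here on */
}
#endif

/*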
1216 */ 1217 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1218 if (svp == mi->mi_curr_serv) 1219 failover_newserver(mi); 1220 clfree_impl(client, ch, nfscl); 1221 goto failoverretry; 1222 } 1223 1224 tryagain = TRUE; 1225 timeo = backoff(timeo); 1226 mutex_enter(&mi->mi_lock); 1227 if (!(mi->mi_flags & MI_PRINTED)) { 1228 mi->mi_flags |= MI_PRINTED; 1229 mutex_exit(&mi->mi_lock); 1230 #ifdef DEBUG 1231 zprintf(zoneid, 1232 "NFS%d server %s not responding still trying\n", 1233 mi->mi_vers, svp->sv_hostname); 1234 #else 1235 zprintf(zoneid, 1236 "NFS server %s not responding still trying\n", 1237 svp->sv_hostname); 1238 #endif 1239 } else 1240 mutex_exit(&mi->mi_lock); 1241 if (*douprintf && nfs_has_ctty()) { 1242 *douprintf = 0; 1243 if (!(mi->mi_flags & MI_NOPRINT)) 1244 #ifdef DEBUG 1245 uprintf( 1246 "NFS%d server %s not responding still trying\n", 1247 mi->mi_vers, svp->sv_hostname); 1248 #else 1249 uprintf( 1250 "NFS server %s not responding still trying\n", 1251 svp->sv_hostname); 1252 #endif 1253 } 1254 1255 /* 1256 * If doing dynamic adjustment of transfer 1257 * size and if it's a read or write call 1258 * and if the transfer size changed while 1259 * retransmitting or if the feedback routine 1260 * changed the transfer size, 1261 * then exit rfscall so that the transfer 1262 * size can be adjusted at the vnops level. 1263 */ 1264 if ((mi->mi_flags & MI_DYNAMIC) && 1265 mi->mi_timer_type[which] != 0 && 1266 (mi->mi_curread != my_rsize || 1267 mi->mi_curwrite != my_wsize || 1268 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1269 /* 1270 * On read or write calls, return 1271 * back to the vnode ops level if 1272 * the transfer size changed. 1273 */ 1274 clfree_impl(client, ch, nfscl); 1275 if (cred_cloned) 1276 crfree(cr); 1277 return (ENFS_TRYAGAIN); 1278 } 1279 } 1280 } while (tryagain); 1281 1282 if (status != RPC_SUCCESS) { 1283 /* 1284 * Let soft mounts use the timed out message. 1285 */ 1286 if (status == RPC_INPROGRESS) 1287 status = RPC_TIMEDOUT; 1288 nfscl->nfscl_stat.badcalls.value.ui64++; 1289 if (status != RPC_INTR) { 1290 mutex_enter(&mi->mi_lock); 1291 mi->mi_flags |= MI_DOWN; 1292 mutex_exit(&mi->mi_lock); 1293 CLNT_GETERR(client, &rpcerr); 1294 #ifdef DEBUG 1295 bufp = clnt_sperror(client, svp->sv_hostname); 1296 zprintf(zoneid, "NFS%d %s failed for %s\n", 1297 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1298 if (nfs_has_ctty()) { 1299 if (!(mi->mi_flags & MI_NOPRINT)) { 1300 uprintf("NFS%d %s failed for %s\n", 1301 mi->mi_vers, mi->mi_rfsnames[which], 1302 bufp); 1303 } 1304 } 1305 kmem_free(bufp, MAXPATHLEN); 1306 #else 1307 zprintf(zoneid, 1308 "NFS %s failed for server %s: error %d (%s)\n", 1309 mi->mi_rfsnames[which], svp->sv_hostname, 1310 status, clnt_sperrno(status)); 1311 if (nfs_has_ctty()) { 1312 if (!(mi->mi_flags & MI_NOPRINT)) { 1313 uprintf( 1314 "NFS %s failed for server %s: error %d (%s)\n", 1315 mi->mi_rfsnames[which], 1316 svp->sv_hostname, status, 1317 clnt_sperrno(status)); 1318 } 1319 } 1320 #endif 1321 /* 1322 * when CLNT_CALL() fails with RPC_AUTHERROR, 1323 * re_errno is set appropriately depending on 1324 * the authentication error 1325 */ 1326 if (status == RPC_VERSMISMATCH || 1327 status == RPC_PROGVERSMISMATCH) 1328 rpcerr.re_errno = EIO; 1329 } 1330 } else { 1331 /* 1332 * Test the value of mi_down and mi_printed without 1333 * holding the mi_lock mutex. If they are both zero, 1334 * then it is okay to skip the down and printed 1335 * processing. This saves on a mutex_enter and 1336 * mutex_exit pair for a normal, successful RPC. 
1337 * This was just complete overhead. 1338 */ 1339 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1340 mutex_enter(&mi->mi_lock); 1341 mi->mi_flags &= ~MI_DOWN; 1342 if (mi->mi_flags & MI_PRINTED) { 1343 mi->mi_flags &= ~MI_PRINTED; 1344 mutex_exit(&mi->mi_lock); 1345 #ifdef DEBUG 1346 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1347 zprintf(zoneid, "NFS%d server %s ok\n", 1348 mi->mi_vers, svp->sv_hostname); 1349 #else 1350 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1351 zprintf(zoneid, "NFS server %s ok\n", 1352 svp->sv_hostname); 1353 #endif 1354 } else 1355 mutex_exit(&mi->mi_lock); 1356 } 1357 1358 if (*douprintf == 0) { 1359 if (!(mi->mi_flags & MI_NOPRINT)) 1360 #ifdef DEBUG 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1362 uprintf("NFS%d server %s ok\n", 1363 mi->mi_vers, svp->sv_hostname); 1364 #else 1365 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1366 uprintf("NFS server %s ok\n", svp->sv_hostname); 1367 #endif 1368 *douprintf = 1; 1369 } 1370 } 1371 1372 clfree_impl(client, ch, nfscl); 1373 if (cred_cloned) 1374 crfree(cr); 1375 1376 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1377 1378 if (rpc_status != NULL) 1379 *rpc_status = rpcerr.re_status; 1380 1381 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1382 rpcerr.re_errno); 1383 1384 return (rpcerr.re_errno); 1385 } 1386 1387 #ifdef DEBUG 1388 static int acl2call_hits = 0; 1389 static int acl2call_misses = 0; 1390 #endif 1391 1392 int 1393 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1394 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1395 enum nfsstat *statusp, int flags, failinfo_t *fi) 1396 { 1397 int rpcerror; 1398 1399 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1400 cr, douprintf, flags, fi); 1401 if (!rpcerror) { 1402 /* 1403 * See comments with crnetadjust(). 1404 */ 1405 if (*statusp == NFSERR_ACCES && 1406 (cr = crnetadjust(cr)) != NULL) { 1407 #ifdef DEBUG 1408 acl2call_hits++; 1409 #endif 1410 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1411 resp, cr, douprintf, flags, fi); 1412 crfree(cr); 1413 #ifdef DEBUG 1414 if (*statusp == NFSERR_ACCES) 1415 acl2call_misses++; 1416 #endif 1417 } 1418 } 1419 1420 return (rpcerror); 1421 } 1422 1423 #ifdef DEBUG 1424 static int acl3call_hits = 0; 1425 static int acl3call_misses = 0; 1426 #endif 1427 1428 int 1429 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1430 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1431 nfsstat3 *statusp, int flags, failinfo_t *fi) 1432 { 1433 int rpcerror; 1434 int user_informed; 1435 1436 user_informed = 0; 1437 1438 do { 1439 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1440 cr, douprintf, flags, fi); 1441 if (!rpcerror) { 1442 cred_t *crr; 1443 if (*statusp == NFS3ERR_JUKEBOX) { 1444 if (!user_informed) { 1445 user_informed = 1; 1446 uprintf( 1447 "file temporarily unavailable on the server, retrying...\n"); 1448 } 1449 delay(nfs3_jukebox_delay); 1450 } 1451 /* 1452 * See crnetadjust() for comments. 
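 */

/*
 * Shape of the NFS3ERR_JUKEBOX handling shared by rfs3call() and
 * acl3call() above: tell the user once, wait nfs3_jukebox_delay ticks,
 * then reissue the request until the server stops answering "jukebox".
 * do_nfs3_call() is a hypothetical stand-in for the rfscall()/aclcall()
 * invocation; everything else mirrors the loops above.
 */
#if 0	/* illustrative sketch only; not compiled */
static int
jukebox_retry_sketch(mntinfo_t *mi, nfsstat3 *statusp)
{
	int error;
	int user_informed = 0;

	do {
		error = do_nfs3_call(mi, statusp);	/* hypothetical */
		if (!error && *statusp == NFS3ERR_JUKEBOX) {
			if (!user_informed) {
				user_informed = 1;
				uprintf("file temporarily unavailable on "
				    "the server, retrying...\n");
			}
			delay(nfs3_jukebox_delay);
		}
	} while (!error && *statusp == NFS3ERR_JUKEBOX);
	return (error);
}
#endif

/*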
1453 */ 1454 else if (*statusp == NFS3ERR_ACCES && 1455 (crr = crnetadjust(cr)) != NULL) { 1456 #ifdef DEBUG 1457 acl3call_hits++; 1458 #endif 1459 rpcerror = aclcall(mi, which, xdrargs, argsp, 1460 xdrres, resp, crr, douprintf, flags, fi); 1461 1462 crfree(crr); 1463 #ifdef DEBUG 1464 if (*statusp == NFS3ERR_ACCES) 1465 acl3call_misses++; 1466 #endif 1467 } 1468 } 1469 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1470 1471 return (rpcerror); 1472 } 1473 1474 static int 1475 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1476 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1477 int flags, failinfo_t *fi) 1478 { 1479 CLIENT *client; 1480 struct chtab *ch; 1481 cred_t *cr = icr; 1482 bool_t cred_cloned = FALSE; 1483 enum clnt_stat status; 1484 struct rpc_err rpcerr; 1485 struct timeval wait; 1486 int timeo; /* in units of hz */ 1487 #if 0 /* notyet */ 1488 int my_rsize, my_wsize; 1489 #endif 1490 bool_t tryagain; 1491 k_sigset_t smask; 1492 servinfo_t *svp; 1493 struct nfs_clnt *nfscl; 1494 zoneid_t zoneid = getzoneid(); 1495 #ifdef DEBUG 1496 char *bufp; 1497 #endif 1498 1499 #if 0 /* notyet */ 1500 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1501 "rfscall_start:which %d mi %p", which, mi); 1502 #endif 1503 1504 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1505 ASSERT(nfscl != NULL); 1506 1507 nfscl->nfscl_stat.calls.value.ui64++; 1508 mi->mi_aclreqs[which].value.ui64++; 1509 1510 rpcerr.re_status = RPC_SUCCESS; 1511 1512 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1513 rpcerr.re_status = RPC_FAILED; 1514 rpcerr.re_errno = EIO; 1515 return (rpcerr.re_errno); 1516 } 1517 1518 #if 0 /* notyet */ 1519 /* 1520 * Remember the transfer sizes in case 1521 * nfs_feedback changes them underneath us. 1522 */ 1523 my_rsize = mi->mi_curread; 1524 my_wsize = mi->mi_curwrite; 1525 #endif 1526 1527 /* 1528 * NFS client failover support 1529 * 1530 * If this rnode is not in sync with the current server (VALID_FH), 1531 * we'd like to do a remap to get in sync. We can be interrupted 1532 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1533 * use the best info we have to try the RPC. Part of that is 1534 * unconditionally updating the filehandle copy kept for V3. 1535 * 1536 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1537 * rw_enter(); we're trying to keep the current server from being 1538 * changed on us until we're done with the remapping and have a 1539 * matching client handle. We don't want to sending a filehandle 1540 * to the wrong host. 1541 */ 1542 failoverretry: 1543 if (FAILOVER_MOUNT(mi)) { 1544 mutex_enter(&mi->mi_lock); 1545 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1546 if (failover_wait(mi)) { 1547 mutex_exit(&mi->mi_lock); 1548 return (EINTR); 1549 } 1550 } 1551 INC_READERS(mi); 1552 mutex_exit(&mi->mi_lock); 1553 if (fi) { 1554 if (!VALID_FH(fi) && 1555 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1556 int remaperr; 1557 1558 svp = mi->mi_curr_serv; 1559 remaperr = failover_remap(fi); 1560 if (remaperr != 0) { 1561 #ifdef DEBUG 1562 if (remaperr != EINTR) 1563 nfs_cmn_err(remaperr, CE_WARN, 1564 "aclcall couldn't failover: %m"); 1565 #endif 1566 mutex_enter(&mi->mi_lock); 1567 DEC_READERS(mi); 1568 mutex_exit(&mi->mi_lock); 1569 1570 /* 1571 * If failover_remap returns ETIMEDOUT 1572 * and the filesystem is hard mounted 1573 * we have to retry the call with a new 1574 * server. 
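 */

/*
 * The IS_RECOVERABLE_ERROR() test applied just below treats only EINTR
 * and EIO as fatal; anything else (notably ETIMEDOUT and ECONNRESET)
 * lets a hard mount pick a new server and retry.  Sketch of the macro's
 * behavior, following its definition near the top of this file:
 */
#if 0	/* illustrative sketch only; not compiled */
ASSERT(IS_RECOVERABLE_ERROR(ETIMEDOUT));	/* retry on a new server */
ASSERT(IS_RECOVERABLE_ERROR(ECONNRESET));	/* also retryable */
ASSERT(!IS_RECOVERABLE_ERROR(EINTR));		/* caller was interrupted */
ASSERT(!IS_RECOVERABLE_ERROR(EIO));		/* hard failure */
#endif

/*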
1575 */ 1576 if ((mi->mi_flags & MI_HARD) && 1577 IS_RECOVERABLE_ERROR(remaperr)) { 1578 if (svp == mi->mi_curr_serv) 1579 failover_newserver(mi); 1580 rpcerr.re_status = RPC_SUCCESS; 1581 goto failoverretry; 1582 } 1583 return (remaperr); 1584 } 1585 } 1586 if (fi->fhp && fi->copyproc) 1587 (*fi->copyproc)(fi->fhp, fi->vp); 1588 } 1589 } 1590 1591 /* For TSOL, use a new cred which has net_mac_aware flag */ 1592 if (!cred_cloned && is_system_labeled()) { 1593 cred_cloned = TRUE; 1594 cr = crdup(icr); 1595 (void) setpflags(NET_MAC_AWARE, 1, cr); 1596 } 1597 1598 /* 1599 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1600 * are guaranteed to reprocess the retry as a new request. 1601 */ 1602 svp = mi->mi_curr_serv; 1603 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1604 if (FAILOVER_MOUNT(mi)) { 1605 mutex_enter(&mi->mi_lock); 1606 DEC_READERS(mi); 1607 mutex_exit(&mi->mi_lock); 1608 1609 if ((rpcerr.re_errno == ETIMEDOUT || 1610 rpcerr.re_errno == ECONNRESET) && 1611 failover_safe(fi)) { 1612 if (svp == mi->mi_curr_serv) 1613 failover_newserver(mi); 1614 goto failoverretry; 1615 } 1616 } 1617 if (rpcerr.re_errno != 0) { 1618 if (cred_cloned) 1619 crfree(cr); 1620 return (rpcerr.re_errno); 1621 } 1622 1623 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1624 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1625 timeo = (mi->mi_timeo * hz) / 10; 1626 } else { 1627 mutex_enter(&mi->mi_lock); 1628 timeo = CLNT_SETTIMERS(client, 1629 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1630 &(mi->mi_timers[NFS_CALLTYPES]), 1631 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1632 (void (*)()) 0, (caddr_t)mi, 0); 1633 mutex_exit(&mi->mi_lock); 1634 } 1635 1636 /* 1637 * If hard mounted fs, retry call forever unless hard error occurs. 1638 */ 1639 do { 1640 tryagain = FALSE; 1641 1642 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1643 status = RPC_FAILED; 1644 rpcerr.re_status = RPC_FAILED; 1645 rpcerr.re_errno = EIO; 1646 break; 1647 } 1648 1649 TICK_TO_TIMEVAL(timeo, &wait); 1650 1651 /* 1652 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1653 * and SIGTERM. (Preserving the existing masks). 1654 * Mask out SIGINT if mount option nointr is specified. 1655 */ 1656 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1657 if (!(mi->mi_flags & MI_INT)) 1658 client->cl_nosignal = TRUE; 1659 1660 /* 1661 * If there is a current signal, then don't bother 1662 * even trying to send out the request because we 1663 * won't be able to block waiting for the response. 1664 * Simply assume RPC_INTR and get on with it. 1665 */ 1666 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1667 status = RPC_INTR; 1668 else { 1669 status = CLNT_CALL(client, which, xdrargs, argsp, 1670 xdrres, resp, wait); 1671 } 1672 1673 if (!(mi->mi_flags & MI_INT)) 1674 client->cl_nosignal = FALSE; 1675 /* 1676 * restore original signal mask 1677 */ 1678 sigunintr(&smask); 1679 1680 switch (status) { 1681 case RPC_SUCCESS: 1682 #if 0 /* notyet */ 1683 if ((mi->mi_flags & MI_DYNAMIC) && 1684 mi->mi_timer_type[which] != 0 && 1685 (mi->mi_curread != my_rsize || 1686 mi->mi_curwrite != my_wsize)) 1687 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1688 #endif 1689 break; 1690 1691 /* 1692 * Unfortunately, there are servers in the world which 1693 * are not coded correctly. They are not prepared to 1694 * handle RPC requests to the NFS port which are not 1695 * NFS requests. Thus, they may try to process the 1696 * NFS_ACL request as if it were an NFS request. This 1697 * does not work. 
Generally, an error will be generated 1698 * on the client because it will not be able to decode 1699 * the response from the server. However, it seems 1700 * possible that the server may not be able to decode 1701 * the arguments. Thus, the criteria for deciding 1702 * whether the server supports NFS_ACL or not is whether 1703 * the following RPC errors are returned from CLNT_CALL. 1704 */ 1705 case RPC_CANTDECODERES: 1706 case RPC_PROGUNAVAIL: 1707 case RPC_CANTDECODEARGS: 1708 case RPC_PROGVERSMISMATCH: 1709 mutex_enter(&mi->mi_lock); 1710 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1711 mutex_exit(&mi->mi_lock); 1712 break; 1713 1714 /* 1715 * If the server supports NFS_ACL but not the new ops 1716 * for extended attributes, make sure we don't retry. 1717 */ 1718 case RPC_PROCUNAVAIL: 1719 mutex_enter(&mi->mi_lock); 1720 mi->mi_flags &= ~MI_EXTATTR; 1721 mutex_exit(&mi->mi_lock); 1722 break; 1723 1724 case RPC_INTR: 1725 /* 1726 * There is no way to recover from this error, 1727 * even if mount option nointr is specified. 1728 * SIGKILL, for example, cannot be blocked. 1729 */ 1730 rpcerr.re_status = RPC_INTR; 1731 rpcerr.re_errno = EINTR; 1732 break; 1733 1734 case RPC_UDERROR: 1735 /* 1736 * If the NFS server is local (vold) and 1737 * it goes away then we get RPC_UDERROR. 1738 * This is a retryable error, so we would 1739 * loop, so check to see if the specific 1740 * error was ECONNRESET, indicating that 1741 * target did not exist at all. If so, 1742 * return with RPC_PROGUNAVAIL and 1743 * ECONNRESET to indicate why. 1744 */ 1745 CLNT_GETERR(client, &rpcerr); 1746 if (rpcerr.re_errno == ECONNRESET) { 1747 rpcerr.re_status = RPC_PROGUNAVAIL; 1748 rpcerr.re_errno = ECONNRESET; 1749 break; 1750 } 1751 /*FALLTHROUGH*/ 1752 1753 default: /* probably RPC_TIMEDOUT */ 1754 if (IS_UNRECOVERABLE_RPC(status)) 1755 break; 1756 1757 /* 1758 * increment server not responding count 1759 */ 1760 mutex_enter(&mi->mi_lock); 1761 mi->mi_noresponse++; 1762 mutex_exit(&mi->mi_lock); 1763 #ifdef DEBUG 1764 nfscl->nfscl_stat.noresponse.value.ui64++; 1765 #endif 1766 1767 if (!(mi->mi_flags & MI_HARD)) { 1768 if (!(mi->mi_flags & MI_SEMISOFT) || 1769 (mi->mi_acl_ss_call_type[which] == 0)) 1770 break; 1771 } 1772 1773 /* 1774 * The call is in progress (over COTS). 1775 * Try the CLNT_CALL again, but don't 1776 * print a noisy error message. 1777 */ 1778 if (status == RPC_INPROGRESS) { 1779 tryagain = TRUE; 1780 break; 1781 } 1782 1783 if (flags & RFSCALL_SOFT) 1784 break; 1785 1786 /* 1787 * On zone shutdown, just move on. 1788 */ 1789 if (zone_status_get(curproc->p_zone) >= 1790 ZONE_IS_SHUTTING_DOWN) { 1791 rpcerr.re_status = RPC_FAILED; 1792 rpcerr.re_errno = EIO; 1793 break; 1794 } 1795 1796 /* 1797 * NFS client failover support 1798 * 1799 * If the current server just failed us, we'll 1800 * start the process of finding a new server. 1801 * After that, we can just retry. 
1802 */ 1803 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1804 if (svp == mi->mi_curr_serv) 1805 failover_newserver(mi); 1806 clfree_impl(client, ch, nfscl); 1807 goto failoverretry; 1808 } 1809 1810 tryagain = TRUE; 1811 timeo = backoff(timeo); 1812 mutex_enter(&mi->mi_lock); 1813 if (!(mi->mi_flags & MI_PRINTED)) { 1814 mi->mi_flags |= MI_PRINTED; 1815 mutex_exit(&mi->mi_lock); 1816 #ifdef DEBUG 1817 zprintf(zoneid, 1818 "NFS_ACL%d server %s not responding still trying\n", 1819 mi->mi_vers, svp->sv_hostname); 1820 #else 1821 zprintf(zoneid, 1822 "NFS server %s not responding still trying\n", 1823 svp->sv_hostname); 1824 #endif 1825 } else 1826 mutex_exit(&mi->mi_lock); 1827 if (*douprintf && nfs_has_ctty()) { 1828 *douprintf = 0; 1829 if (!(mi->mi_flags & MI_NOPRINT)) 1830 #ifdef DEBUG 1831 uprintf( 1832 "NFS_ACL%d server %s not responding still trying\n", 1833 mi->mi_vers, svp->sv_hostname); 1834 #else 1835 uprintf( 1836 "NFS server %s not responding still trying\n", 1837 svp->sv_hostname); 1838 #endif 1839 } 1840 1841 #if 0 /* notyet */ 1842 /* 1843 * If doing dynamic adjustment of transfer 1844 * size and if it's a read or write call 1845 * and if the transfer size changed while 1846 * retransmitting or if the feedback routine 1847 * changed the transfer size, 1848 * then exit rfscall so that the transfer 1849 * size can be adjusted at the vnops level. 1850 */ 1851 if ((mi->mi_flags & MI_DYNAMIC) && 1852 mi->mi_acl_timer_type[which] != 0 && 1853 (mi->mi_curread != my_rsize || 1854 mi->mi_curwrite != my_wsize || 1855 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1856 /* 1857 * On read or write calls, return 1858 * back to the vnode ops level if 1859 * the transfer size changed. 1860 */ 1861 clfree_impl(client, ch, nfscl); 1862 if (cred_cloned) 1863 crfree(cr); 1864 return (ENFS_TRYAGAIN); 1865 } 1866 #endif 1867 } 1868 } while (tryagain); 1869 1870 if (status != RPC_SUCCESS) { 1871 /* 1872 * Let soft mounts use the timed out message. 
1873 */ 1874 if (status == RPC_INPROGRESS) 1875 status = RPC_TIMEDOUT; 1876 nfscl->nfscl_stat.badcalls.value.ui64++; 1877 if (status == RPC_CANTDECODERES || 1878 status == RPC_PROGUNAVAIL || 1879 status == RPC_PROCUNAVAIL || 1880 status == RPC_CANTDECODEARGS || 1881 status == RPC_PROGVERSMISMATCH) 1882 CLNT_GETERR(client, &rpcerr); 1883 else if (status != RPC_INTR) { 1884 mutex_enter(&mi->mi_lock); 1885 mi->mi_flags |= MI_DOWN; 1886 mutex_exit(&mi->mi_lock); 1887 CLNT_GETERR(client, &rpcerr); 1888 #ifdef DEBUG 1889 bufp = clnt_sperror(client, svp->sv_hostname); 1890 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1891 mi->mi_vers, mi->mi_aclnames[which], bufp); 1892 if (nfs_has_ctty()) { 1893 if (!(mi->mi_flags & MI_NOPRINT)) { 1894 uprintf("NFS_ACL%d %s failed for %s\n", 1895 mi->mi_vers, mi->mi_aclnames[which], 1896 bufp); 1897 } 1898 } 1899 kmem_free(bufp, MAXPATHLEN); 1900 #else 1901 zprintf(zoneid, 1902 "NFS %s failed for server %s: error %d (%s)\n", 1903 mi->mi_aclnames[which], svp->sv_hostname, 1904 status, clnt_sperrno(status)); 1905 if (nfs_has_ctty()) { 1906 if (!(mi->mi_flags & MI_NOPRINT)) 1907 uprintf( 1908 "NFS %s failed for server %s: error %d (%s)\n", 1909 mi->mi_aclnames[which], 1910 svp->sv_hostname, status, 1911 clnt_sperrno(status)); 1912 } 1913 #endif 1914 /* 1915 * when CLNT_CALL() fails with RPC_AUTHERROR, 1916 * re_errno is set appropriately depending on 1917 * the authentication error 1918 */ 1919 if (status == RPC_VERSMISMATCH || 1920 status == RPC_PROGVERSMISMATCH) 1921 rpcerr.re_errno = EIO; 1922 } 1923 } else { 1924 /* 1925 * Test the value of mi_down and mi_printed without 1926 * holding the mi_lock mutex. If they are both zero, 1927 * then it is okay to skip the down and printed 1928 * processing. This saves on a mutex_enter and 1929 * mutex_exit pair for a normal, successful RPC. 1930 * This was just complete overhead. 
1931 */ 1932 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1933 mutex_enter(&mi->mi_lock); 1934 mi->mi_flags &= ~MI_DOWN; 1935 if (mi->mi_flags & MI_PRINTED) { 1936 mi->mi_flags &= ~MI_PRINTED; 1937 mutex_exit(&mi->mi_lock); 1938 #ifdef DEBUG 1939 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1940 mi->mi_vers, svp->sv_hostname); 1941 #else 1942 zprintf(zoneid, "NFS server %s ok\n", 1943 svp->sv_hostname); 1944 #endif 1945 } else 1946 mutex_exit(&mi->mi_lock); 1947 } 1948 1949 if (*douprintf == 0) { 1950 if (!(mi->mi_flags & MI_NOPRINT)) 1951 #ifdef DEBUG 1952 uprintf("NFS_ACL%d server %s ok\n", 1953 mi->mi_vers, svp->sv_hostname); 1954 #else 1955 uprintf("NFS server %s ok\n", svp->sv_hostname); 1956 #endif 1957 *douprintf = 1; 1958 } 1959 } 1960 1961 clfree_impl(client, ch, nfscl); 1962 if (cred_cloned) 1963 crfree(cr); 1964 1965 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1966 1967 #if 0 /* notyet */ 1968 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1969 rpcerr.re_errno); 1970 #endif 1971 1972 return (rpcerr.re_errno); 1973 } 1974 1975 int 1976 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1977 { 1978 uint_t mask = vap->va_mask; 1979 1980 if (!(mask & AT_MODE)) 1981 sa->sa_mode = (uint32_t)-1; 1982 else 1983 sa->sa_mode = vap->va_mode; 1984 if (!(mask & AT_UID)) 1985 sa->sa_uid = (uint32_t)-1; 1986 else 1987 sa->sa_uid = (uint32_t)vap->va_uid; 1988 if (!(mask & AT_GID)) 1989 sa->sa_gid = (uint32_t)-1; 1990 else 1991 sa->sa_gid = (uint32_t)vap->va_gid; 1992 if (!(mask & AT_SIZE)) 1993 sa->sa_size = (uint32_t)-1; 1994 else 1995 sa->sa_size = (uint32_t)vap->va_size; 1996 if (!(mask & AT_ATIME)) 1997 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 1998 else { 1999 /* check time validity */ 2000 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2001 return (EOVERFLOW); 2002 } 2003 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2004 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2005 } 2006 if (!(mask & AT_MTIME)) 2007 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2008 else { 2009 /* check time validity */ 2010 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2011 return (EOVERFLOW); 2012 } 2013 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2014 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2015 } 2016 return (0); 2017 } 2018 2019 int 2020 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2021 { 2022 uint_t mask = vap->va_mask; 2023 2024 if (!(mask & AT_MODE)) 2025 sa->mode.set_it = FALSE; 2026 else { 2027 sa->mode.set_it = TRUE; 2028 sa->mode.mode = (mode3)vap->va_mode; 2029 } 2030 if (!(mask & AT_UID)) 2031 sa->uid.set_it = FALSE; 2032 else { 2033 sa->uid.set_it = TRUE; 2034 sa->uid.uid = (uid3)vap->va_uid; 2035 } 2036 if (!(mask & AT_GID)) 2037 sa->gid.set_it = FALSE; 2038 else { 2039 sa->gid.set_it = TRUE; 2040 sa->gid.gid = (gid3)vap->va_gid; 2041 } 2042 if (!(mask & AT_SIZE)) 2043 sa->size.set_it = FALSE; 2044 else { 2045 sa->size.set_it = TRUE; 2046 sa->size.size = (size3)vap->va_size; 2047 } 2048 if (!(mask & AT_ATIME)) 2049 sa->atime.set_it = DONT_CHANGE; 2050 else { 2051 /* check time validity */ 2052 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2053 return (EOVERFLOW); 2054 } 2055 sa->atime.set_it = SET_TO_CLIENT_TIME; 2056 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2057 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2058 } 2059 if (!(mask & AT_MTIME)) 2060 sa->mtime.set_it = DONT_CHANGE; 2061 else { 2062 /* check time validity */ 2063 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2064 return (EOVERFLOW); 2065 } 2066 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2067 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2068 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2069 } 2070 return (0); 2071 } 2072 2073 void 2074 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2075 { 2076 2077 da->da_fhandle = VTOFH(dvp); 2078 da->da_name = nm; 2079 da->da_flags = 0; 2080 } 2081 2082 void 2083 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2084 { 2085 2086 da->dirp = VTOFH3(dvp); 2087 da->name = nm; 2088 } 2089 2090 int 2091 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2092 { 2093 int error; 2094 rnode_t *rp; 2095 struct vattr va; 2096 2097 va.va_mask = AT_MODE | AT_GID; 2098 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2099 if (error) 2100 return (error); 2101 2102 /* 2103 * To determine the expected group-id of the created file: 2104 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2105 * GRPID option, and the directory's set-gid bit is clear, 2106 * then use the process's gid. 2107 * 2) Otherwise, set the group-id to the gid of the parent directory. 2108 */ 2109 rp = VTOR(dvp); 2110 mutex_enter(&rp->r_statelock); 2111 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2112 *gidp = crgetgid(cr); 2113 else 2114 *gidp = va.va_gid; 2115 mutex_exit(&rp->r_statelock); 2116 return (0); 2117 } 2118 2119 int 2120 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2121 { 2122 int error; 2123 struct vattr va; 2124 2125 va.va_mask = AT_MODE; 2126 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2127 if (error) 2128 return (error); 2129 2130 /* 2131 * Modify the expected mode (om) so that the set-gid bit matches 2132 * that of the parent directory (dvp). 2133 */ 2134 if (va.va_mode & VSGID) 2135 *omp |= VSGID; 2136 else 2137 *omp &= ~VSGID; 2138 return (0); 2139 } 2140 2141 void 2142 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2143 { 2144 2145 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2146 if (!(vp->v_flag & VSWAPLIKE)) { 2147 mutex_enter(&vp->v_lock); 2148 vp->v_flag |= VSWAPLIKE; 2149 mutex_exit(&vp->v_lock); 2150 } 2151 } else { 2152 if (vp->v_flag & VSWAPLIKE) { 2153 mutex_enter(&vp->v_lock); 2154 vp->v_flag &= ~VSWAPLIKE; 2155 mutex_exit(&vp->v_lock); 2156 } 2157 } 2158 } 2159 2160 /* 2161 * Free the resources associated with an rnode. 2162 */ 2163 static void 2164 rinactive(rnode_t *rp, cred_t *cr) 2165 { 2166 vnode_t *vp; 2167 cred_t *cred; 2168 char *contents; 2169 int size; 2170 vsecattr_t *vsp; 2171 int error; 2172 nfs3_pathconf_info *info; 2173 2174 /* 2175 * Before freeing anything, wait until all asynchronous 2176 * activity is done on this rnode. This will allow all 2177 * asynchronous read ahead and write behind i/o's to 2178 * finish. 2179 */ 2180 mutex_enter(&rp->r_statelock); 2181 while (rp->r_count > 0) 2182 cv_wait(&rp->r_cv, &rp->r_statelock); 2183 mutex_exit(&rp->r_statelock); 2184 2185 /* 2186 * Flush and invalidate all pages associated with the vnode. 
2187 */ 2188 vp = RTOV(rp); 2189 if (vn_has_cached_data(vp)) { 2190 ASSERT(vp->v_type != VCHR); 2191 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2192 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2193 if (error && (error == ENOSPC || error == EDQUOT)) { 2194 mutex_enter(&rp->r_statelock); 2195 if (!rp->r_error) 2196 rp->r_error = error; 2197 mutex_exit(&rp->r_statelock); 2198 } 2199 } 2200 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2201 } 2202 2203 /* 2204 * Free any held credentials and caches which may be associated 2205 * with this rnode. 2206 */ 2207 mutex_enter(&rp->r_statelock); 2208 cred = rp->r_cred; 2209 rp->r_cred = NULL; 2210 contents = rp->r_symlink.contents; 2211 size = rp->r_symlink.size; 2212 rp->r_symlink.contents = NULL; 2213 vsp = rp->r_secattr; 2214 rp->r_secattr = NULL; 2215 info = rp->r_pathconf; 2216 rp->r_pathconf = NULL; 2217 mutex_exit(&rp->r_statelock); 2218 2219 /* 2220 * Free the held credential. 2221 */ 2222 if (cred != NULL) 2223 crfree(cred); 2224 2225 /* 2226 * Free the access cache entries. 2227 */ 2228 (void) nfs_access_purge_rp(rp); 2229 2230 /* 2231 * Free the readdir cache entries. 2232 */ 2233 if (HAVE_RDDIR_CACHE(rp)) 2234 nfs_purge_rddir_cache(vp); 2235 2236 /* 2237 * Free the symbolic link cache. 2238 */ 2239 if (contents != NULL) { 2240 2241 kmem_free((void *)contents, size); 2242 } 2243 2244 /* 2245 * Free any cached ACL. 2246 */ 2247 if (vsp != NULL) 2248 nfs_acl_free(vsp); 2249 2250 /* 2251 * Free any cached pathconf information. 2252 */ 2253 if (info != NULL) 2254 kmem_free(info, sizeof (*info)); 2255 } 2256 2257 /* 2258 * Return a vnode for the given NFS Version 2 file handle. 2259 * If no rnode exists for this fhandle, create one and put it 2260 * into the hash queues. If the rnode for this fhandle 2261 * already exists, return it. 2262 * 2263 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2264 */ 2265 vnode_t * 2266 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2267 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2268 { 2269 int newnode; 2270 int index; 2271 vnode_t *vp; 2272 nfs_fhandle nfh; 2273 vattr_t va; 2274 2275 nfh.fh_len = NFS_FHSIZE; 2276 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2277 2278 index = rtablehash(&nfh); 2279 rw_enter(&rtable[index].r_lock, RW_READER); 2280 2281 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2282 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2283 2284 if (attr != NULL) { 2285 if (!newnode) { 2286 rw_exit(&rtable[index].r_lock); 2287 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2288 } else { 2289 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2290 vp->v_type = VBAD; 2291 else 2292 vp->v_type = n2v_type(attr); 2293 /* 2294 * A translation here seems to be necessary 2295 * because this function can be called 2296 * with `attr' that has come from the wire, 2297 * and been operated on by vattr_to_nattr(). 2298 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2299 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2300 * ->makenfsnode(). 2301 */ 2302 if ((attr->na_rdev & 0xffff0000) == 0) 2303 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2304 else 2305 vp->v_rdev = expldev(n2v_rdev(attr)); 2306 nfs_attrcache(vp, attr, t); 2307 rw_exit(&rtable[index].r_lock); 2308 } 2309 } else { 2310 if (newnode) { 2311 PURGE_ATTRCACHE(vp); 2312 } 2313 rw_exit(&rtable[index].r_lock); 2314 } 2315 2316 return (vp); 2317 } 2318 2319 /* 2320 * Return a vnode for the given NFS Version 3 file handle. 
2321 * If no rnode exists for this fhandle, create one and put it 2322 * into the hash queues. If the rnode for this fhandle 2323 * already exists, return it. 2324 * 2325 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2326 */ 2327 vnode_t * 2328 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2329 cred_t *cr, char *dnm, char *nm) 2330 { 2331 int newnode; 2332 int index; 2333 vnode_t *vp; 2334 2335 index = rtablehash((nfs_fhandle *)fh); 2336 rw_enter(&rtable[index].r_lock, RW_READER); 2337 2338 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2339 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2340 dnm, nm); 2341 2342 if (vap == NULL) { 2343 if (newnode) { 2344 PURGE_ATTRCACHE(vp); 2345 } 2346 rw_exit(&rtable[index].r_lock); 2347 return (vp); 2348 } 2349 2350 if (!newnode) { 2351 rw_exit(&rtable[index].r_lock); 2352 nfs_attr_cache(vp, vap, t, cr); 2353 } else { 2354 rnode_t *rp = VTOR(vp); 2355 2356 vp->v_type = vap->va_type; 2357 vp->v_rdev = vap->va_rdev; 2358 2359 mutex_enter(&rp->r_statelock); 2360 if (rp->r_mtime <= t) 2361 nfs_attrcache_va(vp, vap); 2362 mutex_exit(&rp->r_statelock); 2363 rw_exit(&rtable[index].r_lock); 2364 } 2365 2366 return (vp); 2367 } 2368 2369 vnode_t * 2370 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2371 cred_t *cr, char *dnm, char *nm) 2372 { 2373 int newnode; 2374 int index; 2375 vnode_t *vp; 2376 vattr_t va; 2377 2378 index = rtablehash((nfs_fhandle *)fh); 2379 rw_enter(&rtable[index].r_lock, RW_READER); 2380 2381 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2382 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2383 dnm, nm); 2384 2385 if (attr == NULL) { 2386 if (newnode) { 2387 PURGE_ATTRCACHE(vp); 2388 } 2389 rw_exit(&rtable[index].r_lock); 2390 return (vp); 2391 } 2392 2393 if (!newnode) { 2394 rw_exit(&rtable[index].r_lock); 2395 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2396 } else { 2397 if (attr->type < NF3REG || attr->type > NF3FIFO) 2398 vp->v_type = VBAD; 2399 else 2400 vp->v_type = nf3_to_vt[attr->type]; 2401 vp->v_rdev = makedevice(attr->rdev.specdata1, 2402 attr->rdev.specdata2); 2403 nfs3_attrcache(vp, attr, t); 2404 rw_exit(&rtable[index].r_lock); 2405 } 2406 2407 return (vp); 2408 } 2409 2410 /* 2411 * Read this comment before making changes to rtablehash()! 2412 * This is a hash function in which seemingly obvious and harmless 2413 * changes can cause escalations costing million dollars! 2414 * Know what you are doing. 2415 * 2416 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2417 * algorithm is currently detailed here: 2418 * 2419 * http://burtleburtle.net/bob/hash/doobs.html 2420 * 2421 * Of course, the above link may not be valid by the time you are reading 2422 * this, but suffice it to say that the one-at-a-time algorithm works well in 2423 * almost all cases. If you are changing the algorithm be sure to verify that 2424 * the hash algorithm still provides even distribution in all cases and with 2425 * any server returning filehandles in whatever order (sequential or random). 
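 *
 * The implementation below folds each byte of the filehandle into the
 * running hash and finishes with the usual avalanche steps before masking
 * the result with rtablemask (rtablesize is always a power of two), so the
 * bucket index depends on every byte of the handle.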
2426 */ 2427 static int 2428 rtablehash(nfs_fhandle *fh) 2429 { 2430 ulong_t hash, len, i; 2431 char *key; 2432 2433 key = fh->fh_buf; 2434 len = (ulong_t)fh->fh_len; 2435 for (hash = 0, i = 0; i < len; i++) { 2436 hash += key[i]; 2437 hash += (hash << 10); 2438 hash ^= (hash >> 6); 2439 } 2440 hash += (hash << 3); 2441 hash ^= (hash >> 11); 2442 hash += (hash << 15); 2443 return (hash & rtablemask); 2444 } 2445 2446 static vnode_t * 2447 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2448 struct vnodeops *vops, 2449 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2450 int (*compar)(const void *, const void *), 2451 int *newnode, cred_t *cr, char *dnm, char *nm) 2452 { 2453 rnode_t *rp; 2454 rnode_t *trp; 2455 vnode_t *vp; 2456 mntinfo_t *mi; 2457 2458 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2459 2460 mi = VFTOMI(vfsp); 2461 start: 2462 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2463 vp = RTOV(rp); 2464 nfs_set_vroot(vp); 2465 *newnode = 0; 2466 return (vp); 2467 } 2468 rw_exit(&rhtp->r_lock); 2469 2470 mutex_enter(&rpfreelist_lock); 2471 if (rpfreelist != NULL && rnew >= nrnode) { 2472 rp = rpfreelist; 2473 rp_rmfree(rp); 2474 mutex_exit(&rpfreelist_lock); 2475 2476 vp = RTOV(rp); 2477 2478 if (rp->r_flags & RHASHED) { 2479 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2480 mutex_enter(&vp->v_lock); 2481 if (vp->v_count > 1) { 2482 vp->v_count--; 2483 mutex_exit(&vp->v_lock); 2484 rw_exit(&rp->r_hashq->r_lock); 2485 rw_enter(&rhtp->r_lock, RW_READER); 2486 goto start; 2487 } 2488 mutex_exit(&vp->v_lock); 2489 rp_rmhash_locked(rp); 2490 rw_exit(&rp->r_hashq->r_lock); 2491 } 2492 2493 rinactive(rp, cr); 2494 2495 mutex_enter(&vp->v_lock); 2496 if (vp->v_count > 1) { 2497 vp->v_count--; 2498 mutex_exit(&vp->v_lock); 2499 rw_enter(&rhtp->r_lock, RW_READER); 2500 goto start; 2501 } 2502 mutex_exit(&vp->v_lock); 2503 vn_invalid(vp); 2504 /* 2505 * destroy old locks before bzero'ing and 2506 * recreating the locks below. 2507 */ 2508 nfs_rw_destroy(&rp->r_rwlock); 2509 nfs_rw_destroy(&rp->r_lkserlock); 2510 mutex_destroy(&rp->r_statelock); 2511 cv_destroy(&rp->r_cv); 2512 cv_destroy(&rp->r_commit.c_cv); 2513 nfs_free_r_path(rp); 2514 avl_destroy(&rp->r_dir); 2515 /* 2516 * Make sure that if rnode is recycled then 2517 * VFS count is decremented properly before 2518 * reuse. 
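		 * The VFS_HOLD() taken when this rnode was last set up is
		 * dropped here; a new hold on the new vfsp is taken further
		 * below.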
2519 */ 2520 VFS_RELE(vp->v_vfsp); 2521 vn_reinit(vp); 2522 } else { 2523 vnode_t *new_vp; 2524 2525 mutex_exit(&rpfreelist_lock); 2526 2527 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2528 new_vp = vn_alloc(KM_SLEEP); 2529 2530 atomic_add_long((ulong_t *)&rnew, 1); 2531 #ifdef DEBUG 2532 clstat_debug.nrnode.value.ui64++; 2533 #endif 2534 vp = new_vp; 2535 } 2536 2537 bzero(rp, sizeof (*rp)); 2538 rp->r_vnode = vp; 2539 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2540 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2541 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2542 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2543 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2544 rp->r_fh.fh_len = fh->fh_len; 2545 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2546 rp->r_server = mi->mi_curr_serv; 2547 if (FAILOVER_MOUNT(mi)) { 2548 /* 2549 * If replicated servers, stash pathnames 2550 */ 2551 if (dnm != NULL && nm != NULL) { 2552 char *s, *p; 2553 uint_t len; 2554 2555 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2556 rp->r_path = kmem_alloc(len, KM_SLEEP); 2557 #ifdef DEBUG 2558 clstat_debug.rpath.value.ui64 += len; 2559 #endif 2560 s = rp->r_path; 2561 for (p = dnm; *p; p++) 2562 *s++ = *p; 2563 *s++ = '/'; 2564 for (p = nm; *p; p++) 2565 *s++ = *p; 2566 *s = '\0'; 2567 } else { 2568 /* special case for root */ 2569 rp->r_path = kmem_alloc(2, KM_SLEEP); 2570 #ifdef DEBUG 2571 clstat_debug.rpath.value.ui64 += 2; 2572 #endif 2573 *rp->r_path = '.'; 2574 *(rp->r_path + 1) = '\0'; 2575 } 2576 } 2577 VFS_HOLD(vfsp); 2578 rp->r_putapage = putapage; 2579 rp->r_hashq = rhtp; 2580 rp->r_flags = RREADDIRPLUS; 2581 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2582 offsetof(rddir_cache, tree)); 2583 vn_setops(vp, vops); 2584 vp->v_data = (caddr_t)rp; 2585 vp->v_vfsp = vfsp; 2586 vp->v_type = VNON; 2587 nfs_set_vroot(vp); 2588 2589 /* 2590 * There is a race condition if someone else 2591 * alloc's the rnode while no locks are held, so we 2592 * check again and recover if found. 2593 */ 2594 rw_enter(&rhtp->r_lock, RW_WRITER); 2595 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2596 vp = RTOV(trp); 2597 nfs_set_vroot(vp); 2598 *newnode = 0; 2599 rw_exit(&rhtp->r_lock); 2600 rp_addfree(rp, cr); 2601 rw_enter(&rhtp->r_lock, RW_READER); 2602 return (vp); 2603 } 2604 rp_addhash(rp); 2605 *newnode = 1; 2606 return (vp); 2607 } 2608 2609 static void 2610 nfs_set_vroot(vnode_t *vp) 2611 { 2612 rnode_t *rp; 2613 nfs_fhandle *rootfh; 2614 2615 rp = VTOR(vp); 2616 rootfh = &rp->r_server->sv_fhandle; 2617 if (rootfh->fh_len == rp->r_fh.fh_len && 2618 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2619 if (!(vp->v_flag & VROOT)) { 2620 mutex_enter(&vp->v_lock); 2621 vp->v_flag |= VROOT; 2622 mutex_exit(&vp->v_lock); 2623 } 2624 } 2625 } 2626 2627 static void 2628 nfs_free_r_path(rnode_t *rp) 2629 { 2630 char *path; 2631 size_t len; 2632 2633 path = rp->r_path; 2634 if (path) { 2635 rp->r_path = NULL; 2636 len = strlen(path) + 1; 2637 kmem_free(path, len); 2638 #ifdef DEBUG 2639 clstat_debug.rpath.value.ui64 -= len; 2640 #endif 2641 } 2642 } 2643 2644 /* 2645 * Put an rnode on the free list. 2646 * 2647 * Rnodes which were allocated above and beyond the normal limit 2648 * are immediately freed. 
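 * The same is true of rnodes that are no longer hashed, that belong to a
 * vfs being unmounted, or that have recorded a write error.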
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
	vnode_t *vp;
	struct vfs *vfsp;

	vp = RTOV(rp);
	ASSERT(vp->v_count >= 1);
	ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

	/*
	 * If we have too many rnodes allocated and there are no
	 * references to this rnode, or if the rnode is no longer
	 * accessible because it does not reside in the hash queues,
	 * or if an i/o error occurred while writing to the file,
	 * then just free it instead of putting it on the rnode
	 * freelist.
	 */
	vfsp = vp->v_vfsp;
	if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
	    (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
		if (rp->r_flags & RHASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				return;
			}
			mutex_exit(&vp->v_lock);
			rp_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		rinactive(rp, cr);

		/*
		 * Recheck the vnode reference count.  We need to
		 * make sure that another reference has not been
		 * acquired while we were not holding v_lock.  The
		 * rnode is not in the rnode hash queues, so the
		 * only way for a reference to have been acquired
		 * is for a VOP_PUTPAGE, issued because the rnode
		 * was marked with RDIRTY or had a modified page.
		 * This reference may have been acquired before our
		 * call to rinactive.  The i/o may have been completed,
		 * thus allowing rinactive to complete, but the
		 * reference to the vnode may not have been released
		 * yet.  In any case, the rnode cannot be destroyed
		 * until the other references to this vnode have been
		 * released.  The other references will take care of
		 * either destroying the rnode or placing it on the
		 * rnode freelist.  If there are no other references,
		 * then the rnode may be safely destroyed.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			return;
		}
		mutex_exit(&vp->v_lock);

		destroy_rnode(rp);
		return;
	}

	/*
	 * Lock the hash queue and then recheck the reference count
	 * to ensure that no other thread has acquired a reference,
	 * which would indicate that the rnode should not be placed
	 * on the freelist.  If another reference has been acquired,
	 * then just release this one and let the other thread complete
	 * the processing of adding this rnode to the freelist.
	 */
	rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

	mutex_enter(&vp->v_lock);
	if (vp->v_count > 1) {
		vp->v_count--;
		mutex_exit(&vp->v_lock);
		rw_exit(&rp->r_hashq->r_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If there is no cached data or metadata for this file, then
	 * put the rnode on the front of the freelist so that it will
	 * be reused before other rnodes which may have cached data or
	 * metadata associated with them.
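	 * Rnodes that do have cached data or metadata are inserted just
	 * ahead of the current head (i.e., at the tail of the circular
	 * list) and so are reused last.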
2742 */ 2743 mutex_enter(&rpfreelist_lock); 2744 if (rpfreelist == NULL) { 2745 rp->r_freef = rp; 2746 rp->r_freeb = rp; 2747 rpfreelist = rp; 2748 } else { 2749 rp->r_freef = rpfreelist; 2750 rp->r_freeb = rpfreelist->r_freeb; 2751 rpfreelist->r_freeb->r_freef = rp; 2752 rpfreelist->r_freeb = rp; 2753 if (!vn_has_cached_data(vp) && 2754 !HAVE_RDDIR_CACHE(rp) && 2755 rp->r_symlink.contents == NULL && 2756 rp->r_secattr == NULL && 2757 rp->r_pathconf == NULL) 2758 rpfreelist = rp; 2759 } 2760 mutex_exit(&rpfreelist_lock); 2761 2762 rw_exit(&rp->r_hashq->r_lock); 2763 } 2764 2765 /* 2766 * Remove an rnode from the free list. 2767 * 2768 * The caller must be holding rpfreelist_lock and the rnode 2769 * must be on the freelist. 2770 */ 2771 static void 2772 rp_rmfree(rnode_t *rp) 2773 { 2774 2775 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2776 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2777 2778 if (rp == rpfreelist) { 2779 rpfreelist = rp->r_freef; 2780 if (rp == rpfreelist) 2781 rpfreelist = NULL; 2782 } 2783 2784 rp->r_freeb->r_freef = rp->r_freef; 2785 rp->r_freef->r_freeb = rp->r_freeb; 2786 2787 rp->r_freef = rp->r_freeb = NULL; 2788 } 2789 2790 /* 2791 * Put a rnode in the hash table. 2792 * 2793 * The caller must be holding the exclusive hash queue lock. 2794 */ 2795 static void 2796 rp_addhash(rnode_t *rp) 2797 { 2798 2799 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2800 ASSERT(!(rp->r_flags & RHASHED)); 2801 2802 rp->r_hashf = rp->r_hashq->r_hashf; 2803 rp->r_hashq->r_hashf = rp; 2804 rp->r_hashb = (rnode_t *)rp->r_hashq; 2805 rp->r_hashf->r_hashb = rp; 2806 2807 mutex_enter(&rp->r_statelock); 2808 rp->r_flags |= RHASHED; 2809 mutex_exit(&rp->r_statelock); 2810 } 2811 2812 /* 2813 * Remove a rnode from the hash table. 2814 * 2815 * The caller must be holding the hash queue lock. 2816 */ 2817 static void 2818 rp_rmhash_locked(rnode_t *rp) 2819 { 2820 2821 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2822 ASSERT(rp->r_flags & RHASHED); 2823 2824 rp->r_hashb->r_hashf = rp->r_hashf; 2825 rp->r_hashf->r_hashb = rp->r_hashb; 2826 2827 mutex_enter(&rp->r_statelock); 2828 rp->r_flags &= ~RHASHED; 2829 mutex_exit(&rp->r_statelock); 2830 } 2831 2832 /* 2833 * Remove a rnode from the hash table. 2834 * 2835 * The caller must not be holding the hash queue lock. 2836 */ 2837 void 2838 rp_rmhash(rnode_t *rp) 2839 { 2840 2841 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2842 rp_rmhash_locked(rp); 2843 rw_exit(&rp->r_hashq->r_lock); 2844 } 2845 2846 /* 2847 * Lookup a rnode by fhandle. 2848 * 2849 * The caller must be holding the hash queue lock, either shared or exclusive. 2850 */ 2851 static rnode_t * 2852 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2853 { 2854 rnode_t *rp; 2855 vnode_t *vp; 2856 2857 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2858 2859 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2860 vp = RTOV(rp); 2861 if (vp->v_vfsp == vfsp && 2862 rp->r_fh.fh_len == fh->fh_len && 2863 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2864 /* 2865 * remove rnode from free list, if necessary. 2866 */ 2867 if (rp->r_freef != NULL) { 2868 mutex_enter(&rpfreelist_lock); 2869 /* 2870 * If the rnode is on the freelist, 2871 * then remove it and use that reference 2872 * as the new reference. Otherwise, 2873 * need to increment the reference count. 
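				 * The r_freef test is repeated here because
				 * another thread may have taken the rnode off
				 * the freelist between the unlocked check
				 * above and the mutex_enter().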
2874 */ 2875 if (rp->r_freef != NULL) { 2876 rp_rmfree(rp); 2877 mutex_exit(&rpfreelist_lock); 2878 } else { 2879 mutex_exit(&rpfreelist_lock); 2880 VN_HOLD(vp); 2881 } 2882 } else 2883 VN_HOLD(vp); 2884 return (rp); 2885 } 2886 } 2887 return (NULL); 2888 } 2889 2890 /* 2891 * Return 1 if there is a active vnode belonging to this vfs in the 2892 * rtable cache. 2893 * 2894 * Several of these checks are done without holding the usual 2895 * locks. This is safe because destroy_rtable(), rp_addfree(), 2896 * etc. will redo the necessary checks before actually destroying 2897 * any rnodes. 2898 */ 2899 int 2900 check_rtable(struct vfs *vfsp) 2901 { 2902 int index; 2903 rnode_t *rp; 2904 vnode_t *vp; 2905 2906 for (index = 0; index < rtablesize; index++) { 2907 rw_enter(&rtable[index].r_lock, RW_READER); 2908 for (rp = rtable[index].r_hashf; 2909 rp != (rnode_t *)(&rtable[index]); 2910 rp = rp->r_hashf) { 2911 vp = RTOV(rp); 2912 if (vp->v_vfsp == vfsp) { 2913 if (rp->r_freef == NULL || 2914 (vn_has_cached_data(vp) && 2915 (rp->r_flags & RDIRTY)) || 2916 rp->r_count > 0) { 2917 rw_exit(&rtable[index].r_lock); 2918 return (1); 2919 } 2920 } 2921 } 2922 rw_exit(&rtable[index].r_lock); 2923 } 2924 return (0); 2925 } 2926 2927 /* 2928 * Destroy inactive vnodes from the hash queues which belong to this 2929 * vfs. It is essential that we destroy all inactive vnodes during a 2930 * forced unmount as well as during a normal unmount. 2931 */ 2932 void 2933 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2934 { 2935 int index; 2936 rnode_t *rp; 2937 rnode_t *rlist; 2938 rnode_t *r_hashf; 2939 vnode_t *vp; 2940 2941 rlist = NULL; 2942 2943 for (index = 0; index < rtablesize; index++) { 2944 rw_enter(&rtable[index].r_lock, RW_WRITER); 2945 for (rp = rtable[index].r_hashf; 2946 rp != (rnode_t *)(&rtable[index]); 2947 rp = r_hashf) { 2948 /* save the hash pointer before destroying */ 2949 r_hashf = rp->r_hashf; 2950 vp = RTOV(rp); 2951 if (vp->v_vfsp == vfsp) { 2952 mutex_enter(&rpfreelist_lock); 2953 if (rp->r_freef != NULL) { 2954 rp_rmfree(rp); 2955 mutex_exit(&rpfreelist_lock); 2956 rp_rmhash_locked(rp); 2957 rp->r_hashf = rlist; 2958 rlist = rp; 2959 } else 2960 mutex_exit(&rpfreelist_lock); 2961 } 2962 } 2963 rw_exit(&rtable[index].r_lock); 2964 } 2965 2966 for (rp = rlist; rp != NULL; rp = rlist) { 2967 rlist = rp->r_hashf; 2968 /* 2969 * This call to rp_addfree will end up destroying the 2970 * rnode, but in a safe way with the appropriate set 2971 * of checks done. 2972 */ 2973 rp_addfree(rp, cr); 2974 } 2975 2976 } 2977 2978 /* 2979 * This routine destroys all the resources associated with the rnode 2980 * and then the rnode itself. 
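 *
 * The caller must hold the last vnode reference; the rnode must not be
 * hashed, must not be on the freelist, and must have no active users
 * (see the ASSERTs below).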
2981 */ 2982 static void 2983 destroy_rnode(rnode_t *rp) 2984 { 2985 vnode_t *vp; 2986 vfs_t *vfsp; 2987 2988 vp = RTOV(rp); 2989 vfsp = vp->v_vfsp; 2990 2991 ASSERT(vp->v_count == 1); 2992 ASSERT(rp->r_count == 0); 2993 ASSERT(rp->r_lmpl == NULL); 2994 ASSERT(rp->r_mapcnt == 0); 2995 ASSERT(!(rp->r_flags & RHASHED)); 2996 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2997 atomic_add_long((ulong_t *)&rnew, -1); 2998 #ifdef DEBUG 2999 clstat_debug.nrnode.value.ui64--; 3000 #endif 3001 nfs_rw_destroy(&rp->r_rwlock); 3002 nfs_rw_destroy(&rp->r_lkserlock); 3003 mutex_destroy(&rp->r_statelock); 3004 cv_destroy(&rp->r_cv); 3005 cv_destroy(&rp->r_commit.c_cv); 3006 if (rp->r_flags & RDELMAPLIST) 3007 list_destroy(&rp->r_indelmap); 3008 nfs_free_r_path(rp); 3009 avl_destroy(&rp->r_dir); 3010 vn_invalid(vp); 3011 vn_free(vp); 3012 kmem_cache_free(rnode_cache, rp); 3013 VFS_RELE(vfsp); 3014 } 3015 3016 /* 3017 * Flush all vnodes in this (or every) vfs. 3018 * Used by nfs_sync and by nfs_unmount. 3019 */ 3020 void 3021 rflush(struct vfs *vfsp, cred_t *cr) 3022 { 3023 int index; 3024 rnode_t *rp; 3025 vnode_t *vp, **vplist; 3026 long num, cnt; 3027 3028 /* 3029 * Check to see whether there is anything to do. 3030 */ 3031 num = rnew; 3032 if (num == 0) 3033 return; 3034 3035 /* 3036 * Allocate a slot for all currently active rnodes on the 3037 * supposition that they all may need flushing. 3038 */ 3039 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3040 cnt = 0; 3041 3042 /* 3043 * Walk the hash queues looking for rnodes with page 3044 * lists associated with them. Make a list of these 3045 * files. 3046 */ 3047 for (index = 0; index < rtablesize; index++) { 3048 rw_enter(&rtable[index].r_lock, RW_READER); 3049 for (rp = rtable[index].r_hashf; 3050 rp != (rnode_t *)(&rtable[index]); 3051 rp = rp->r_hashf) { 3052 vp = RTOV(rp); 3053 /* 3054 * Don't bother sync'ing a vp if it 3055 * is part of virtual swap device or 3056 * if VFS is read-only 3057 */ 3058 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3059 continue; 3060 /* 3061 * If flushing all mounted file systems or 3062 * the vnode belongs to this vfs, has pages 3063 * and is marked as either dirty or mmap'd, 3064 * hold and add this vnode to the list of 3065 * vnodes to flush. 3066 */ 3067 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3068 vn_has_cached_data(vp) && 3069 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3070 VN_HOLD(vp); 3071 vplist[cnt++] = vp; 3072 if (cnt == num) { 3073 rw_exit(&rtable[index].r_lock); 3074 goto toomany; 3075 } 3076 } 3077 } 3078 rw_exit(&rtable[index].r_lock); 3079 } 3080 toomany: 3081 3082 /* 3083 * Flush and release all of the files on the list. 3084 */ 3085 while (cnt-- > 0) { 3086 vp = vplist[cnt]; 3087 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3088 VN_RELE(vp); 3089 } 3090 3091 /* 3092 * Free the space allocated to hold the list. 3093 */ 3094 kmem_free(vplist, num * sizeof (*vplist)); 3095 } 3096 3097 /* 3098 * This probably needs to be larger than or equal to 3099 * log2(sizeof (struct rnode)) due to the way that rnodes are 3100 * allocated. 
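 * The shift is intended to drop the low-order address bits, which carry
 * little information because rnodes are allocated from a kmem cache of
 * fixed-size objects; the remaining bits distinguish individual rnodes
 * for the access cache hash.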
3101 */ 3102 #define ACACHE_SHIFT_BITS 9 3103 3104 static int 3105 acachehash(rnode_t *rp, cred_t *cr) 3106 { 3107 3108 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3109 acachemask); 3110 } 3111 3112 #ifdef DEBUG 3113 static long nfs_access_cache_hits = 0; 3114 static long nfs_access_cache_misses = 0; 3115 #endif 3116 3117 nfs_access_type_t 3118 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3119 { 3120 vnode_t *vp; 3121 acache_t *ap; 3122 acache_hash_t *hp; 3123 nfs_access_type_t all; 3124 3125 vp = RTOV(rp); 3126 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3127 return (NFS_ACCESS_UNKNOWN); 3128 3129 if (rp->r_acache != NULL) { 3130 hp = &acache[acachehash(rp, cr)]; 3131 rw_enter(&hp->lock, RW_READER); 3132 ap = hp->next; 3133 while (ap != (acache_t *)hp) { 3134 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3135 if ((ap->known & acc) == acc) { 3136 #ifdef DEBUG 3137 nfs_access_cache_hits++; 3138 #endif 3139 if ((ap->allowed & acc) == acc) 3140 all = NFS_ACCESS_ALLOWED; 3141 else 3142 all = NFS_ACCESS_DENIED; 3143 } else { 3144 #ifdef DEBUG 3145 nfs_access_cache_misses++; 3146 #endif 3147 all = NFS_ACCESS_UNKNOWN; 3148 } 3149 rw_exit(&hp->lock); 3150 return (all); 3151 } 3152 ap = ap->next; 3153 } 3154 rw_exit(&hp->lock); 3155 } 3156 3157 #ifdef DEBUG 3158 nfs_access_cache_misses++; 3159 #endif 3160 return (NFS_ACCESS_UNKNOWN); 3161 } 3162 3163 void 3164 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3165 { 3166 acache_t *ap; 3167 acache_t *nap; 3168 acache_hash_t *hp; 3169 3170 hp = &acache[acachehash(rp, cr)]; 3171 3172 /* 3173 * Allocate now assuming that mostly an allocation will be 3174 * required. This allows the allocation to happen without 3175 * holding the hash bucket locked. 3176 */ 3177 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3178 if (nap != NULL) { 3179 nap->known = acc; 3180 nap->allowed = resacc; 3181 nap->rnode = rp; 3182 crhold(cr); 3183 nap->cred = cr; 3184 nap->hashq = hp; 3185 } 3186 3187 rw_enter(&hp->lock, RW_WRITER); 3188 3189 if (rp->r_acache != NULL) { 3190 ap = hp->next; 3191 while (ap != (acache_t *)hp) { 3192 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3193 ap->known |= acc; 3194 ap->allowed &= ~acc; 3195 ap->allowed |= resacc; 3196 rw_exit(&hp->lock); 3197 if (nap != NULL) { 3198 crfree(nap->cred); 3199 kmem_cache_free(acache_cache, nap); 3200 } 3201 return; 3202 } 3203 ap = ap->next; 3204 } 3205 } 3206 3207 if (nap != NULL) { 3208 #ifdef DEBUG 3209 clstat_debug.access.value.ui64++; 3210 #endif 3211 nap->next = hp->next; 3212 hp->next = nap; 3213 nap->next->prev = nap; 3214 nap->prev = (acache_t *)hp; 3215 3216 mutex_enter(&rp->r_statelock); 3217 nap->list = rp->r_acache; 3218 rp->r_acache = nap; 3219 mutex_exit(&rp->r_statelock); 3220 } 3221 3222 rw_exit(&hp->lock); 3223 } 3224 3225 int 3226 nfs_access_purge_rp(rnode_t *rp) 3227 { 3228 acache_t *ap; 3229 acache_t *tmpap; 3230 acache_t *rplist; 3231 3232 /* 3233 * If there aren't any cached entries, then there is nothing 3234 * to free. 3235 */ 3236 if (rp->r_acache == NULL) 3237 return (0); 3238 3239 mutex_enter(&rp->r_statelock); 3240 rplist = rp->r_acache; 3241 rp->r_acache = NULL; 3242 mutex_exit(&rp->r_statelock); 3243 3244 /* 3245 * Loop through each entry in the list pointed to in the 3246 * rnode. Remove each of these entries from the hash 3247 * queue that it is on and remove it from the list in 3248 * the rnode. 
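	 * rp->r_acache was cleared under r_statelock above, so these
	 * entries can no longer be found through the rnode; each one only
	 * needs its hash bucket lock while it is unlinked and freed.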
3249 */ 3250 for (ap = rplist; ap != NULL; ap = tmpap) { 3251 rw_enter(&ap->hashq->lock, RW_WRITER); 3252 ap->prev->next = ap->next; 3253 ap->next->prev = ap->prev; 3254 rw_exit(&ap->hashq->lock); 3255 3256 tmpap = ap->list; 3257 crfree(ap->cred); 3258 kmem_cache_free(acache_cache, ap); 3259 #ifdef DEBUG 3260 clstat_debug.access.value.ui64--; 3261 #endif 3262 } 3263 3264 return (1); 3265 } 3266 3267 static const char prefix[] = ".nfs"; 3268 3269 static kmutex_t newnum_lock; 3270 3271 int 3272 newnum(void) 3273 { 3274 static uint_t newnum = 0; 3275 uint_t id; 3276 3277 mutex_enter(&newnum_lock); 3278 if (newnum == 0) 3279 newnum = gethrestime_sec() & 0xffff; 3280 id = newnum++; 3281 mutex_exit(&newnum_lock); 3282 return (id); 3283 } 3284 3285 char * 3286 newname(void) 3287 { 3288 char *news; 3289 char *s; 3290 const char *p; 3291 uint_t id; 3292 3293 id = newnum(); 3294 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3295 s = news; 3296 p = prefix; 3297 while (*p != '\0') 3298 *s++ = *p++; 3299 while (id != 0) { 3300 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3301 id >>= 4; 3302 } 3303 *s = '\0'; 3304 return (news); 3305 } 3306 3307 int 3308 nfs_atoi(char *cp) 3309 { 3310 int n; 3311 3312 n = 0; 3313 while (*cp != '\0') { 3314 n = n * 10 + (*cp - '0'); 3315 cp++; 3316 } 3317 3318 return (n); 3319 } 3320 3321 /* 3322 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3323 * framework. 3324 */ 3325 static int 3326 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3327 { 3328 ksp->ks_snaptime = gethrtime(); 3329 if (rw == KSTAT_WRITE) { 3330 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3331 #ifdef DEBUG 3332 /* 3333 * Currently only the global zone can write to kstats, but we 3334 * add the check just for paranoia. 3335 */ 3336 if (INGLOBALZONE(curproc)) 3337 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3338 sizeof (clstat_debug)); 3339 #endif 3340 } else { 3341 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3342 #ifdef DEBUG 3343 /* 3344 * If we're displaying the "global" debug kstat values, we 3345 * display them as-is to all zones since in fact they apply to 3346 * the system as a whole. 
3347 */ 3348 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3349 sizeof (clstat_debug)); 3350 #endif 3351 } 3352 return (0); 3353 } 3354 3355 static void * 3356 clinit_zone(zoneid_t zoneid) 3357 { 3358 kstat_t *nfs_client_kstat; 3359 struct nfs_clnt *nfscl; 3360 uint_t ndata; 3361 3362 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3363 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3364 nfscl->nfscl_chtable = NULL; 3365 nfscl->nfscl_zoneid = zoneid; 3366 3367 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3368 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3369 #ifdef DEBUG 3370 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3371 #endif 3372 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3373 "misc", KSTAT_TYPE_NAMED, ndata, 3374 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3375 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3376 nfs_client_kstat->ks_snapshot = cl_snapshot; 3377 kstat_install(nfs_client_kstat); 3378 } 3379 mutex_enter(&nfs_clnt_list_lock); 3380 list_insert_head(&nfs_clnt_list, nfscl); 3381 mutex_exit(&nfs_clnt_list_lock); 3382 return (nfscl); 3383 } 3384 3385 /*ARGSUSED*/ 3386 static void 3387 clfini_zone(zoneid_t zoneid, void *arg) 3388 { 3389 struct nfs_clnt *nfscl = arg; 3390 chhead_t *chp, *next; 3391 3392 if (nfscl == NULL) 3393 return; 3394 mutex_enter(&nfs_clnt_list_lock); 3395 list_remove(&nfs_clnt_list, nfscl); 3396 mutex_exit(&nfs_clnt_list_lock); 3397 clreclaim_zone(nfscl, 0); 3398 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3399 ASSERT(chp->ch_list == NULL); 3400 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3401 next = chp->ch_next; 3402 kmem_free(chp, sizeof (*chp)); 3403 } 3404 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3405 mutex_destroy(&nfscl->nfscl_chtable_lock); 3406 kmem_free(nfscl, sizeof (*nfscl)); 3407 } 3408 3409 /* 3410 * Called by endpnt_destructor to make sure the client handles are 3411 * cleaned up before the RPC endpoints. This becomes a no-op if 3412 * clfini_zone (above) is called first. This function is needed 3413 * (rather than relying on clfini_zone to clean up) because the ZSD 3414 * callbacks have no ordering mechanism, so we have no way to ensure 3415 * that clfini_zone is called before endpnt_destructor. 
3416 */ 3417 void 3418 clcleanup_zone(zoneid_t zoneid) 3419 { 3420 struct nfs_clnt *nfscl; 3421 3422 mutex_enter(&nfs_clnt_list_lock); 3423 nfscl = list_head(&nfs_clnt_list); 3424 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3425 if (nfscl->nfscl_zoneid == zoneid) { 3426 clreclaim_zone(nfscl, 0); 3427 break; 3428 } 3429 } 3430 mutex_exit(&nfs_clnt_list_lock); 3431 } 3432 3433 int 3434 nfs_subrinit(void) 3435 { 3436 int i; 3437 ulong_t nrnode_max; 3438 3439 /* 3440 * Allocate and initialize the rnode hash queues 3441 */ 3442 if (nrnode <= 0) 3443 nrnode = ncsize; 3444 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3445 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3446 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3447 "setting nrnode to max value of %ld", nrnode_max); 3448 nrnode = nrnode_max; 3449 } 3450 3451 rtablesize = 1 << highbit(nrnode / hashlen); 3452 rtablemask = rtablesize - 1; 3453 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3454 for (i = 0; i < rtablesize; i++) { 3455 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3456 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3457 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3458 } 3459 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3460 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3461 3462 /* 3463 * Allocate and initialize the access cache 3464 */ 3465 3466 /* 3467 * Initial guess is one access cache entry per rnode unless 3468 * nacache is set to a non-zero value and then it is used to 3469 * indicate a guess at the number of access cache entries. 3470 */ 3471 if (nacache > 0) 3472 acachesize = 1 << highbit(nacache / hashlen); 3473 else 3474 acachesize = rtablesize; 3475 acachemask = acachesize - 1; 3476 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3477 for (i = 0; i < acachesize; i++) { 3478 acache[i].next = (acache_t *)&acache[i]; 3479 acache[i].prev = (acache_t *)&acache[i]; 3480 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3481 } 3482 acache_cache = kmem_cache_create("nfs_access_cache", 3483 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3484 /* 3485 * Allocate and initialize the client handle cache 3486 */ 3487 chtab_cache = kmem_cache_create("client_handle_cache", 3488 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3489 /* 3490 * Initialize the list of per-zone client handles (and associated data). 3491 * This needs to be done before we call zone_key_create(). 3492 */ 3493 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3494 offsetof(struct nfs_clnt, nfscl_node)); 3495 /* 3496 * Initialize the zone_key for per-zone client handle lists. 
3497 */ 3498 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3499 /* 3500 * Initialize the various mutexes and reader/writer locks 3501 */ 3502 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3503 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3504 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3505 3506 /* 3507 * Assign unique major number for all nfs mounts 3508 */ 3509 if ((nfs_major = getudev()) == -1) { 3510 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3511 "nfs: init: can't get unique device number"); 3512 nfs_major = 0; 3513 } 3514 nfs_minor = 0; 3515 3516 if (nfs3_jukebox_delay == 0) 3517 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3518 3519 return (0); 3520 } 3521 3522 void 3523 nfs_subrfini(void) 3524 { 3525 int i; 3526 3527 /* 3528 * Deallocate the rnode hash queues 3529 */ 3530 kmem_cache_destroy(rnode_cache); 3531 3532 for (i = 0; i < rtablesize; i++) 3533 rw_destroy(&rtable[i].r_lock); 3534 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3535 3536 /* 3537 * Deallocated the access cache 3538 */ 3539 kmem_cache_destroy(acache_cache); 3540 3541 for (i = 0; i < acachesize; i++) 3542 rw_destroy(&acache[i].lock); 3543 kmem_free(acache, acachesize * sizeof (*acache)); 3544 3545 /* 3546 * Deallocate the client handle cache 3547 */ 3548 kmem_cache_destroy(chtab_cache); 3549 3550 /* 3551 * Destroy the various mutexes and reader/writer locks 3552 */ 3553 mutex_destroy(&rpfreelist_lock); 3554 mutex_destroy(&newnum_lock); 3555 mutex_destroy(&nfs_minor_lock); 3556 (void) zone_key_delete(nfsclnt_zone_key); 3557 } 3558 3559 enum nfsstat 3560 puterrno(int error) 3561 { 3562 3563 switch (error) { 3564 case EOPNOTSUPP: 3565 return (NFSERR_OPNOTSUPP); 3566 case ENAMETOOLONG: 3567 return (NFSERR_NAMETOOLONG); 3568 case ENOTEMPTY: 3569 return (NFSERR_NOTEMPTY); 3570 case EDQUOT: 3571 return (NFSERR_DQUOT); 3572 case ESTALE: 3573 return (NFSERR_STALE); 3574 case EREMOTE: 3575 return (NFSERR_REMOTE); 3576 case ENOSYS: 3577 return (NFSERR_OPNOTSUPP); 3578 case EOVERFLOW: 3579 return (NFSERR_INVAL); 3580 default: 3581 return ((enum nfsstat)error); 3582 } 3583 /* NOTREACHED */ 3584 } 3585 3586 int 3587 geterrno(enum nfsstat status) 3588 { 3589 3590 switch (status) { 3591 case NFSERR_OPNOTSUPP: 3592 return (EOPNOTSUPP); 3593 case NFSERR_NAMETOOLONG: 3594 return (ENAMETOOLONG); 3595 case NFSERR_NOTEMPTY: 3596 return (ENOTEMPTY); 3597 case NFSERR_DQUOT: 3598 return (EDQUOT); 3599 case NFSERR_STALE: 3600 return (ESTALE); 3601 case NFSERR_REMOTE: 3602 return (EREMOTE); 3603 case NFSERR_WFLUSH: 3604 return (EIO); 3605 default: 3606 return ((int)status); 3607 } 3608 /* NOTREACHED */ 3609 } 3610 3611 enum nfsstat3 3612 puterrno3(int error) 3613 { 3614 3615 #ifdef DEBUG 3616 switch (error) { 3617 case 0: 3618 return (NFS3_OK); 3619 case EPERM: 3620 return (NFS3ERR_PERM); 3621 case ENOENT: 3622 return (NFS3ERR_NOENT); 3623 case EIO: 3624 return (NFS3ERR_IO); 3625 case ENXIO: 3626 return (NFS3ERR_NXIO); 3627 case EACCES: 3628 return (NFS3ERR_ACCES); 3629 case EEXIST: 3630 return (NFS3ERR_EXIST); 3631 case EXDEV: 3632 return (NFS3ERR_XDEV); 3633 case ENODEV: 3634 return (NFS3ERR_NODEV); 3635 case ENOTDIR: 3636 return (NFS3ERR_NOTDIR); 3637 case EISDIR: 3638 return (NFS3ERR_ISDIR); 3639 case EINVAL: 3640 return (NFS3ERR_INVAL); 3641 case EFBIG: 3642 return (NFS3ERR_FBIG); 3643 case ENOSPC: 3644 return (NFS3ERR_NOSPC); 3645 case EROFS: 3646 return (NFS3ERR_ROFS); 3647 case EMLINK: 3648 return (NFS3ERR_MLINK); 3649 case ENAMETOOLONG: 3650 return (NFS3ERR_NAMETOOLONG); 3651 case 
ENOTEMPTY: 3652 return (NFS3ERR_NOTEMPTY); 3653 case EDQUOT: 3654 return (NFS3ERR_DQUOT); 3655 case ESTALE: 3656 return (NFS3ERR_STALE); 3657 case EREMOTE: 3658 return (NFS3ERR_REMOTE); 3659 case ENOSYS: 3660 case EOPNOTSUPP: 3661 return (NFS3ERR_NOTSUPP); 3662 case EOVERFLOW: 3663 return (NFS3ERR_INVAL); 3664 default: 3665 zcmn_err(getzoneid(), CE_WARN, 3666 "puterrno3: got error %d", error); 3667 return ((enum nfsstat3)error); 3668 } 3669 #else 3670 switch (error) { 3671 case ENAMETOOLONG: 3672 return (NFS3ERR_NAMETOOLONG); 3673 case ENOTEMPTY: 3674 return (NFS3ERR_NOTEMPTY); 3675 case EDQUOT: 3676 return (NFS3ERR_DQUOT); 3677 case ESTALE: 3678 return (NFS3ERR_STALE); 3679 case ENOSYS: 3680 case EOPNOTSUPP: 3681 return (NFS3ERR_NOTSUPP); 3682 case EREMOTE: 3683 return (NFS3ERR_REMOTE); 3684 case EOVERFLOW: 3685 return (NFS3ERR_INVAL); 3686 default: 3687 return ((enum nfsstat3)error); 3688 } 3689 #endif 3690 } 3691 3692 int 3693 geterrno3(enum nfsstat3 status) 3694 { 3695 3696 #ifdef DEBUG 3697 switch (status) { 3698 case NFS3_OK: 3699 return (0); 3700 case NFS3ERR_PERM: 3701 return (EPERM); 3702 case NFS3ERR_NOENT: 3703 return (ENOENT); 3704 case NFS3ERR_IO: 3705 return (EIO); 3706 case NFS3ERR_NXIO: 3707 return (ENXIO); 3708 case NFS3ERR_ACCES: 3709 return (EACCES); 3710 case NFS3ERR_EXIST: 3711 return (EEXIST); 3712 case NFS3ERR_XDEV: 3713 return (EXDEV); 3714 case NFS3ERR_NODEV: 3715 return (ENODEV); 3716 case NFS3ERR_NOTDIR: 3717 return (ENOTDIR); 3718 case NFS3ERR_ISDIR: 3719 return (EISDIR); 3720 case NFS3ERR_INVAL: 3721 return (EINVAL); 3722 case NFS3ERR_FBIG: 3723 return (EFBIG); 3724 case NFS3ERR_NOSPC: 3725 return (ENOSPC); 3726 case NFS3ERR_ROFS: 3727 return (EROFS); 3728 case NFS3ERR_MLINK: 3729 return (EMLINK); 3730 case NFS3ERR_NAMETOOLONG: 3731 return (ENAMETOOLONG); 3732 case NFS3ERR_NOTEMPTY: 3733 return (ENOTEMPTY); 3734 case NFS3ERR_DQUOT: 3735 return (EDQUOT); 3736 case NFS3ERR_STALE: 3737 return (ESTALE); 3738 case NFS3ERR_REMOTE: 3739 return (EREMOTE); 3740 case NFS3ERR_BADHANDLE: 3741 return (ESTALE); 3742 case NFS3ERR_NOT_SYNC: 3743 return (EINVAL); 3744 case NFS3ERR_BAD_COOKIE: 3745 return (ENOENT); 3746 case NFS3ERR_NOTSUPP: 3747 return (EOPNOTSUPP); 3748 case NFS3ERR_TOOSMALL: 3749 return (EINVAL); 3750 case NFS3ERR_SERVERFAULT: 3751 return (EIO); 3752 case NFS3ERR_BADTYPE: 3753 return (EINVAL); 3754 case NFS3ERR_JUKEBOX: 3755 return (ENXIO); 3756 default: 3757 zcmn_err(getzoneid(), CE_WARN, 3758 "geterrno3: got status %d", status); 3759 return ((int)status); 3760 } 3761 #else 3762 switch (status) { 3763 case NFS3ERR_NAMETOOLONG: 3764 return (ENAMETOOLONG); 3765 case NFS3ERR_NOTEMPTY: 3766 return (ENOTEMPTY); 3767 case NFS3ERR_DQUOT: 3768 return (EDQUOT); 3769 case NFS3ERR_STALE: 3770 case NFS3ERR_BADHANDLE: 3771 return (ESTALE); 3772 case NFS3ERR_NOTSUPP: 3773 return (EOPNOTSUPP); 3774 case NFS3ERR_REMOTE: 3775 return (EREMOTE); 3776 case NFS3ERR_NOT_SYNC: 3777 case NFS3ERR_TOOSMALL: 3778 case NFS3ERR_BADTYPE: 3779 return (EINVAL); 3780 case NFS3ERR_BAD_COOKIE: 3781 return (ENOENT); 3782 case NFS3ERR_SERVERFAULT: 3783 return (EIO); 3784 case NFS3ERR_JUKEBOX: 3785 return (ENXIO); 3786 default: 3787 return ((int)status); 3788 } 3789 #endif 3790 } 3791 3792 rddir_cache * 3793 rddir_cache_alloc(int flags) 3794 { 3795 rddir_cache *rc; 3796 3797 rc = kmem_alloc(sizeof (*rc), flags); 3798 if (rc != NULL) { 3799 rc->entries = NULL; 3800 rc->flags = RDDIR; 3801 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3802 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3803 
rc->count = 1; 3804 #ifdef DEBUG 3805 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3806 #endif 3807 } 3808 return (rc); 3809 } 3810 3811 static void 3812 rddir_cache_free(rddir_cache *rc) 3813 { 3814 3815 #ifdef DEBUG 3816 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3817 #endif 3818 if (rc->entries != NULL) { 3819 #ifdef DEBUG 3820 rddir_cache_buf_free(rc->entries, rc->buflen); 3821 #else 3822 kmem_free(rc->entries, rc->buflen); 3823 #endif 3824 } 3825 cv_destroy(&rc->cv); 3826 mutex_destroy(&rc->lock); 3827 kmem_free(rc, sizeof (*rc)); 3828 } 3829 3830 void 3831 rddir_cache_hold(rddir_cache *rc) 3832 { 3833 3834 mutex_enter(&rc->lock); 3835 rc->count++; 3836 mutex_exit(&rc->lock); 3837 } 3838 3839 void 3840 rddir_cache_rele(rddir_cache *rc) 3841 { 3842 3843 mutex_enter(&rc->lock); 3844 ASSERT(rc->count > 0); 3845 if (--rc->count == 0) { 3846 mutex_exit(&rc->lock); 3847 rddir_cache_free(rc); 3848 } else 3849 mutex_exit(&rc->lock); 3850 } 3851 3852 #ifdef DEBUG 3853 char * 3854 rddir_cache_buf_alloc(size_t size, int flags) 3855 { 3856 char *rc; 3857 3858 rc = kmem_alloc(size, flags); 3859 if (rc != NULL) 3860 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3861 return (rc); 3862 } 3863 3864 void 3865 rddir_cache_buf_free(void *addr, size_t size) 3866 { 3867 3868 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3869 kmem_free(addr, size); 3870 } 3871 #endif 3872 3873 static int 3874 nfs_free_data_reclaim(rnode_t *rp) 3875 { 3876 char *contents; 3877 int size; 3878 vsecattr_t *vsp; 3879 nfs3_pathconf_info *info; 3880 int freed; 3881 cred_t *cred; 3882 3883 /* 3884 * Free any held credentials and caches which 3885 * may be associated with this rnode. 3886 */ 3887 mutex_enter(&rp->r_statelock); 3888 cred = rp->r_cred; 3889 rp->r_cred = NULL; 3890 contents = rp->r_symlink.contents; 3891 size = rp->r_symlink.size; 3892 rp->r_symlink.contents = NULL; 3893 vsp = rp->r_secattr; 3894 rp->r_secattr = NULL; 3895 info = rp->r_pathconf; 3896 rp->r_pathconf = NULL; 3897 mutex_exit(&rp->r_statelock); 3898 3899 if (cred != NULL) 3900 crfree(cred); 3901 3902 /* 3903 * Free the access cache entries. 3904 */ 3905 freed = nfs_access_purge_rp(rp); 3906 3907 if (!HAVE_RDDIR_CACHE(rp) && 3908 contents == NULL && 3909 vsp == NULL && 3910 info == NULL) 3911 return (freed); 3912 3913 /* 3914 * Free the readdir cache entries 3915 */ 3916 if (HAVE_RDDIR_CACHE(rp)) 3917 nfs_purge_rddir_cache(RTOV(rp)); 3918 3919 /* 3920 * Free the symbolic link cache. 3921 */ 3922 if (contents != NULL) { 3923 3924 kmem_free((void *)contents, size); 3925 } 3926 3927 /* 3928 * Free any cached ACL. 3929 */ 3930 if (vsp != NULL) 3931 nfs_acl_free(vsp); 3932 3933 /* 3934 * Free any cached pathconf information. 3935 */ 3936 if (info != NULL) 3937 kmem_free(info, sizeof (*info)); 3938 3939 return (1); 3940 } 3941 3942 static int 3943 nfs_active_data_reclaim(rnode_t *rp) 3944 { 3945 char *contents; 3946 int size; 3947 vsecattr_t *vsp; 3948 nfs3_pathconf_info *info; 3949 int freed; 3950 3951 /* 3952 * Free any held credentials and caches which 3953 * may be associated with this rnode. 3954 */ 3955 if (!mutex_tryenter(&rp->r_statelock)) 3956 return (0); 3957 contents = rp->r_symlink.contents; 3958 size = rp->r_symlink.size; 3959 rp->r_symlink.contents = NULL; 3960 vsp = rp->r_secattr; 3961 rp->r_secattr = NULL; 3962 info = rp->r_pathconf; 3963 rp->r_pathconf = NULL; 3964 mutex_exit(&rp->r_statelock); 3965 3966 /* 3967 * Free the access cache entries. 
3968 */ 3969 freed = nfs_access_purge_rp(rp); 3970 3971 if (!HAVE_RDDIR_CACHE(rp) && 3972 contents == NULL && 3973 vsp == NULL && 3974 info == NULL) 3975 return (freed); 3976 3977 /* 3978 * Free the readdir cache entries 3979 */ 3980 if (HAVE_RDDIR_CACHE(rp)) 3981 nfs_purge_rddir_cache(RTOV(rp)); 3982 3983 /* 3984 * Free the symbolic link cache. 3985 */ 3986 if (contents != NULL) { 3987 3988 kmem_free((void *)contents, size); 3989 } 3990 3991 /* 3992 * Free any cached ACL. 3993 */ 3994 if (vsp != NULL) 3995 nfs_acl_free(vsp); 3996 3997 /* 3998 * Free any cached pathconf information. 3999 */ 4000 if (info != NULL) 4001 kmem_free(info, sizeof (*info)); 4002 4003 return (1); 4004 } 4005 4006 static int 4007 nfs_free_reclaim(void) 4008 { 4009 int freed; 4010 rnode_t *rp; 4011 4012 #ifdef DEBUG 4013 clstat_debug.f_reclaim.value.ui64++; 4014 #endif 4015 freed = 0; 4016 mutex_enter(&rpfreelist_lock); 4017 rp = rpfreelist; 4018 if (rp != NULL) { 4019 do { 4020 if (nfs_free_data_reclaim(rp)) 4021 freed = 1; 4022 } while ((rp = rp->r_freef) != rpfreelist); 4023 } 4024 mutex_exit(&rpfreelist_lock); 4025 return (freed); 4026 } 4027 4028 static int 4029 nfs_active_reclaim(void) 4030 { 4031 int freed; 4032 int index; 4033 rnode_t *rp; 4034 4035 #ifdef DEBUG 4036 clstat_debug.a_reclaim.value.ui64++; 4037 #endif 4038 freed = 0; 4039 for (index = 0; index < rtablesize; index++) { 4040 rw_enter(&rtable[index].r_lock, RW_READER); 4041 for (rp = rtable[index].r_hashf; 4042 rp != (rnode_t *)(&rtable[index]); 4043 rp = rp->r_hashf) { 4044 if (nfs_active_data_reclaim(rp)) 4045 freed = 1; 4046 } 4047 rw_exit(&rtable[index].r_lock); 4048 } 4049 return (freed); 4050 } 4051 4052 static int 4053 nfs_rnode_reclaim(void) 4054 { 4055 int freed; 4056 rnode_t *rp; 4057 vnode_t *vp; 4058 4059 #ifdef DEBUG 4060 clstat_debug.r_reclaim.value.ui64++; 4061 #endif 4062 freed = 0; 4063 mutex_enter(&rpfreelist_lock); 4064 while ((rp = rpfreelist) != NULL) { 4065 rp_rmfree(rp); 4066 mutex_exit(&rpfreelist_lock); 4067 if (rp->r_flags & RHASHED) { 4068 vp = RTOV(rp); 4069 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4070 mutex_enter(&vp->v_lock); 4071 if (vp->v_count > 1) { 4072 vp->v_count--; 4073 mutex_exit(&vp->v_lock); 4074 rw_exit(&rp->r_hashq->r_lock); 4075 mutex_enter(&rpfreelist_lock); 4076 continue; 4077 } 4078 mutex_exit(&vp->v_lock); 4079 rp_rmhash_locked(rp); 4080 rw_exit(&rp->r_hashq->r_lock); 4081 } 4082 /* 4083 * This call to rp_addfree will end up destroying the 4084 * rnode, but in a safe way with the appropriate set 4085 * of checks done. 
4086 */ 4087 rp_addfree(rp, CRED()); 4088 mutex_enter(&rpfreelist_lock); 4089 } 4090 mutex_exit(&rpfreelist_lock); 4091 return (freed); 4092 } 4093 4094 /*ARGSUSED*/ 4095 static void 4096 nfs_reclaim(void *cdrarg) 4097 { 4098 4099 #ifdef DEBUG 4100 clstat_debug.reclaim.value.ui64++; 4101 #endif 4102 if (nfs_free_reclaim()) 4103 return; 4104 4105 if (nfs_active_reclaim()) 4106 return; 4107 4108 (void) nfs_rnode_reclaim(); 4109 } 4110 4111 /* 4112 * NFS client failover support 4113 * 4114 * Routines to copy filehandles 4115 */ 4116 void 4117 nfscopyfh(caddr_t fhp, vnode_t *vp) 4118 { 4119 fhandle_t *dest = (fhandle_t *)fhp; 4120 4121 if (dest != NULL) 4122 *dest = *VTOFH(vp); 4123 } 4124 4125 void 4126 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4127 { 4128 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4129 4130 if (dest != NULL) 4131 *dest = *VTOFH3(vp); 4132 } 4133 4134 /* 4135 * NFS client failover support 4136 * 4137 * failover_safe() will test various conditions to ensure that 4138 * failover is permitted for this vnode. It will be denied 4139 * if: 4140 * 1) the operation in progress does not support failover (NULL fi) 4141 * 2) there are no available replicas (NULL mi_servers->sv_next) 4142 * 3) any locks are outstanding on this file 4143 */ 4144 static int 4145 failover_safe(failinfo_t *fi) 4146 { 4147 4148 /* 4149 * Does this op permit failover? 4150 */ 4151 if (fi == NULL || fi->vp == NULL) 4152 return (0); 4153 4154 /* 4155 * Are there any alternates to failover to? 4156 */ 4157 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4158 return (0); 4159 4160 /* 4161 * Disable check; we've forced local locking 4162 * 4163 * if (flk_has_remote_locks(fi->vp)) 4164 * return (0); 4165 */ 4166 4167 /* 4168 * If we have no partial path, we can't do anything 4169 */ 4170 if (VTOR(fi->vp)->r_path == NULL) 4171 return (0); 4172 4173 return (1); 4174 } 4175 4176 #include <sys/thread.h> 4177 4178 /* 4179 * NFS client failover support 4180 * 4181 * failover_newserver() will start a search for a new server, 4182 * preferably by starting an async thread to do the work. If 4183 * someone is already doing this (recognizable by MI_BINDINPROG 4184 * being set), it will simply return and the calling thread 4185 * will queue on the mi_failover_cv condition variable. 4186 */ 4187 static void 4188 failover_newserver(mntinfo_t *mi) 4189 { 4190 /* 4191 * Check if someone else is doing this already 4192 */ 4193 mutex_enter(&mi->mi_lock); 4194 if (mi->mi_flags & MI_BINDINPROG) { 4195 mutex_exit(&mi->mi_lock); 4196 return; 4197 } 4198 mi->mi_flags |= MI_BINDINPROG; 4199 4200 /* 4201 * Need to hold the vfs struct so that it can't be released 4202 * while the failover thread is selecting a new server. 4203 */ 4204 VFS_HOLD(mi->mi_vfsp); 4205 4206 /* 4207 * Start a thread to do the real searching. 4208 */ 4209 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4210 4211 mutex_exit(&mi->mi_lock); 4212 } 4213 4214 /* 4215 * NFS client failover support 4216 * 4217 * failover_thread() will find a new server to replace the one 4218 * currently in use, wake up other threads waiting on this mount 4219 * point, and die. It will start at the head of the server list 4220 * and poll servers until it finds one with an NFS server which is 4221 * registered and responds to a NULL procedure ping. 4222 * 4223 * XXX failover_thread is unsafe within the scope of the 4224 * present model defined for cpr to suspend the system. 4225 * Specifically, over-the-wire calls made by the thread 4226 * are unsafe. 
The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * It's safe to piggyback on the mi_lock since failover_newserver()
	 * guarantees that there will be only one failover thread
	 * per mountinfo at any instant.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
4291 */ 4292 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4293 svp = NULL; 4294 goto done; 4295 } 4296 4297 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4298 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4299 if (error) 4300 continue; 4301 4302 if (!(mi->mi_flags & MI_INT)) 4303 cl->cl_nosignal = TRUE; 4304 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4305 xdr_void, NULL, tv); 4306 if (!(mi->mi_flags & MI_INT)) 4307 cl->cl_nosignal = FALSE; 4308 AUTH_DESTROY(cl->cl_auth); 4309 CLNT_DESTROY(cl); 4310 if (status == RPC_SUCCESS) { 4311 if (svp == mi->mi_curr_serv) { 4312 #ifdef DEBUG 4313 zcmn_err(zoneid, CE_NOTE, 4314 "NFS%d: failing over: selecting original server %s", 4315 mi->mi_vers, svp->sv_hostname); 4316 #else 4317 zcmn_err(zoneid, CE_NOTE, 4318 "NFS: failing over: selecting original server %s", 4319 svp->sv_hostname); 4320 #endif 4321 } else { 4322 #ifdef DEBUG 4323 zcmn_err(zoneid, CE_NOTE, 4324 "NFS%d: failing over from %s to %s", 4325 mi->mi_vers, 4326 mi->mi_curr_serv->sv_hostname, 4327 svp->sv_hostname); 4328 #else 4329 zcmn_err(zoneid, CE_NOTE, 4330 "NFS: failing over from %s to %s", 4331 mi->mi_curr_serv->sv_hostname, 4332 svp->sv_hostname); 4333 #endif 4334 } 4335 break; 4336 } 4337 } 4338 4339 if (svp == NULL) { 4340 if (!oncethru) { 4341 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4342 #ifdef DEBUG 4343 zprintf(zoneid, 4344 "NFS%d servers %s not responding " 4345 "still trying\n", mi->mi_vers, srvnames); 4346 #else 4347 zprintf(zoneid, "NFS servers %s not responding " 4348 "still trying\n", srvnames); 4349 #endif 4350 oncethru = 1; 4351 } 4352 mutex_enter(&mi->mi_lock); 4353 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4354 mutex_exit(&mi->mi_lock); 4355 delay(hz); 4356 mutex_enter(&mi->mi_lock); 4357 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4358 mutex_exit(&mi->mi_lock); 4359 } 4360 } 4361 4362 if (oncethru) { 4363 #ifdef DEBUG 4364 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4365 #else 4366 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4367 #endif 4368 } 4369 4370 if (svp != mi->mi_curr_serv) { 4371 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4372 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4373 rw_enter(&rtable[index].r_lock, RW_WRITER); 4374 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4375 mi->mi_vfsp); 4376 if (rp != NULL) { 4377 if (rp->r_flags & RHASHED) 4378 rp_rmhash_locked(rp); 4379 rw_exit(&rtable[index].r_lock); 4380 rp->r_server = svp; 4381 rp->r_fh = svp->sv_fhandle; 4382 (void) nfs_free_data_reclaim(rp); 4383 index = rtablehash(&rp->r_fh); 4384 rp->r_hashq = &rtable[index]; 4385 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4386 vn_exists(RTOV(rp)); 4387 rp_addhash(rp); 4388 rw_exit(&rp->r_hashq->r_lock); 4389 VN_RELE(RTOV(rp)); 4390 } else 4391 rw_exit(&rtable[index].r_lock); 4392 } 4393 4394 done: 4395 if (oncethru) 4396 kmem_free(srvnames, srvnames_len); 4397 mutex_enter(&mi->mi_lock); 4398 mi->mi_flags &= ~MI_BINDINPROG; 4399 if (svp != NULL) { 4400 mi->mi_curr_serv = svp; 4401 mi->mi_failover++; 4402 #ifdef DEBUG 4403 nfscl->nfscl_stat.failover.value.ui64++; 4404 #endif 4405 } 4406 cv_broadcast(&mi->mi_failover_cv); 4407 CALLB_CPR_EXIT(&cprinfo); 4408 VFS_RELE(mi->mi_vfsp); 4409 zthread_exit(); 4410 /* NOTREACHED */ 4411 } 4412 4413 /* 4414 * NFS client failover support 4415 * 4416 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4417 * is cleared, meaning that failover is complete. Called with 4418 * mi_lock mutex held. 
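 * Returns 0 once the binding is complete, or EINTR if the wait was
 * interrupted by a signal.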
4419 */ 4420 static int 4421 failover_wait(mntinfo_t *mi) 4422 { 4423 k_sigset_t smask; 4424 4425 /* 4426 * If someone else is hunting for a living server, 4427 * sleep until it's done. After our sleep, we may 4428 * be bound to the right server and get off cheaply. 4429 */ 4430 while (mi->mi_flags & MI_BINDINPROG) { 4431 /* 4432 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4433 * and SIGTERM. (Preserving the existing masks). 4434 * Mask out SIGINT if mount option nointr is specified. 4435 */ 4436 sigintr(&smask, (int)mi->mi_flags & MI_INT); 4437 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4438 /* 4439 * restore original signal mask 4440 */ 4441 sigunintr(&smask); 4442 return (EINTR); 4443 } 4444 /* 4445 * restore original signal mask 4446 */ 4447 sigunintr(&smask); 4448 } 4449 return (0); 4450 } 4451 4452 /* 4453 * NFS client failover support 4454 * 4455 * failover_remap() will do a partial pathname lookup and find the 4456 * desired vnode on the current server. The interim vnode will be 4457 * discarded after we pilfer the new filehandle. 4458 * 4459 * Side effects: 4460 * - This routine will also update the filehandle in the args structure 4461 * pointed to by the fi->fhp pointer if it is non-NULL. 4462 */ 4463 4464 static int 4465 failover_remap(failinfo_t *fi) 4466 { 4467 vnode_t *vp, *nvp, *rootvp; 4468 rnode_t *rp, *nrp; 4469 mntinfo_t *mi; 4470 int error; 4471 #ifdef DEBUG 4472 struct nfs_clnt *nfscl; 4473 4474 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4475 ASSERT(nfscl != NULL); 4476 #endif 4477 /* 4478 * Sanity check 4479 */ 4480 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4481 return (EINVAL); 4482 vp = fi->vp; 4483 rp = VTOR(vp); 4484 mi = VTOMI(vp); 4485 4486 if (!(vp->v_flag & VROOT)) { 4487 /* 4488 * Given the root fh, use the path stored in 4489 * the rnode to find the fh for the new server. 4490 */ 4491 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4492 if (error) 4493 return (error); 4494 4495 error = failover_lookup(rp->r_path, rootvp, 4496 fi->lookupproc, fi->xattrdirproc, &nvp); 4497 4498 VN_RELE(rootvp); 4499 4500 if (error) 4501 return (error); 4502 4503 /* 4504 * If we found the same rnode, we're done now 4505 */ 4506 if (nvp == vp) { 4507 /* 4508 * Failed and the new server may physically be same 4509 * OR may share a same disk subsystem. In this case 4510 * file handle for a particular file path is not going 4511 * to change, given the same filehandle lookup will 4512 * always locate the same rnode as the existing one. 4513 * All we might need to do is to update the r_server 4514 * with the current servinfo. 4515 */ 4516 if (!VALID_FH(fi)) { 4517 rp->r_server = mi->mi_curr_serv; 4518 } 4519 VN_RELE(nvp); 4520 return (0); 4521 } 4522 4523 /* 4524 * Try to make it so that no one else will find this 4525 * vnode because it is just a temporary to hold the 4526 * new file handle until that file handle can be 4527 * copied to the original vnode/rnode. 4528 */ 4529 nrp = VTOR(nvp); 4530 mutex_enter(&mi->mi_remap_lock); 4531 /* 4532 * Some other thread could have raced in here and could 4533 * have done the remap for this particular rnode before 4534 * this thread here. Check for rp->r_server and 4535 * mi->mi_curr_serv and return if they are same. 
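
/*
 * Illustrative sketch (compiled out): the caller-side pattern implied by
 * the comment above -- the caller already holds mi_lock, checks
 * MI_BINDINPROG, and backs off if the wait is interrupted.  The function
 * name below is hypothetical; the real callers are the RPC dispatch
 * paths in this file.
 */
#if 0
static int
example_wait_for_binding(mntinfo_t *mi)
{
    int error = 0;

    mutex_enter(&mi->mi_lock);
    /* If another thread is already hunting for a server, wait for it. */
    if (mi->mi_flags & MI_BINDINPROG)
        error = failover_wait(mi);      /* 0, or EINTR if interrupted */
    mutex_exit(&mi->mi_lock);
    return (error);
}
#endif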
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *   pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
    vnode_t *vp, *nvp, *rootvp;
    rnode_t *rp, *nrp;
    mntinfo_t *mi;
    int error;
#ifdef DEBUG
    struct nfs_clnt *nfscl;

    nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
    ASSERT(nfscl != NULL);
#endif
    /*
     * Sanity check
     */
    if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
        return (EINVAL);
    vp = fi->vp;
    rp = VTOR(vp);
    mi = VTOMI(vp);

    if (!(vp->v_flag & VROOT)) {
        /*
         * Given the root fh, use the path stored in
         * the rnode to find the fh for the new server.
         */
        error = VFS_ROOT(mi->mi_vfsp, &rootvp);
        if (error)
            return (error);

        error = failover_lookup(rp->r_path, rootvp,
            fi->lookupproc, fi->xattrdirproc, &nvp);

        VN_RELE(rootvp);

        if (error)
            return (error);

        /*
         * If we found the same rnode, we're done now.
         */
        if (nvp == vp) {
            /*
             * We failed over, but the new server may physically be
             * the same machine or may share the same disk subsystem.
             * In that case the filehandle for a given file path does
             * not change, so the lookup above located the existing
             * rnode.  All we may need to do is update r_server with
             * the current servinfo.
             */
            if (!VALID_FH(fi)) {
                rp->r_server = mi->mi_curr_serv;
            }
            VN_RELE(nvp);
            return (0);
        }

        /*
         * Try to make it so that no one else will find this
         * vnode because it is just a temporary to hold the
         * new file handle until that file handle can be
         * copied to the original vnode/rnode.
         */
        nrp = VTOR(nvp);
        mutex_enter(&mi->mi_remap_lock);
        /*
         * Some other thread could have raced in here and already
         * done the remap for this rnode before we got the lock.
         * Check rp->r_server against mi->mi_curr_serv and return
         * if they are the same.
         */
        if (VALID_FH(fi)) {
            mutex_exit(&mi->mi_remap_lock);
            VN_RELE(nvp);
            return (0);
        }

        if (nrp->r_flags & RHASHED)
            rp_rmhash(nrp);

        /*
         * As a heuristic check on the validity of the new
         * file, check that the size and type match against
         * what we remember from the old version.
         */
        if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
            mutex_exit(&mi->mi_remap_lock);
            zcmn_err(mi->mi_zone->zone_id, CE_WARN,
                "NFS replicas %s and %s: file %s not same.",
                rp->r_server->sv_hostname,
                nrp->r_server->sv_hostname, rp->r_path);
            VN_RELE(nvp);
            return (EINVAL);
        }

        /*
         * Snarf the filehandle from the new rnode, then release
         * it, updating the hash queues for the rnode as we go.
         */
        if (rp->r_flags & RHASHED)
            rp_rmhash(rp);
        rp->r_server = mi->mi_curr_serv;
        rp->r_fh = nrp->r_fh;
        rp->r_hashq = nrp->r_hashq;
        /*
         * Copy the attributes from the new rnode to the old
         * rnode.  This will help to reduce unnecessary page
         * cache flushes.
         */
        rp->r_attr = nrp->r_attr;
        rp->r_attrtime = nrp->r_attrtime;
        rp->r_mtime = nrp->r_mtime;
        (void) nfs_free_data_reclaim(rp);
        nfs_setswaplike(vp, &rp->r_attr);
        rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
        rp_addhash(rp);
        rw_exit(&rp->r_hashq->r_lock);
        mutex_exit(&mi->mi_remap_lock);
        VN_RELE(nvp);
    }

    /*
     * Update the successful failover remap count.
     */
    mutex_enter(&mi->mi_lock);
    mi->mi_remap++;
    mutex_exit(&mi->mi_lock);
#ifdef DEBUG
    nfscl->nfscl_stat.remap.value.ui64++;
#endif

    /*
     * If we have a copied filehandle to update, do it now.
     */
    if (fi->fhp != NULL && fi->copyproc != NULL)
        (*fi->copyproc)(fi->fhp, vp);

    return (0);
}
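
/*
 * Illustrative sketch (compiled out): how a caller might describe a file
 * to failover_remap().  The failinfo_t members mirror the ones
 * dereferenced above (vp, fhp, copyproc, lookupproc, xattrdirproc), but
 * the helper routines (my_fhcopy, my_lookup, my_xattrdir) and the
 * function itself are hypothetical placeholders.
 */
#if 0
static int
example_remap_after_failover(vnode_t *vp, caddr_t args_fh)
{
    failinfo_t fi;

    fi.vp = vp;                     /* vnode whose filehandle went stale */
    fi.fhp = args_fh;               /* filehandle embedded in the RPC args */
    fi.copyproc = my_fhcopy;        /* copies vp's new fh back into args_fh */
    fi.lookupproc = my_lookup;      /* per-version lookup routine */
    fi.xattrdirproc = my_xattrdir;  /* per-version xattr directory lookup */

    /* Re-derive the filehandle on the newly selected server. */
    return (failover_remap(&fi));
}
#endif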
/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created as
 * rnodes were made, so we know we have only to deal with paths
 * that look like:
 *      dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, and ENOTDIR
 * are hard errors, because they mean something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
    vnode_t *dvp, *nvp;
    int error = EINVAL;
    char *s, *p, *tmppath;
    size_t len;
    mntinfo_t *mi;
    bool_t xattr;

    /* Make local copy of path */
    len = strlen(path) + 1;
    tmppath = kmem_alloc(len, KM_SLEEP);
    (void) strcpy(tmppath, path);
    s = tmppath;

    dvp = root;
    VN_HOLD(dvp);
    mi = VTOMI(root);
    xattr = mi->mi_flags & MI_EXTATTR;

    do {
        p = strchr(s, '/');
        if (p != NULL)
            *p = '\0';
        if (xattr && strcmp(s, XATTR_RPATH) == 0) {
            error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
                RFSCALL_SOFT);
        } else {
            error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
                CRED(), RFSCALL_SOFT);
        }
        if (p != NULL)
            *p++ = '/';
        if (error) {
            VN_RELE(dvp);
            kmem_free(tmppath, len);
            return (error);
        }
        s = p;
        VN_RELE(dvp);
        dvp = nvp;
    } while (p != NULL);

    if (nvp != NULL && new != NULL)
        *new = nvp;
    kmem_free(tmppath, len);
    return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
    servinfo_t *next;
    struct knetconfig *knconf;

    while (svp != NULL) {
        next = svp->sv_next;
        if (svp->sv_secdata)
            sec_clnt_freeinfo(svp->sv_secdata);
        if (svp->sv_hostname && svp->sv_hostnamelen > 0)
            kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
        knconf = svp->sv_knconf;
        if (knconf != NULL) {
            if (knconf->knc_protofmly != NULL)
                kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
            if (knconf->knc_proto != NULL)
                kmem_free(knconf->knc_proto, KNC_STRSIZE);
            kmem_free(knconf, sizeof (*knconf));
        }
        knconf = svp->sv_origknconf;
        if (knconf != NULL) {
            if (knconf->knc_protofmly != NULL)
                kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
            if (knconf->knc_proto != NULL)
                kmem_free(knconf->knc_proto, KNC_STRSIZE);
            kmem_free(knconf, sizeof (*knconf));
        }
        if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
            kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
        mutex_destroy(&svp->sv_lock);
        kmem_free(svp, sizeof (*svp));
        svp = next;
    }
}

/*
 * Can only return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

    mutex_enter(&l->lock);

    /*
     * If this is a nested enter, then allow it.  There
     * must be as many exits as there were enters.
     */
    if (l->owner == curthread) {
        /* lock is held for writing by current thread */
        ASSERT(rw == RW_READER || rw == RW_WRITER);
        l->count--;
    } else if (rw == RW_READER) {
        /*
         * While there is a writer active or writers waiting,
         * then wait for them to finish up and move on.  Then,
         * increment the count to indicate that a reader is
         * active.
         */
        while (l->count < 0 || l->waiters > 0) {
            if (intr) {
                klwp_t *lwp = ttolwp(curthread);

                if (lwp != NULL)
                    lwp->lwp_nostop++;
                if (!cv_wait_sig(&l->cv, &l->lock)) {
                    if (lwp != NULL)
                        lwp->lwp_nostop--;
                    mutex_exit(&l->lock);
                    return (EINTR);
                }
                if (lwp != NULL)
                    lwp->lwp_nostop--;
            } else
                cv_wait(&l->cv, &l->lock);
        }
        ASSERT(l->count < INT_MAX);
#ifdef DEBUG
        if ((l->count % 10000) == 9999)
            cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
                "rwlock @ %p\n", l->count, (void *)l);
#endif
        l->count++;
    } else {
        ASSERT(rw == RW_WRITER);
        /*
         * While there are readers active or a writer
         * active, then wait for all of the readers
         * to finish or for the writer to finish.
         * Then, set the owner field to curthread and
         * decrement count to indicate that a writer
         * is active.
         */
        while (l->count > 0 || l->owner != NULL) {
            l->waiters++;
            if (intr) {
                klwp_t *lwp = ttolwp(curthread);

                if (lwp != NULL)
                    lwp->lwp_nostop++;
                if (!cv_wait_sig(&l->cv, &l->lock)) {
                    if (lwp != NULL)
                        lwp->lwp_nostop--;
                    l->waiters--;
                    cv_broadcast(&l->cv);
                    mutex_exit(&l->lock);
                    return (EINTR);
                }
                if (lwp != NULL)
                    lwp->lwp_nostop--;
            } else
                cv_wait(&l->cv, &l->lock);
            l->waiters--;
        }
        l->owner = curthread;
        l->count--;
    }

    mutex_exit(&l->lock);

    return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
    mutex_enter(&l->lock);

    /*
     * If this is a nested enter, then allow it.  There
     * must be as many exits as there were enters.
     */
    if (l->owner == curthread) {
        /* lock is held for writing by current thread */
        ASSERT(rw == RW_READER || rw == RW_WRITER);
        l->count--;
    } else if (rw == RW_READER) {
        /*
         * If there is a writer active or writers waiting, deny the
         * lock.  Otherwise, bump the count of readers.
         */
        if (l->count < 0 || l->waiters > 0) {
            mutex_exit(&l->lock);
            return (0);
        }
        l->count++;
    } else {
        ASSERT(rw == RW_WRITER);
        /*
         * If there are readers active or a writer active, deny the
         * lock.  Otherwise, set the owner field to curthread and
         * decrement count to indicate that a writer is active.
         */
        if (l->count > 0 || l->owner != NULL) {
            mutex_exit(&l->lock);
            return (0);
        }
        l->owner = curthread;
        l->count--;
    }

    mutex_exit(&l->lock);

    return (1);
}
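
/*
 * Illustrative sketch (compiled out): the enter/exit discipline for the
 * nfs_rwlock_t interface above, using the rnode's r_rwlock and the INTR()
 * convenience macro as typical clients do.  The function shown is
 * hypothetical.
 */
#if 0
static int
example_read_locked_op(vnode_t *vp)
{
    rnode_t *rp = VTOR(vp);
    int error;

    /* Interruptible reader lock; EINTR is possible only when INTR(vp) is set. */
    error = nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp));
    if (error)
        return (error);

    /* ... read-side work protected by the lock goes here ... */

    nfs_rw_exit(&rp->r_rwlock);
    return (0);
}
#endif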
void
nfs_rw_exit(nfs_rwlock_t *l)
{

    mutex_enter(&l->lock);
    /*
     * If this is releasing a writer lock, then increment count to
     * indicate that there is one less writer active.  If this was
     * the last of possibly nested writer locks, then clear the owner
     * field as well to indicate that there is no writer active
     * and wake up any possible waiting writers or readers.
     *
     * If releasing a reader lock, then just decrement count to
     * indicate that there is one less reader active.  If this was
     * the last active reader and there are writer(s) waiting,
     * then wake up the first.
     */
    if (l->owner != NULL) {
        ASSERT(l->owner == curthread);
        l->count++;
        if (l->count == 0) {
            l->owner = NULL;
            cv_broadcast(&l->cv);
        }
    } else {
        ASSERT(l->count > 0);
        l->count--;
        if (l->count == 0 && l->waiters > 0)
            cv_broadcast(&l->cv);
    }
    mutex_exit(&l->lock);
}

int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{

    if (rw == RW_READER)
        return (l->count > 0);
    ASSERT(rw == RW_WRITER);
    return (l->count < 0);
}

/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{

    l->count = 0;
    l->waiters = 0;
    l->owner = NULL;
    mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
}

void
nfs_rw_destroy(nfs_rwlock_t *l)
{

    mutex_destroy(&l->lock);
    cv_destroy(&l->cv);
}

int
nfs3_rddir_compar(const void *x, const void *y)
{
    rddir_cache *a = (rddir_cache *)x;
    rddir_cache *b = (rddir_cache *)y;

    if (a->nfs3_cookie == b->nfs3_cookie) {
        if (a->buflen == b->buflen)
            return (0);
        if (a->buflen < b->buflen)
            return (-1);
        return (1);
    }

    if (a->nfs3_cookie < b->nfs3_cookie)
        return (-1);

    return (1);
}

int
nfs_rddir_compar(const void *x, const void *y)
{
    rddir_cache *a = (rddir_cache *)x;
    rddir_cache *b = (rddir_cache *)y;

    if (a->nfs_cookie == b->nfs_cookie) {
        if (a->buflen == b->buflen)
            return (0);
        if (a->buflen < b->buflen)
            return (-1);
        return (1);
    }

    if (a->nfs_cookie < b->nfs_cookie)
        return (-1);

    return (1);
}
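
/*
 * Illustrative sketch (compiled out): how a comparator of this shape is
 * typically handed to the kernel AVL code (see <sys/avl.h>), which is why
 * it returns exactly -1, 0, or 1.  The tree pointer and the linkage-field
 * name ("tree") are assumptions made for the example, not a claim about
 * the actual rddir_cache layout.
 */
#if 0
static void
example_rddir_avl_init(avl_tree_t *avl)
{
    /* Order readdir cache entries by cookie, then by buffer length. */
    avl_create(avl, nfs3_rddir_compar, sizeof (rddir_cache),
        offsetof(rddir_cache, tree));
}
#endif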
static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
    servinfo_t *s;
    char *srvnames;
    char *namep;
    size_t length;

    /*
     * Calculate the length of the string required to hold all
     * of the server names plus either a comma or a null
     * character following each individual one.
     */
    length = 0;
    for (s = mi->mi_servers; s != NULL; s = s->sv_next)
        length += s->sv_hostnamelen;

    srvnames = kmem_alloc(length, KM_SLEEP);

    namep = srvnames;
    for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
        (void) strcpy(namep, s->sv_hostname);
        namep += s->sv_hostnamelen - 1;
        *namep++ = ',';
    }
    *--namep = '\0';

    *len = length;

    return (srvnames);
}

/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
    return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}

zoneid_t
nfs_zoneid(void)
{
    return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}

/*
 * nfs_mount_label_policy:
 *      Determine whether the mount is allowed according to MAC check,
 *      by comparing (where appropriate) the label of the remote server
 *      against the label of the zone being mounted into.
 *
 * Returns:
 *       0 :    access allowed
 *      -1 :    read-only access allowed (i.e., read-down)
 *      >0 :    error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
    int addr_type;
    void *ipaddr;
    bslabel_t *server_sl, *mntlabel;
    zone_t *mntzone = NULL;
    ts_label_t *zlabel;
    tsol_tpc_t *tp;
    ts_label_t *tsl = NULL;
    int retv;

    /*
     * Get the zone's label.  Each zone on a labeled system has a label.
     */
    mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
    zlabel = mntzone->zone_slabel;
    ASSERT(zlabel != NULL);
    label_hold(zlabel);

    if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
        addr_type = IPV4_VERSION;
        ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
    } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
        addr_type = IPV6_VERSION;
        ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
    } else {
        retv = 0;
        goto out;
    }

    retv = EACCES;                      /* assume the worst */

    /*
     * Next, get the assigned label of the remote server.
     */
    tp = find_tpc(ipaddr, addr_type, B_FALSE);
    if (tp == NULL)
        goto out;                       /* error getting host entry */

    if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
        goto rel_tpc;                   /* invalid domain */
    if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
        (tp->tpc_tp.host_type != UNLABELED))
        goto rel_tpc;                   /* invalid hosttype */

    if (tp->tpc_tp.host_type == SUN_CIPSO) {
        tsl = getflabel_cipso(vfsp);
        if (tsl == NULL)
            goto rel_tpc;               /* error getting server lbl */

        server_sl = label2bslabel(tsl);
    } else {                            /* UNLABELED */
        server_sl = &tp->tpc_tp.tp_def_label;
    }

    mntlabel = label2bslabel(zlabel);

    /*
     * Now compare labels to complete the MAC check.  If the labels
     * are equal or if the requestor is in the global zone and has
     * NET_MAC_AWARE, then allow read-write access.  (Except for
     * mounts into the global zone itself; restrict these to
     * read-only.)
     *
     * If the requestor is in some other zone, but its label
     * dominates the server, then allow read-down.
     *
     * Otherwise, access is denied.
     */
    if (blequal(mntlabel, server_sl) ||
        (crgetzoneid(cr) == GLOBAL_ZONEID &&
        getpflags(NET_MAC_AWARE, cr) != 0)) {
        if ((mntzone == global_zone) ||
            !blequal(mntlabel, server_sl))
            retv = -1;                  /* read-only */
        else
            retv = 0;                   /* access OK */
    } else if (bldominates(mntlabel, server_sl)) {
        retv = -1;                      /* read-only */
    } else {
        retv = EACCES;
    }

    if (tsl != NULL)
        label_rele(tsl);

rel_tpc:
    TPC_RELE(tp);
out:
    if (mntzone)
        zone_rele(mntzone);
    label_rele(zlabel);
    return (retv);
}
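
/*
 * Illustrative sketch (compiled out): how a mount path might act on the
 * three documented return values.  vfs_setmntopt() and MNTOPT_RO are the
 * standard VFS facilities for forcing a read-only mount; the surrounding
 * function is hypothetical.
 */
#if 0
static int
example_check_mount_label(vfs_t *vfsp, servinfo_t *svp, cred_t *cr)
{
    int rc;

    rc = nfs_mount_label_policy(vfsp, &svp->sv_addr, svp->sv_knconf, cr);
    if (rc > 0)
        return (rc);                    /* MAC check failed, e.g. EACCES */
    if (rc == -1)                       /* read-down: allow, but read-only */
        vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
    return (0);                         /* labels equal: full access */
}
#endif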
boolean_t
nfs_has_ctty(void)
{
    boolean_t rv;

    mutex_enter(&curproc->p_splock);
    rv = (curproc->p_sessp->s_vp != NULL);
    mutex_exit(&curproc->p_splock);
    return (rv);
}

/*
 * TX NFS routine used by NFSv3 and NFSv4 to do the label check
 * on the client label and the server's file object label.
 */
boolean_t
do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag)
{
    bslabel_t *slabel;
    ts_label_t *tslabel;
    boolean_t result;

    if ((tslabel = nfs_getflabel(vp)) == NULL) {
        return (B_FALSE);
    }
    slabel = label2bslabel(tslabel);
    DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
        "comparing server's file label(1) with client label(2) (vp(3))",
        bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);

    if (flag == EQUALITY_CHECK)
        result = blequal(clabel, slabel);
    else
        result = bldominates(clabel, slabel);
    label_rele(tslabel);
    return (result);
}
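
/*
 * Illustrative sketch (compiled out): the server-side call pattern for
 * do_rfs_label_check().  The DOMINANCE_CHECK flag and the assumption that
 * the client's label arrives via the svc_req (req->rq_label) reflect how
 * the NFSv3/NFSv4 services are expected to use this routine; the function
 * itself is hypothetical.
 */
#if 0
static enum nfsstat3
example_label_gate(struct svc_req *req, vnode_t *vp)
{
    bslabel_t *clabel;

    if (!is_system_labeled())
        return (NFS3_OK);

    clabel = req->rq_label;                     /* assumed label source */
    if (clabel != NULL &&
        !do_rfs_label_check(clabel, vp, DOMINANCE_CHECK))
        return (NFS3ERR_ACCES);                 /* client may not see vp */

    return (NFS3_OK);
}
#endif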