/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred_impl.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode, using it, and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
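 * For illustration only, the lookup side of that protocol looks roughly
 * like the following sketch (error handling omitted; the r_freef test for
 * freelist membership is an assumption about the rnode layout here, see
 * rfind() and rp_rmfree() for the real code):
 *
 *	rw_enter(&rtable[index].r_lock, RW_READER);
 *	rp = rfind(&rtable[index], &nfh, vfsp);
 *	if (rp != NULL) {
 *		mutex_enter(&rpfreelist_lock);
 *		if (rp->r_freef != NULL)
 *			rp_rmfree(rp);		(reuse the freelist's hold)
 *		else
 *			VN_HOLD(RTOV(rp));
 *		mutex_exit(&rpfreelist_lock);
 *	}
 *	rw_exit(&rtable[index].r_lock);
 *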
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and don't correspond to any one particular zone.
 */
#ifdef DEBUG
static struct clstat_debug {
	kstat_named_t	nrnode;		/* number of allocated rnodes */
	kstat_named_t	access;		/* size of access cache */
	kstat_named_t	dirent;		/* size of readdir cache */
	kstat_named_t	dirents;	/* size of readdir buf cache */
	kstat_named_t	reclaim;	/* number of reclaims */
	kstat_named_t	clreclaim;	/* number of cl reclaims */
	kstat_named_t	f_reclaim;	/* number of free reclaims */
	kstat_named_t	a_reclaim;	/* number of active reclaims */
	kstat_named_t	r_reclaim;	/* number of rnode reclaims */
	kstat_named_t	rpath;		/* bytes used to store rpaths */
} clstat_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif	/* DEBUG */

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 */
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;

static struct kmem_cache *chtab_cache;

/*
 * Some servers do not properly update the attributes of the
 * directory when changes are made.  To allow interoperability
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter must be set in /etc/system
 */
int nfs_disable_rddir_cache = 0;

int		clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **);
void		clfree(CLIENT *, struct chtab *);
static int	acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static int	nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
		    struct chtab **, struct nfs_clnt *);
static void	clreclaim(void *);
static int	nfs_feedback(int, int, mntinfo_t *);
static int	rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, enum clnt_stat *, int,
		    failinfo_t *);
static int	aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
		    caddr_t, cred_t *, int *, int, failinfo_t *);
static void	rinactive(rnode_t *, cred_t *);
static int	rtablehash(nfs_fhandle *);
static vnode_t	*make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
		    struct vnodeops *,
		    int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
			cred_t *),
		    int (*)(const void *, const void *), int *, cred_t *,
		    char *, char *);
static void	rp_rmfree(rnode_t *);
static void	rp_addhash(rnode_t *);
static void	rp_rmhash_locked(rnode_t *);
static rnode_t	*rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void	destroy_rnode(rnode_t *);
static void	rddir_cache_free(rddir_cache *);
static int	nfs_free_data_reclaim(rnode_t *);
static int	nfs_active_data_reclaim(rnode_t *);
static int	nfs_free_reclaim(void);
static int	nfs_active_reclaim(void);
static int	nfs_rnode_reclaim(void);
static void	nfs_reclaim(void *);
static int	failover_safe(failinfo_t *);
static void	failover_newserver(mntinfo_t *mi);
static void	failover_thread(mntinfo_t *mi);
static int	failover_wait(mntinfo_t *);
static int	failover_remap(failinfo_t *);
static int	failover_lookup(char *, vnode_t *,
		    int (*)(vnode_t *,
char *, vnode_t **, 271 struct pathname *, int, vnode_t *, cred_t *, int), 272 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 273 vnode_t **); 274 static void nfs_free_r_path(rnode_t *); 275 static void nfs_set_vroot(vnode_t *); 276 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 277 278 /* 279 * from rpcsec module (common/rpcsec) 280 */ 281 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 282 extern void sec_clnt_freeh(AUTH *); 283 extern void sec_clnt_freeinfo(struct sec_data *); 284 285 /* 286 * used in mount policy 287 */ 288 extern ts_label_t *getflabel_cipso(vfs_t *); 289 290 /* 291 * EIO or EINTR are not recoverable errors. 292 */ 293 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 294 295 /* 296 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 297 */ 298 static int 299 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 300 struct chtab **chp, struct nfs_clnt *nfscl) 301 { 302 struct chhead *ch, *newch; 303 struct chhead **plistp; 304 struct chtab *cp; 305 int error; 306 k_sigset_t smask; 307 308 if (newcl == NULL || chp == NULL || ci == NULL) 309 return (EINVAL); 310 311 *newcl = NULL; 312 *chp = NULL; 313 314 /* 315 * Find an unused handle or create one 316 */ 317 newch = NULL; 318 nfscl->nfscl_stat.clgets.value.ui64++; 319 top: 320 /* 321 * Find the correct entry in the cache to check for free 322 * client handles. The search is based on the RPC program 323 * number, program version number, dev_t for the transport 324 * device, and the protocol family. 325 */ 326 mutex_enter(&nfscl->nfscl_chtable_lock); 327 plistp = &nfscl->nfscl_chtable; 328 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 329 if (ch->ch_prog == ci->cl_prog && 330 ch->ch_vers == ci->cl_vers && 331 ch->ch_dev == svp->sv_knconf->knc_rdev && 332 (strcmp(ch->ch_protofmly, 333 svp->sv_knconf->knc_protofmly) == 0)) 334 break; 335 plistp = &ch->ch_next; 336 } 337 338 /* 339 * If we didn't find a cache entry for this quadruple, then 340 * create one. If we don't have one already preallocated, 341 * then drop the cache lock, create one, and then start over. 342 * If we did have a preallocated entry, then just add it to 343 * the front of the list. 344 */ 345 if (ch == NULL) { 346 if (newch == NULL) { 347 mutex_exit(&nfscl->nfscl_chtable_lock); 348 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 349 newch->ch_timesused = 0; 350 newch->ch_prog = ci->cl_prog; 351 newch->ch_vers = ci->cl_vers; 352 newch->ch_dev = svp->sv_knconf->knc_rdev; 353 newch->ch_protofmly = kmem_alloc( 354 strlen(svp->sv_knconf->knc_protofmly) + 1, 355 KM_SLEEP); 356 (void) strcpy(newch->ch_protofmly, 357 svp->sv_knconf->knc_protofmly); 358 newch->ch_list = NULL; 359 goto top; 360 } 361 ch = newch; 362 newch = NULL; 363 ch->ch_next = nfscl->nfscl_chtable; 364 nfscl->nfscl_chtable = ch; 365 /* 366 * We found a cache entry, but if it isn't on the front of the 367 * list, then move it to the front of the list to try to take 368 * advantage of locality of operations. 369 */ 370 } else if (ch != nfscl->nfscl_chtable) { 371 *plistp = ch->ch_next; 372 ch->ch_next = nfscl->nfscl_chtable; 373 nfscl->nfscl_chtable = ch; 374 } 375 376 /* 377 * If there was a free client handle cached, then remove it 378 * from the list, init it, and use it. 
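	 * Reusing a cached handle is not entirely free: clnt_tli_kinit()
	 * below rebinds it to the current server address and resets its
	 * XID, and sec_clnt_geth() attaches a fresh auth handle, so a
	 * recycled CLIENT behaves like a newly created one.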
379 */ 380 if (ch->ch_list != NULL) { 381 cp = ch->ch_list; 382 ch->ch_list = cp->ch_list; 383 mutex_exit(&nfscl->nfscl_chtable_lock); 384 if (newch != NULL) { 385 kmem_free(newch->ch_protofmly, 386 strlen(newch->ch_protofmly) + 1); 387 kmem_free(newch, sizeof (*newch)); 388 } 389 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 390 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 391 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 392 &cp->ch_client->cl_auth); 393 if (error || cp->ch_client->cl_auth == NULL) { 394 CLNT_DESTROY(cp->ch_client); 395 kmem_cache_free(chtab_cache, cp); 396 return ((error != 0) ? error : EINTR); 397 } 398 ch->ch_timesused++; 399 *newcl = cp->ch_client; 400 *chp = cp; 401 return (0); 402 } 403 404 /* 405 * There weren't any free client handles which fit, so allocate 406 * a new one and use that. 407 */ 408 #ifdef DEBUG 409 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 410 #endif 411 mutex_exit(&nfscl->nfscl_chtable_lock); 412 413 nfscl->nfscl_stat.cltoomany.value.ui64++; 414 if (newch != NULL) { 415 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 416 kmem_free(newch, sizeof (*newch)); 417 } 418 419 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 420 cp->ch_head = ch; 421 422 sigintr(&smask, (int)ci->cl_flags & MI_INT); 423 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 424 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 425 sigunintr(&smask); 426 427 if (error != 0) { 428 kmem_cache_free(chtab_cache, cp); 429 #ifdef DEBUG 430 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 431 #endif 432 /* 433 * Warning is unnecessary if error is EINTR. 434 */ 435 if (error != EINTR) { 436 nfs_cmn_err(error, CE_WARN, 437 "clget: couldn't create handle: %m\n"); 438 } 439 return (error); 440 } 441 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 442 auth_destroy(cp->ch_client->cl_auth); 443 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 444 &cp->ch_client->cl_auth); 445 if (error || cp->ch_client->cl_auth == NULL) { 446 CLNT_DESTROY(cp->ch_client); 447 kmem_cache_free(chtab_cache, cp); 448 #ifdef DEBUG 449 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 450 #endif 451 return ((error != 0) ? error : EINTR); 452 } 453 ch->ch_timesused++; 454 *newcl = cp->ch_client; 455 ASSERT(cp->ch_client->cl_nosignal == FALSE); 456 *chp = cp; 457 return (0); 458 } 459 460 int 461 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 462 struct chtab **chp) 463 { 464 struct nfs_clnt *nfscl; 465 466 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 467 ASSERT(nfscl != NULL); 468 469 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 470 } 471 472 static int 473 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 474 struct chtab **chp, struct nfs_clnt *nfscl) 475 { 476 clinfo_t ci; 477 int error; 478 479 /* 480 * Set read buffer size to rsize 481 * and add room for RPC headers. 482 */ 483 ci.cl_readsize = mi->mi_tsize; 484 if (ci.cl_readsize != 0) 485 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 486 487 /* 488 * If soft mount and server is down just try once. 489 * meaning: do not retransmit. 490 */ 491 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 492 ci.cl_retrans = 0; 493 else 494 ci.cl_retrans = mi->mi_retrans; 495 496 ci.cl_prog = NFS_ACL_PROGRAM; 497 ci.cl_vers = mi->mi_vers; 498 ci.cl_flags = mi->mi_flags; 499 500 /* 501 * clget calls sec_clnt_geth() to get an auth handle. 
For RPCSEC_GSS 502 * security flavor, the client tries to establish a security context 503 * by contacting the server. If the connection is timed out or reset, 504 * e.g. server reboot, we will try again. 505 */ 506 do { 507 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 508 509 if (error == 0) 510 break; 511 512 /* 513 * For forced unmount or zone shutdown, bail out, no retry. 514 */ 515 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 516 error = EIO; 517 break; 518 } 519 520 /* do not retry for softmount */ 521 if (!(mi->mi_flags & MI_HARD)) 522 break; 523 524 /* let the caller deal with the failover case */ 525 if (FAILOVER_MOUNT(mi)) 526 break; 527 528 } while (error == ETIMEDOUT || error == ECONNRESET); 529 530 return (error); 531 } 532 533 static int 534 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 535 struct chtab **chp, struct nfs_clnt *nfscl) 536 { 537 clinfo_t ci; 538 int error; 539 540 /* 541 * Set read buffer size to rsize 542 * and add room for RPC headers. 543 */ 544 ci.cl_readsize = mi->mi_tsize; 545 if (ci.cl_readsize != 0) 546 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 547 548 /* 549 * If soft mount and server is down just try once. 550 * meaning: do not retransmit. 551 */ 552 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 553 ci.cl_retrans = 0; 554 else 555 ci.cl_retrans = mi->mi_retrans; 556 557 ci.cl_prog = mi->mi_prog; 558 ci.cl_vers = mi->mi_vers; 559 ci.cl_flags = mi->mi_flags; 560 561 /* 562 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 563 * security flavor, the client tries to establish a security context 564 * by contacting the server. If the connection is timed out or reset, 565 * e.g. server reboot, we will try again. 566 */ 567 do { 568 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 569 570 if (error == 0) 571 break; 572 573 /* 574 * For forced unmount or zone shutdown, bail out, no retry. 575 */ 576 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 577 error = EIO; 578 break; 579 } 580 581 /* do not retry for softmount */ 582 if (!(mi->mi_flags & MI_HARD)) 583 break; 584 585 /* let the caller deal with the failover case */ 586 if (FAILOVER_MOUNT(mi)) 587 break; 588 589 } while (error == ETIMEDOUT || error == ECONNRESET); 590 591 return (error); 592 } 593 594 static void 595 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 596 { 597 if (cl->cl_auth != NULL) { 598 sec_clnt_freeh(cl->cl_auth); 599 cl->cl_auth = NULL; 600 } 601 602 /* 603 * Timestamp this cache entry so that we know when it was last 604 * used. 605 */ 606 cp->ch_freed = gethrestime_sec(); 607 608 /* 609 * Add the free client handle to the front of the list. 610 * This way, the list will be sorted in youngest to oldest 611 * order. 
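	 * Callers hand handles back here through clfree(); a typical
	 * consumer brackets each RPC with the get/free pair.  A minimal
	 * sketch of the intended usage (clinfo_t setup and error handling
	 * omitted) looks like:
	 *
	 *	error = clget(&ci, svp, cr, &client, &ch);
	 *	if (error == 0) {
	 *		status = CLNT_CALL(client, which, xdrargs, argsp,
	 *		    xdrres, resp, waittime);
	 *		clfree(client, ch);
	 *	}
	 *
	 * which keeps the most recently used handles at the head of the
	 * per-quadruple list for quick reuse.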
	 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
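	 * Only handles that have sat unused on their freelists for more
	 * than CL_HOLDTIME (60) seconds are destroyed; anything freed more
	 * recently stays cached for reuse by the next clget().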
733 */ 734 mutex_enter(&nfs_clnt_list_lock); 735 nfscl = list_head(&nfs_clnt_list); 736 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 737 clreclaim_zone(nfscl, CL_HOLDTIME); 738 mutex_exit(&nfs_clnt_list_lock); 739 } 740 741 /* 742 * Minimum time-out values indexed by call type 743 * These units are in "eights" of a second to avoid multiplies 744 */ 745 static unsigned int minimum_timeo[] = { 746 6, 7, 10 747 }; 748 749 /* 750 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 751 */ 752 #define MAXTIMO (20*hz) 753 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 754 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 755 756 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 757 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 758 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 759 760 /* 761 * Function called when rfscall notices that we have been 762 * re-transmitting, or when we get a response without retransmissions. 763 * Return 1 if the transfer size was adjusted down - 0 if no change. 764 */ 765 static int 766 nfs_feedback(int flag, int which, mntinfo_t *mi) 767 { 768 int kind; 769 int r = 0; 770 771 mutex_enter(&mi->mi_lock); 772 if (flag == FEEDBACK_REXMIT1) { 773 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 774 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 775 goto done; 776 if (mi->mi_curread > MIN_NFS_TSIZE) { 777 mi->mi_curread /= 2; 778 if (mi->mi_curread < MIN_NFS_TSIZE) 779 mi->mi_curread = MIN_NFS_TSIZE; 780 r = 1; 781 } 782 783 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 784 mi->mi_curwrite /= 2; 785 if (mi->mi_curwrite < MIN_NFS_TSIZE) 786 mi->mi_curwrite = MIN_NFS_TSIZE; 787 r = 1; 788 } 789 } else if (flag == FEEDBACK_OK) { 790 kind = mi->mi_timer_type[which]; 791 if (kind == 0 || 792 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 793 goto done; 794 if (kind == 1) { 795 if (mi->mi_curread >= mi->mi_tsize) 796 goto done; 797 mi->mi_curread += MIN_NFS_TSIZE; 798 if (mi->mi_curread > mi->mi_tsize/2) 799 mi->mi_curread = mi->mi_tsize; 800 } else if (kind == 2) { 801 if (mi->mi_curwrite >= mi->mi_stsize) 802 goto done; 803 mi->mi_curwrite += MIN_NFS_TSIZE; 804 if (mi->mi_curwrite > mi->mi_stsize/2) 805 mi->mi_curwrite = mi->mi_stsize; 806 } 807 } 808 done: 809 mutex_exit(&mi->mi_lock); 810 return (r); 811 } 812 813 #ifdef DEBUG 814 static int rfs2call_hits = 0; 815 static int rfs2call_misses = 0; 816 #endif 817 818 int 819 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 820 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 821 enum nfsstat *statusp, int flags, failinfo_t *fi) 822 { 823 int rpcerror; 824 enum clnt_stat rpc_status; 825 826 ASSERT(statusp != NULL); 827 828 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 829 cr, douprintf, &rpc_status, flags, fi); 830 if (!rpcerror) { 831 /* 832 * See crnetadjust() for comments. 
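		 * Briefly: if the server rejected the call with NFSERR_ACCES,
		 * retry it once with the alternate cred that crnetadjust()
		 * returns (and free that cred afterwards); under DEBUG the
		 * hit/miss counters above track how often this helps.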
833 */ 834 if (*statusp == NFSERR_ACCES && 835 (cr = crnetadjust(cr)) != NULL) { 836 #ifdef DEBUG 837 rfs2call_hits++; 838 #endif 839 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 840 resp, cr, douprintf, NULL, flags, fi); 841 crfree(cr); 842 #ifdef DEBUG 843 if (*statusp == NFSERR_ACCES) 844 rfs2call_misses++; 845 #endif 846 } 847 } else if (rpc_status == RPC_PROCUNAVAIL) { 848 *statusp = NFSERR_OPNOTSUPP; 849 rpcerror = 0; 850 } 851 852 return (rpcerror); 853 } 854 855 #define NFS3_JUKEBOX_DELAY 10 * hz 856 857 static clock_t nfs3_jukebox_delay = 0; 858 859 #ifdef DEBUG 860 static int rfs3call_hits = 0; 861 static int rfs3call_misses = 0; 862 #endif 863 864 int 865 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 866 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 867 nfsstat3 *statusp, int flags, failinfo_t *fi) 868 { 869 int rpcerror; 870 int user_informed; 871 872 user_informed = 0; 873 do { 874 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 875 cr, douprintf, NULL, flags, fi); 876 if (!rpcerror) { 877 cred_t *crr; 878 if (*statusp == NFS3ERR_JUKEBOX) { 879 if (ttoproc(curthread) == &p0) { 880 rpcerror = EAGAIN; 881 break; 882 } 883 if (!user_informed) { 884 user_informed = 1; 885 uprintf( 886 "file temporarily unavailable on the server, retrying...\n"); 887 } 888 delay(nfs3_jukebox_delay); 889 } 890 /* 891 * See crnetadjust() for comments. 892 */ 893 else if (*statusp == NFS3ERR_ACCES && 894 (crr = crnetadjust(cr)) != NULL) { 895 #ifdef DEBUG 896 rfs3call_hits++; 897 #endif 898 rpcerror = rfscall(mi, which, xdrargs, argsp, 899 xdrres, resp, crr, douprintf, 900 NULL, flags, fi); 901 902 crfree(crr); 903 #ifdef DEBUG 904 if (*statusp == NFS3ERR_ACCES) 905 rfs3call_misses++; 906 #endif 907 } 908 } 909 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 910 911 return (rpcerror); 912 } 913 914 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 915 #define INC_READERS(mi) { \ 916 mi->mi_readers++; \ 917 } 918 #define DEC_READERS(mi) { \ 919 mi->mi_readers--; \ 920 if (mi->mi_readers == 0) \ 921 cv_broadcast(&mi->mi_failover_cv); \ 922 } 923 924 static int 925 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 926 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 927 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 928 { 929 CLIENT *client; 930 struct chtab *ch; 931 cred_t *cr = icr; 932 enum clnt_stat status; 933 struct rpc_err rpcerr; 934 struct timeval wait; 935 int timeo; /* in units of hz */ 936 int my_rsize, my_wsize; 937 bool_t tryagain; 938 bool_t cred_cloned = FALSE; 939 k_sigset_t smask; 940 servinfo_t *svp; 941 struct nfs_clnt *nfscl; 942 zoneid_t zoneid = getzoneid(); 943 #ifdef DEBUG 944 char *bufp; 945 #endif 946 947 948 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 949 "rfscall_start:which %d mi %p", which, mi); 950 951 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 952 ASSERT(nfscl != NULL); 953 954 nfscl->nfscl_stat.calls.value.ui64++; 955 mi->mi_reqs[which].value.ui64++; 956 957 rpcerr.re_status = RPC_SUCCESS; 958 959 /* 960 * In case of forced unmount or zone shutdown, return EIO. 961 */ 962 963 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 964 rpcerr.re_status = RPC_FAILED; 965 rpcerr.re_errno = EIO; 966 return (rpcerr.re_errno); 967 } 968 969 /* 970 * Remember the transfer sizes in case 971 * nfs_feedback changes them underneath us. 
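	 * my_rsize/my_wsize are compared against mi_curread/mi_curwrite
	 * after each CLNT_CALL; if they no longer match (or FEEDBACK_REXMIT1
	 * just shrank them), rfscall returns ENFS_TRYAGAIN so the vnops
	 * layer can redo the I/O with the new transfer size.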
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "rfscall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);
					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					rpcerr.re_errno = remaperr;
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);

	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
		    (void (*)())NULL, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
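	 * Each pass that times out backs the timer off: timeo = backoff(timeo)
	 * doubles the current value until it reaches MAXTIMO (20 * hz), so
	 * the retry interval roughly goes t, 2t, 4t, ... capped at 20 seconds.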
1085 */ 1086 do { 1087 tryagain = FALSE; 1088 1089 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1090 status = RPC_FAILED; 1091 rpcerr.re_status = RPC_FAILED; 1092 rpcerr.re_errno = EIO; 1093 break; 1094 } 1095 1096 TICK_TO_TIMEVAL(timeo, &wait); 1097 1098 /* 1099 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1100 * and SIGTERM. (Preserving the existing masks). 1101 * Mask out SIGINT if mount option nointr is specified. 1102 */ 1103 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1104 if (!(mi->mi_flags & MI_INT)) 1105 client->cl_nosignal = TRUE; 1106 1107 /* 1108 * If there is a current signal, then don't bother 1109 * even trying to send out the request because we 1110 * won't be able to block waiting for the response. 1111 * Simply assume RPC_INTR and get on with it. 1112 */ 1113 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1114 status = RPC_INTR; 1115 else { 1116 status = CLNT_CALL(client, which, xdrargs, argsp, 1117 xdrres, resp, wait); 1118 } 1119 1120 if (!(mi->mi_flags & MI_INT)) 1121 client->cl_nosignal = FALSE; 1122 /* 1123 * restore original signal mask 1124 */ 1125 sigunintr(&smask); 1126 1127 switch (status) { 1128 case RPC_SUCCESS: 1129 if ((mi->mi_flags & MI_DYNAMIC) && 1130 mi->mi_timer_type[which] != 0 && 1131 (mi->mi_curread != my_rsize || 1132 mi->mi_curwrite != my_wsize)) 1133 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1134 break; 1135 1136 case RPC_INTR: 1137 /* 1138 * There is no way to recover from this error, 1139 * even if mount option nointr is specified. 1140 * SIGKILL, for example, cannot be blocked. 1141 */ 1142 rpcerr.re_status = RPC_INTR; 1143 rpcerr.re_errno = EINTR; 1144 break; 1145 1146 case RPC_UDERROR: 1147 /* 1148 * If the NFS server is local (vold) and 1149 * it goes away then we get RPC_UDERROR. 1150 * This is a retryable error, so we would 1151 * loop, so check to see if the specific 1152 * error was ECONNRESET, indicating that 1153 * target did not exist at all. If so, 1154 * return with RPC_PROGUNAVAIL and 1155 * ECONNRESET to indicate why. 1156 */ 1157 CLNT_GETERR(client, &rpcerr); 1158 if (rpcerr.re_errno == ECONNRESET) { 1159 rpcerr.re_status = RPC_PROGUNAVAIL; 1160 rpcerr.re_errno = ECONNRESET; 1161 break; 1162 } 1163 /*FALLTHROUGH*/ 1164 1165 default: /* probably RPC_TIMEDOUT */ 1166 if (IS_UNRECOVERABLE_RPC(status)) 1167 break; 1168 1169 /* 1170 * increment server not responding count 1171 */ 1172 mutex_enter(&mi->mi_lock); 1173 mi->mi_noresponse++; 1174 mutex_exit(&mi->mi_lock); 1175 #ifdef DEBUG 1176 nfscl->nfscl_stat.noresponse.value.ui64++; 1177 #endif 1178 1179 if (!(mi->mi_flags & MI_HARD)) { 1180 if (!(mi->mi_flags & MI_SEMISOFT) || 1181 (mi->mi_ss_call_type[which] == 0)) 1182 break; 1183 } 1184 1185 /* 1186 * The call is in progress (over COTS). 1187 * Try the CLNT_CALL again, but don't 1188 * print a noisy error message. 1189 */ 1190 if (status == RPC_INPROGRESS) { 1191 tryagain = TRUE; 1192 break; 1193 } 1194 1195 if (flags & RFSCALL_SOFT) 1196 break; 1197 1198 /* 1199 * On zone shutdown, just move on. 1200 */ 1201 if (zone_status_get(curproc->p_zone) >= 1202 ZONE_IS_SHUTTING_DOWN) { 1203 rpcerr.re_status = RPC_FAILED; 1204 rpcerr.re_errno = EIO; 1205 break; 1206 } 1207 1208 /* 1209 * NFS client failover support 1210 * 1211 * If the current server just failed us, we'll 1212 * start the process of finding a new server. 1213 * After that, we can just retry. 
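			 * The old client handle is freed first (clfree_impl
			 * below) so that the jump back to failoverretry picks
			 * up a handle bound to whatever server
			 * failover_newserver() selects.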
1214 */ 1215 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1216 if (svp == mi->mi_curr_serv) 1217 failover_newserver(mi); 1218 clfree_impl(client, ch, nfscl); 1219 goto failoverretry; 1220 } 1221 1222 tryagain = TRUE; 1223 timeo = backoff(timeo); 1224 mutex_enter(&mi->mi_lock); 1225 if (!(mi->mi_flags & MI_PRINTED)) { 1226 mi->mi_flags |= MI_PRINTED; 1227 mutex_exit(&mi->mi_lock); 1228 #ifdef DEBUG 1229 zprintf(zoneid, 1230 "NFS%d server %s not responding still trying\n", 1231 mi->mi_vers, svp->sv_hostname); 1232 #else 1233 zprintf(zoneid, 1234 "NFS server %s not responding still trying\n", 1235 svp->sv_hostname); 1236 #endif 1237 } else 1238 mutex_exit(&mi->mi_lock); 1239 if (*douprintf && nfs_has_ctty()) { 1240 *douprintf = 0; 1241 if (!(mi->mi_flags & MI_NOPRINT)) 1242 #ifdef DEBUG 1243 uprintf( 1244 "NFS%d server %s not responding still trying\n", 1245 mi->mi_vers, svp->sv_hostname); 1246 #else 1247 uprintf( 1248 "NFS server %s not responding still trying\n", 1249 svp->sv_hostname); 1250 #endif 1251 } 1252 1253 /* 1254 * If doing dynamic adjustment of transfer 1255 * size and if it's a read or write call 1256 * and if the transfer size changed while 1257 * retransmitting or if the feedback routine 1258 * changed the transfer size, 1259 * then exit rfscall so that the transfer 1260 * size can be adjusted at the vnops level. 1261 */ 1262 if ((mi->mi_flags & MI_DYNAMIC) && 1263 mi->mi_timer_type[which] != 0 && 1264 (mi->mi_curread != my_rsize || 1265 mi->mi_curwrite != my_wsize || 1266 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1267 /* 1268 * On read or write calls, return 1269 * back to the vnode ops level if 1270 * the transfer size changed. 1271 */ 1272 clfree_impl(client, ch, nfscl); 1273 if (cred_cloned) 1274 crfree(cr); 1275 return (ENFS_TRYAGAIN); 1276 } 1277 } 1278 } while (tryagain); 1279 1280 if (status != RPC_SUCCESS) { 1281 /* 1282 * Let soft mounts use the timed out message. 1283 */ 1284 if (status == RPC_INPROGRESS) 1285 status = RPC_TIMEDOUT; 1286 nfscl->nfscl_stat.badcalls.value.ui64++; 1287 if (status != RPC_INTR) { 1288 mutex_enter(&mi->mi_lock); 1289 mi->mi_flags |= MI_DOWN; 1290 mutex_exit(&mi->mi_lock); 1291 CLNT_GETERR(client, &rpcerr); 1292 #ifdef DEBUG 1293 bufp = clnt_sperror(client, svp->sv_hostname); 1294 zprintf(zoneid, "NFS%d %s failed for %s\n", 1295 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1296 if (nfs_has_ctty()) { 1297 if (!(mi->mi_flags & MI_NOPRINT)) { 1298 uprintf("NFS%d %s failed for %s\n", 1299 mi->mi_vers, mi->mi_rfsnames[which], 1300 bufp); 1301 } 1302 } 1303 kmem_free(bufp, MAXPATHLEN); 1304 #else 1305 zprintf(zoneid, 1306 "NFS %s failed for server %s: error %d (%s)\n", 1307 mi->mi_rfsnames[which], svp->sv_hostname, 1308 status, clnt_sperrno(status)); 1309 if (nfs_has_ctty()) { 1310 if (!(mi->mi_flags & MI_NOPRINT)) { 1311 uprintf( 1312 "NFS %s failed for server %s: error %d (%s)\n", 1313 mi->mi_rfsnames[which], 1314 svp->sv_hostname, status, 1315 clnt_sperrno(status)); 1316 } 1317 } 1318 #endif 1319 /* 1320 * when CLNT_CALL() fails with RPC_AUTHERROR, 1321 * re_errno is set appropriately depending on 1322 * the authentication error 1323 */ 1324 if (status == RPC_VERSMISMATCH || 1325 status == RPC_PROGVERSMISMATCH) 1326 rpcerr.re_errno = EIO; 1327 } 1328 } else { 1329 /* 1330 * Test the value of mi_down and mi_printed without 1331 * holding the mi_lock mutex. If they are both zero, 1332 * then it is okay to skip the down and printed 1333 * processing. This saves on a mutex_enter and 1334 * mutex_exit pair for a normal, successful RPC. 
1335 * This was just complete overhead. 1336 */ 1337 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1338 mutex_enter(&mi->mi_lock); 1339 mi->mi_flags &= ~MI_DOWN; 1340 if (mi->mi_flags & MI_PRINTED) { 1341 mi->mi_flags &= ~MI_PRINTED; 1342 mutex_exit(&mi->mi_lock); 1343 #ifdef DEBUG 1344 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1345 zprintf(zoneid, "NFS%d server %s ok\n", 1346 mi->mi_vers, svp->sv_hostname); 1347 #else 1348 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1349 zprintf(zoneid, "NFS server %s ok\n", 1350 svp->sv_hostname); 1351 #endif 1352 } else 1353 mutex_exit(&mi->mi_lock); 1354 } 1355 1356 if (*douprintf == 0) { 1357 if (!(mi->mi_flags & MI_NOPRINT)) 1358 #ifdef DEBUG 1359 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1360 uprintf("NFS%d server %s ok\n", 1361 mi->mi_vers, svp->sv_hostname); 1362 #else 1363 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1364 uprintf("NFS server %s ok\n", svp->sv_hostname); 1365 #endif 1366 *douprintf = 1; 1367 } 1368 } 1369 1370 clfree_impl(client, ch, nfscl); 1371 if (cred_cloned) 1372 crfree(cr); 1373 1374 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1375 1376 if (rpc_status != NULL) 1377 *rpc_status = rpcerr.re_status; 1378 1379 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1380 rpcerr.re_errno); 1381 1382 return (rpcerr.re_errno); 1383 } 1384 1385 #ifdef DEBUG 1386 static int acl2call_hits = 0; 1387 static int acl2call_misses = 0; 1388 #endif 1389 1390 int 1391 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1392 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1393 enum nfsstat *statusp, int flags, failinfo_t *fi) 1394 { 1395 int rpcerror; 1396 1397 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1398 cr, douprintf, flags, fi); 1399 if (!rpcerror) { 1400 /* 1401 * See comments with crnetadjust(). 1402 */ 1403 if (*statusp == NFSERR_ACCES && 1404 (cr = crnetadjust(cr)) != NULL) { 1405 #ifdef DEBUG 1406 acl2call_hits++; 1407 #endif 1408 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1409 resp, cr, douprintf, flags, fi); 1410 crfree(cr); 1411 #ifdef DEBUG 1412 if (*statusp == NFSERR_ACCES) 1413 acl2call_misses++; 1414 #endif 1415 } 1416 } 1417 1418 return (rpcerror); 1419 } 1420 1421 #ifdef DEBUG 1422 static int acl3call_hits = 0; 1423 static int acl3call_misses = 0; 1424 #endif 1425 1426 int 1427 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1428 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1429 nfsstat3 *statusp, int flags, failinfo_t *fi) 1430 { 1431 int rpcerror; 1432 int user_informed; 1433 1434 user_informed = 0; 1435 1436 do { 1437 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1438 cr, douprintf, flags, fi); 1439 if (!rpcerror) { 1440 cred_t *crr; 1441 if (*statusp == NFS3ERR_JUKEBOX) { 1442 if (!user_informed) { 1443 user_informed = 1; 1444 uprintf( 1445 "file temporarily unavailable on the server, retrying...\n"); 1446 } 1447 delay(nfs3_jukebox_delay); 1448 } 1449 /* 1450 * See crnetadjust() for comments. 
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				acl3call_hits++;
#endif
				rpcerror = aclcall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					acl3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}

static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	bool_t cred_cloned = FALSE;
	enum clnt_stat status;
	struct rpc_err rpcerr;
	struct timeval wait;
	int timeo;		/* in units of hz */
#if 0 /* notyet */
	int my_rsize, my_wsize;
#endif
	bool_t tryagain;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
#ifdef DEBUG
	char *bufp;
#endif

#if 0 /* notyet */
	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);
#endif

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_aclreqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

#if 0 /* notyet */
	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;
#endif

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "aclcall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);

					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
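					 * IS_RECOVERABLE_ERROR() above treats
					 * everything except EINTR and EIO as
					 * recoverable, so ETIMEDOUT (or, e.g.,
					 * ECONNRESET) from the remap sends us
					 * back to failoverretry on hard mounts.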
1573 */ 1574 if ((mi->mi_flags & MI_HARD) && 1575 IS_RECOVERABLE_ERROR(remaperr)) { 1576 if (svp == mi->mi_curr_serv) 1577 failover_newserver(mi); 1578 rpcerr.re_status = RPC_SUCCESS; 1579 goto failoverretry; 1580 } 1581 return (remaperr); 1582 } 1583 } 1584 if (fi->fhp && fi->copyproc) 1585 (*fi->copyproc)(fi->fhp, fi->vp); 1586 } 1587 } 1588 1589 /* For TSOL, use a new cred which has net_mac_aware flag */ 1590 if (!cred_cloned && is_system_labeled()) { 1591 cred_cloned = TRUE; 1592 cr = crdup(icr); 1593 (void) setpflags(NET_MAC_AWARE, 1, cr); 1594 } 1595 1596 /* 1597 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1598 * are guaranteed to reprocess the retry as a new request. 1599 */ 1600 svp = mi->mi_curr_serv; 1601 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1602 if (FAILOVER_MOUNT(mi)) { 1603 mutex_enter(&mi->mi_lock); 1604 DEC_READERS(mi); 1605 mutex_exit(&mi->mi_lock); 1606 1607 if ((rpcerr.re_errno == ETIMEDOUT || 1608 rpcerr.re_errno == ECONNRESET) && 1609 failover_safe(fi)) { 1610 if (svp == mi->mi_curr_serv) 1611 failover_newserver(mi); 1612 goto failoverretry; 1613 } 1614 } 1615 if (rpcerr.re_errno != 0) { 1616 if (cred_cloned) 1617 crfree(cr); 1618 return (rpcerr.re_errno); 1619 } 1620 1621 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1622 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1623 timeo = (mi->mi_timeo * hz) / 10; 1624 } else { 1625 mutex_enter(&mi->mi_lock); 1626 timeo = CLNT_SETTIMERS(client, 1627 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1628 &(mi->mi_timers[NFS_CALLTYPES]), 1629 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1630 (void (*)()) 0, (caddr_t)mi, 0); 1631 mutex_exit(&mi->mi_lock); 1632 } 1633 1634 /* 1635 * If hard mounted fs, retry call forever unless hard error occurs. 1636 */ 1637 do { 1638 tryagain = FALSE; 1639 1640 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1641 status = RPC_FAILED; 1642 rpcerr.re_status = RPC_FAILED; 1643 rpcerr.re_errno = EIO; 1644 break; 1645 } 1646 1647 TICK_TO_TIMEVAL(timeo, &wait); 1648 1649 /* 1650 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1651 * and SIGTERM. (Preserving the existing masks). 1652 * Mask out SIGINT if mount option nointr is specified. 1653 */ 1654 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1655 if (!(mi->mi_flags & MI_INT)) 1656 client->cl_nosignal = TRUE; 1657 1658 /* 1659 * If there is a current signal, then don't bother 1660 * even trying to send out the request because we 1661 * won't be able to block waiting for the response. 1662 * Simply assume RPC_INTR and get on with it. 1663 */ 1664 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1665 status = RPC_INTR; 1666 else { 1667 status = CLNT_CALL(client, which, xdrargs, argsp, 1668 xdrres, resp, wait); 1669 } 1670 1671 if (!(mi->mi_flags & MI_INT)) 1672 client->cl_nosignal = FALSE; 1673 /* 1674 * restore original signal mask 1675 */ 1676 sigunintr(&smask); 1677 1678 switch (status) { 1679 case RPC_SUCCESS: 1680 #if 0 /* notyet */ 1681 if ((mi->mi_flags & MI_DYNAMIC) && 1682 mi->mi_timer_type[which] != 0 && 1683 (mi->mi_curread != my_rsize || 1684 mi->mi_curwrite != my_wsize)) 1685 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1686 #endif 1687 break; 1688 1689 /* 1690 * Unfortunately, there are servers in the world which 1691 * are not coded correctly. They are not prepared to 1692 * handle RPC requests to the NFS port which are not 1693 * NFS requests. Thus, they may try to process the 1694 * NFS_ACL request as if it were an NFS request. This 1695 * does not work. 
Generally, an error will be generated 1696 * on the client because it will not be able to decode 1697 * the response from the server. However, it seems 1698 * possible that the server may not be able to decode 1699 * the arguments. Thus, the criteria for deciding 1700 * whether the server supports NFS_ACL or not is whether 1701 * the following RPC errors are returned from CLNT_CALL. 1702 */ 1703 case RPC_CANTDECODERES: 1704 case RPC_PROGUNAVAIL: 1705 case RPC_CANTDECODEARGS: 1706 case RPC_PROGVERSMISMATCH: 1707 mutex_enter(&mi->mi_lock); 1708 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1709 mutex_exit(&mi->mi_lock); 1710 break; 1711 1712 /* 1713 * If the server supports NFS_ACL but not the new ops 1714 * for extended attributes, make sure we don't retry. 1715 */ 1716 case RPC_PROCUNAVAIL: 1717 mutex_enter(&mi->mi_lock); 1718 mi->mi_flags &= ~MI_EXTATTR; 1719 mutex_exit(&mi->mi_lock); 1720 break; 1721 1722 case RPC_INTR: 1723 /* 1724 * There is no way to recover from this error, 1725 * even if mount option nointr is specified. 1726 * SIGKILL, for example, cannot be blocked. 1727 */ 1728 rpcerr.re_status = RPC_INTR; 1729 rpcerr.re_errno = EINTR; 1730 break; 1731 1732 case RPC_UDERROR: 1733 /* 1734 * If the NFS server is local (vold) and 1735 * it goes away then we get RPC_UDERROR. 1736 * This is a retryable error, so we would 1737 * loop, so check to see if the specific 1738 * error was ECONNRESET, indicating that 1739 * target did not exist at all. If so, 1740 * return with RPC_PROGUNAVAIL and 1741 * ECONNRESET to indicate why. 1742 */ 1743 CLNT_GETERR(client, &rpcerr); 1744 if (rpcerr.re_errno == ECONNRESET) { 1745 rpcerr.re_status = RPC_PROGUNAVAIL; 1746 rpcerr.re_errno = ECONNRESET; 1747 break; 1748 } 1749 /*FALLTHROUGH*/ 1750 1751 default: /* probably RPC_TIMEDOUT */ 1752 if (IS_UNRECOVERABLE_RPC(status)) 1753 break; 1754 1755 /* 1756 * increment server not responding count 1757 */ 1758 mutex_enter(&mi->mi_lock); 1759 mi->mi_noresponse++; 1760 mutex_exit(&mi->mi_lock); 1761 #ifdef DEBUG 1762 nfscl->nfscl_stat.noresponse.value.ui64++; 1763 #endif 1764 1765 if (!(mi->mi_flags & MI_HARD)) { 1766 if (!(mi->mi_flags & MI_SEMISOFT) || 1767 (mi->mi_acl_ss_call_type[which] == 0)) 1768 break; 1769 } 1770 1771 /* 1772 * The call is in progress (over COTS). 1773 * Try the CLNT_CALL again, but don't 1774 * print a noisy error message. 1775 */ 1776 if (status == RPC_INPROGRESS) { 1777 tryagain = TRUE; 1778 break; 1779 } 1780 1781 if (flags & RFSCALL_SOFT) 1782 break; 1783 1784 /* 1785 * On zone shutdown, just move on. 1786 */ 1787 if (zone_status_get(curproc->p_zone) >= 1788 ZONE_IS_SHUTTING_DOWN) { 1789 rpcerr.re_status = RPC_FAILED; 1790 rpcerr.re_errno = EIO; 1791 break; 1792 } 1793 1794 /* 1795 * NFS client failover support 1796 * 1797 * If the current server just failed us, we'll 1798 * start the process of finding a new server. 1799 * After that, we can just retry. 
1800 */ 1801 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1802 if (svp == mi->mi_curr_serv) 1803 failover_newserver(mi); 1804 clfree_impl(client, ch, nfscl); 1805 goto failoverretry; 1806 } 1807 1808 tryagain = TRUE; 1809 timeo = backoff(timeo); 1810 mutex_enter(&mi->mi_lock); 1811 if (!(mi->mi_flags & MI_PRINTED)) { 1812 mi->mi_flags |= MI_PRINTED; 1813 mutex_exit(&mi->mi_lock); 1814 #ifdef DEBUG 1815 zprintf(zoneid, 1816 "NFS_ACL%d server %s not responding still trying\n", 1817 mi->mi_vers, svp->sv_hostname); 1818 #else 1819 zprintf(zoneid, 1820 "NFS server %s not responding still trying\n", 1821 svp->sv_hostname); 1822 #endif 1823 } else 1824 mutex_exit(&mi->mi_lock); 1825 if (*douprintf && nfs_has_ctty()) { 1826 *douprintf = 0; 1827 if (!(mi->mi_flags & MI_NOPRINT)) 1828 #ifdef DEBUG 1829 uprintf( 1830 "NFS_ACL%d server %s not responding still trying\n", 1831 mi->mi_vers, svp->sv_hostname); 1832 #else 1833 uprintf( 1834 "NFS server %s not responding still trying\n", 1835 svp->sv_hostname); 1836 #endif 1837 } 1838 1839 #if 0 /* notyet */ 1840 /* 1841 * If doing dynamic adjustment of transfer 1842 * size and if it's a read or write call 1843 * and if the transfer size changed while 1844 * retransmitting or if the feedback routine 1845 * changed the transfer size, 1846 * then exit rfscall so that the transfer 1847 * size can be adjusted at the vnops level. 1848 */ 1849 if ((mi->mi_flags & MI_DYNAMIC) && 1850 mi->mi_acl_timer_type[which] != 0 && 1851 (mi->mi_curread != my_rsize || 1852 mi->mi_curwrite != my_wsize || 1853 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1854 /* 1855 * On read or write calls, return 1856 * back to the vnode ops level if 1857 * the transfer size changed. 1858 */ 1859 clfree_impl(client, ch, nfscl); 1860 if (cred_cloned) 1861 crfree(cr); 1862 return (ENFS_TRYAGAIN); 1863 } 1864 #endif 1865 } 1866 } while (tryagain); 1867 1868 if (status != RPC_SUCCESS) { 1869 /* 1870 * Let soft mounts use the timed out message. 
1871 */ 1872 if (status == RPC_INPROGRESS) 1873 status = RPC_TIMEDOUT; 1874 nfscl->nfscl_stat.badcalls.value.ui64++; 1875 if (status == RPC_CANTDECODERES || 1876 status == RPC_PROGUNAVAIL || 1877 status == RPC_PROCUNAVAIL || 1878 status == RPC_CANTDECODEARGS || 1879 status == RPC_PROGVERSMISMATCH) 1880 CLNT_GETERR(client, &rpcerr); 1881 else if (status != RPC_INTR) { 1882 mutex_enter(&mi->mi_lock); 1883 mi->mi_flags |= MI_DOWN; 1884 mutex_exit(&mi->mi_lock); 1885 CLNT_GETERR(client, &rpcerr); 1886 #ifdef DEBUG 1887 bufp = clnt_sperror(client, svp->sv_hostname); 1888 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1889 mi->mi_vers, mi->mi_aclnames[which], bufp); 1890 if (nfs_has_ctty()) { 1891 if (!(mi->mi_flags & MI_NOPRINT)) { 1892 uprintf("NFS_ACL%d %s failed for %s\n", 1893 mi->mi_vers, mi->mi_aclnames[which], 1894 bufp); 1895 } 1896 } 1897 kmem_free(bufp, MAXPATHLEN); 1898 #else 1899 zprintf(zoneid, 1900 "NFS %s failed for server %s: error %d (%s)\n", 1901 mi->mi_aclnames[which], svp->sv_hostname, 1902 status, clnt_sperrno(status)); 1903 if (nfs_has_ctty()) { 1904 if (!(mi->mi_flags & MI_NOPRINT)) 1905 uprintf( 1906 "NFS %s failed for server %s: error %d (%s)\n", 1907 mi->mi_aclnames[which], 1908 svp->sv_hostname, status, 1909 clnt_sperrno(status)); 1910 } 1911 #endif 1912 /* 1913 * when CLNT_CALL() fails with RPC_AUTHERROR, 1914 * re_errno is set appropriately depending on 1915 * the authentication error 1916 */ 1917 if (status == RPC_VERSMISMATCH || 1918 status == RPC_PROGVERSMISMATCH) 1919 rpcerr.re_errno = EIO; 1920 } 1921 } else { 1922 /* 1923 * Test the value of mi_down and mi_printed without 1924 * holding the mi_lock mutex. If they are both zero, 1925 * then it is okay to skip the down and printed 1926 * processing. This saves on a mutex_enter and 1927 * mutex_exit pair for a normal, successful RPC. 1928 * This was just complete overhead. 
1929 */ 1930 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1931 mutex_enter(&mi->mi_lock); 1932 mi->mi_flags &= ~MI_DOWN; 1933 if (mi->mi_flags & MI_PRINTED) { 1934 mi->mi_flags &= ~MI_PRINTED; 1935 mutex_exit(&mi->mi_lock); 1936 #ifdef DEBUG 1937 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1938 mi->mi_vers, svp->sv_hostname); 1939 #else 1940 zprintf(zoneid, "NFS server %s ok\n", 1941 svp->sv_hostname); 1942 #endif 1943 } else 1944 mutex_exit(&mi->mi_lock); 1945 } 1946 1947 if (*douprintf == 0) { 1948 if (!(mi->mi_flags & MI_NOPRINT)) 1949 #ifdef DEBUG 1950 uprintf("NFS_ACL%d server %s ok\n", 1951 mi->mi_vers, svp->sv_hostname); 1952 #else 1953 uprintf("NFS server %s ok\n", svp->sv_hostname); 1954 #endif 1955 *douprintf = 1; 1956 } 1957 } 1958 1959 clfree_impl(client, ch, nfscl); 1960 if (cred_cloned) 1961 crfree(cr); 1962 1963 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1964 1965 #if 0 /* notyet */ 1966 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1967 rpcerr.re_errno); 1968 #endif 1969 1970 return (rpcerr.re_errno); 1971 } 1972 1973 int 1974 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1975 { 1976 uint_t mask = vap->va_mask; 1977 1978 if (!(mask & AT_MODE)) 1979 sa->sa_mode = (uint32_t)-1; 1980 else 1981 sa->sa_mode = vap->va_mode; 1982 if (!(mask & AT_UID)) 1983 sa->sa_uid = (uint32_t)-1; 1984 else 1985 sa->sa_uid = (uint32_t)vap->va_uid; 1986 if (!(mask & AT_GID)) 1987 sa->sa_gid = (uint32_t)-1; 1988 else 1989 sa->sa_gid = (uint32_t)vap->va_gid; 1990 if (!(mask & AT_SIZE)) 1991 sa->sa_size = (uint32_t)-1; 1992 else 1993 sa->sa_size = (uint32_t)vap->va_size; 1994 if (!(mask & AT_ATIME)) 1995 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 1996 else { 1997 /* check time validity */ 1998 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 1999 return (EOVERFLOW); 2000 } 2001 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2002 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2003 } 2004 if (!(mask & AT_MTIME)) 2005 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2006 else { 2007 /* check time validity */ 2008 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2009 return (EOVERFLOW); 2010 } 2011 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2012 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2013 } 2014 return (0); 2015 } 2016 2017 int 2018 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2019 { 2020 uint_t mask = vap->va_mask; 2021 2022 if (!(mask & AT_MODE)) 2023 sa->mode.set_it = FALSE; 2024 else { 2025 sa->mode.set_it = TRUE; 2026 sa->mode.mode = (mode3)vap->va_mode; 2027 } 2028 if (!(mask & AT_UID)) 2029 sa->uid.set_it = FALSE; 2030 else { 2031 sa->uid.set_it = TRUE; 2032 sa->uid.uid = (uid3)vap->va_uid; 2033 } 2034 if (!(mask & AT_GID)) 2035 sa->gid.set_it = FALSE; 2036 else { 2037 sa->gid.set_it = TRUE; 2038 sa->gid.gid = (gid3)vap->va_gid; 2039 } 2040 if (!(mask & AT_SIZE)) 2041 sa->size.set_it = FALSE; 2042 else { 2043 sa->size.set_it = TRUE; 2044 sa->size.size = (size3)vap->va_size; 2045 } 2046 if (!(mask & AT_ATIME)) 2047 sa->atime.set_it = DONT_CHANGE; 2048 else { 2049 /* check time validity */ 2050 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2051 return (EOVERFLOW); 2052 } 2053 sa->atime.set_it = SET_TO_CLIENT_TIME; 2054 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2055 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2056 } 2057 if (!(mask & AT_MTIME)) 2058 sa->mtime.set_it = DONT_CHANGE; 2059 else { 2060 /* check time validity */ 2061 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2062 return (EOVERFLOW); 2063 } 2064 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2065 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2066 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2067 } 2068 return (0); 2069 } 2070 2071 void 2072 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2073 { 2074 2075 da->da_fhandle = VTOFH(dvp); 2076 da->da_name = nm; 2077 da->da_flags = 0; 2078 } 2079 2080 void 2081 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2082 { 2083 2084 da->dirp = VTOFH3(dvp); 2085 da->name = nm; 2086 } 2087 2088 int 2089 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2090 { 2091 int error; 2092 rnode_t *rp; 2093 struct vattr va; 2094 2095 va.va_mask = AT_MODE | AT_GID; 2096 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2097 if (error) 2098 return (error); 2099 2100 /* 2101 * To determine the expected group-id of the created file: 2102 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2103 * GRPID option, and the directory's set-gid bit is clear, 2104 * then use the process's gid. 2105 * 2) Otherwise, set the group-id to the gid of the parent directory. 2106 */ 2107 rp = VTOR(dvp); 2108 mutex_enter(&rp->r_statelock); 2109 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2110 *gidp = crgetgid(cr); 2111 else 2112 *gidp = va.va_gid; 2113 mutex_exit(&rp->r_statelock); 2114 return (0); 2115 } 2116 2117 int 2118 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2119 { 2120 int error; 2121 struct vattr va; 2122 2123 va.va_mask = AT_MODE; 2124 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2125 if (error) 2126 return (error); 2127 2128 /* 2129 * Modify the expected mode (om) so that the set-gid bit matches 2130 * that of the parent directory (dvp). 2131 */ 2132 if (va.va_mode & VSGID) 2133 *omp |= VSGID; 2134 else 2135 *omp &= ~VSGID; 2136 return (0); 2137 } 2138 2139 void 2140 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2141 { 2142 2143 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2144 if (!(vp->v_flag & VSWAPLIKE)) { 2145 mutex_enter(&vp->v_lock); 2146 vp->v_flag |= VSWAPLIKE; 2147 mutex_exit(&vp->v_lock); 2148 } 2149 } else { 2150 if (vp->v_flag & VSWAPLIKE) { 2151 mutex_enter(&vp->v_lock); 2152 vp->v_flag &= ~VSWAPLIKE; 2153 mutex_exit(&vp->v_lock); 2154 } 2155 } 2156 } 2157 2158 /* 2159 * Free the resources associated with an rnode. 2160 */ 2161 static void 2162 rinactive(rnode_t *rp, cred_t *cr) 2163 { 2164 vnode_t *vp; 2165 cred_t *cred; 2166 char *contents; 2167 int size; 2168 vsecattr_t *vsp; 2169 int error; 2170 nfs3_pathconf_info *info; 2171 2172 /* 2173 * Before freeing anything, wait until all asynchronous 2174 * activity is done on this rnode. This will allow all 2175 * asynchronous read ahead and write behind i/o's to 2176 * finish. 2177 */ 2178 mutex_enter(&rp->r_statelock); 2179 while (rp->r_count > 0) 2180 cv_wait(&rp->r_cv, &rp->r_statelock); 2181 mutex_exit(&rp->r_statelock); 2182 2183 /* 2184 * Flush and invalidate all pages associated with the vnode. 
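/*
 * setdirgid() above encodes the System V versus Old-BSD group-inheritance
 * rule; the decision reduces to a few lines.  A sketch with plain POSIX
 * types (assumed, illustrative names -- not the kernel interfaces):
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

static gid_t
choose_new_file_gid(int mounted_with_grpid, mode_t parent_mode,
    gid_t parent_gid)
{
        /*
         * Default: use the caller's gid unless the parent directory has
         * the set-gid bit set or the filesystem was mounted with the
         * BSD-compatible "grpid" option, in which case inherit the
         * parent directory's gid.
         */
        if (!mounted_with_grpid && !(parent_mode & S_ISGID))
                return (getgid());
        return (parent_gid);
}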
2185 */ 2186 vp = RTOV(rp); 2187 if (vn_has_cached_data(vp)) { 2188 ASSERT(vp->v_type != VCHR); 2189 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2190 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2191 if (error && (error == ENOSPC || error == EDQUOT)) { 2192 mutex_enter(&rp->r_statelock); 2193 if (!rp->r_error) 2194 rp->r_error = error; 2195 mutex_exit(&rp->r_statelock); 2196 } 2197 } 2198 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2199 } 2200 2201 /* 2202 * Free any held credentials and caches which may be associated 2203 * with this rnode. 2204 */ 2205 mutex_enter(&rp->r_statelock); 2206 cred = rp->r_cred; 2207 rp->r_cred = NULL; 2208 contents = rp->r_symlink.contents; 2209 size = rp->r_symlink.size; 2210 rp->r_symlink.contents = NULL; 2211 vsp = rp->r_secattr; 2212 rp->r_secattr = NULL; 2213 info = rp->r_pathconf; 2214 rp->r_pathconf = NULL; 2215 mutex_exit(&rp->r_statelock); 2216 2217 /* 2218 * Free the held credential. 2219 */ 2220 if (cred != NULL) 2221 crfree(cred); 2222 2223 /* 2224 * Free the access cache entries. 2225 */ 2226 (void) nfs_access_purge_rp(rp); 2227 2228 /* 2229 * Free the readdir cache entries. 2230 */ 2231 if (HAVE_RDDIR_CACHE(rp)) 2232 nfs_purge_rddir_cache(vp); 2233 2234 /* 2235 * Free the symbolic link cache. 2236 */ 2237 if (contents != NULL) { 2238 2239 kmem_free((void *)contents, size); 2240 } 2241 2242 /* 2243 * Free any cached ACL. 2244 */ 2245 if (vsp != NULL) 2246 nfs_acl_free(vsp); 2247 2248 /* 2249 * Free any cached pathconf information. 2250 */ 2251 if (info != NULL) 2252 kmem_free(info, sizeof (*info)); 2253 } 2254 2255 /* 2256 * Return a vnode for the given NFS Version 2 file handle. 2257 * If no rnode exists for this fhandle, create one and put it 2258 * into the hash queues. If the rnode for this fhandle 2259 * already exists, return it. 2260 * 2261 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2262 */ 2263 vnode_t * 2264 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2265 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2266 { 2267 int newnode; 2268 int index; 2269 vnode_t *vp; 2270 nfs_fhandle nfh; 2271 vattr_t va; 2272 2273 nfh.fh_len = NFS_FHSIZE; 2274 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2275 2276 index = rtablehash(&nfh); 2277 rw_enter(&rtable[index].r_lock, RW_READER); 2278 2279 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2280 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2281 2282 if (attr != NULL) { 2283 if (!newnode) { 2284 rw_exit(&rtable[index].r_lock); 2285 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2286 } else { 2287 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2288 vp->v_type = VBAD; 2289 else 2290 vp->v_type = n2v_type(attr); 2291 /* 2292 * A translation here seems to be necessary 2293 * because this function can be called 2294 * with `attr' that has come from the wire, 2295 * and been operated on by vattr_to_nattr(). 2296 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2297 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2298 * ->makenfsnode(). 2299 */ 2300 if ((attr->na_rdev & 0xffff0000) == 0) 2301 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2302 else 2303 vp->v_rdev = expldev(n2v_rdev(attr)); 2304 nfs_attrcache(vp, attr, t); 2305 rw_exit(&rtable[index].r_lock); 2306 } 2307 } else { 2308 if (newnode) { 2309 PURGE_ATTRCACHE(vp); 2310 } 2311 rw_exit(&rtable[index].r_lock); 2312 } 2313 2314 return (vp); 2315 } 2316 2317 /* 2318 * Return a vnode for the given NFS Version 3 file handle. 
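/*
 * The r_count drain at the top of rinactive() above is the classic "wait
 * for outstanding asynchronous work, then tear down" idiom.  A
 * self-contained pthreads rendering (assumed names, user level only):
 */
#include <pthread.h>

struct node {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        int             count;  /* outstanding async operations */
};

static void
wait_for_async_done(struct node *np)
{
        pthread_mutex_lock(&np->lock);
        while (np->count > 0)
                pthread_cond_wait(&np->cv, &np->lock);
        pthread_mutex_unlock(&np->lock);
}

static void
async_done(struct node *np)
{
        pthread_mutex_lock(&np->lock);
        if (--np->count == 0)
                pthread_cond_broadcast(&np->cv);
        pthread_mutex_unlock(&np->lock);
}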
2319 * If no rnode exists for this fhandle, create one and put it 2320 * into the hash queues. If the rnode for this fhandle 2321 * already exists, return it. 2322 * 2323 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2324 */ 2325 vnode_t * 2326 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2327 cred_t *cr, char *dnm, char *nm) 2328 { 2329 int newnode; 2330 int index; 2331 vnode_t *vp; 2332 2333 index = rtablehash((nfs_fhandle *)fh); 2334 rw_enter(&rtable[index].r_lock, RW_READER); 2335 2336 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2337 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2338 dnm, nm); 2339 2340 if (vap == NULL) { 2341 if (newnode) { 2342 PURGE_ATTRCACHE(vp); 2343 } 2344 rw_exit(&rtable[index].r_lock); 2345 return (vp); 2346 } 2347 2348 if (!newnode) { 2349 rw_exit(&rtable[index].r_lock); 2350 nfs_attr_cache(vp, vap, t, cr); 2351 } else { 2352 rnode_t *rp = VTOR(vp); 2353 2354 vp->v_type = vap->va_type; 2355 vp->v_rdev = vap->va_rdev; 2356 2357 mutex_enter(&rp->r_statelock); 2358 if (rp->r_mtime <= t) 2359 nfs_attrcache_va(vp, vap); 2360 mutex_exit(&rp->r_statelock); 2361 rw_exit(&rtable[index].r_lock); 2362 } 2363 2364 return (vp); 2365 } 2366 2367 vnode_t * 2368 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2369 cred_t *cr, char *dnm, char *nm) 2370 { 2371 int newnode; 2372 int index; 2373 vnode_t *vp; 2374 vattr_t va; 2375 2376 index = rtablehash((nfs_fhandle *)fh); 2377 rw_enter(&rtable[index].r_lock, RW_READER); 2378 2379 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2380 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2381 dnm, nm); 2382 2383 if (attr == NULL) { 2384 if (newnode) { 2385 PURGE_ATTRCACHE(vp); 2386 } 2387 rw_exit(&rtable[index].r_lock); 2388 return (vp); 2389 } 2390 2391 if (!newnode) { 2392 rw_exit(&rtable[index].r_lock); 2393 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2394 } else { 2395 if (attr->type < NF3REG || attr->type > NF3FIFO) 2396 vp->v_type = VBAD; 2397 else 2398 vp->v_type = nf3_to_vt[attr->type]; 2399 vp->v_rdev = makedevice(attr->rdev.specdata1, 2400 attr->rdev.specdata2); 2401 nfs3_attrcache(vp, attr, t); 2402 rw_exit(&rtable[index].r_lock); 2403 } 2404 2405 return (vp); 2406 } 2407 2408 /* 2409 * Read this comment before making changes to rtablehash()! 2410 * This is a hash function in which seemingly obvious and harmless 2411 * changes can cause escalations costing million dollars! 2412 * Know what you are doing. 2413 * 2414 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2415 * algorithm is currently detailed here: 2416 * 2417 * http://burtleburtle.net/bob/hash/doobs.html 2418 * 2419 * Of course, the above link may not be valid by the time you are reading 2420 * this, but suffice it to say that the one-at-a-time algorithm works well in 2421 * almost all cases. If you are changing the algorithm be sure to verify that 2422 * the hash algorithm still provides even distribution in all cases and with 2423 * any server returning filehandles in whatever order (sequential or random). 
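/*
 * For reference, the same one-at-a-time algorithm in a stand-alone,
 * user-level form.  This is only a sketch; it assumes the table size is a
 * power of two so the mask can stand in for a modulus.
 */
#include <stddef.h>

static unsigned long
one_at_a_time(const unsigned char *key, size_t len, unsigned long mask)
{
        unsigned long hash = 0;
        size_t i;

        for (i = 0; i < len; i++) {
                hash += key[i];
                hash += (hash << 10);
                hash ^= (hash >> 6);
        }
        hash += (hash << 3);
        hash ^= (hash >> 11);
        hash += (hash << 15);
        return (hash & mask);
}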
2424 */ 2425 static int 2426 rtablehash(nfs_fhandle *fh) 2427 { 2428 ulong_t hash, len, i; 2429 char *key; 2430 2431 key = fh->fh_buf; 2432 len = (ulong_t)fh->fh_len; 2433 for (hash = 0, i = 0; i < len; i++) { 2434 hash += key[i]; 2435 hash += (hash << 10); 2436 hash ^= (hash >> 6); 2437 } 2438 hash += (hash << 3); 2439 hash ^= (hash >> 11); 2440 hash += (hash << 15); 2441 return (hash & rtablemask); 2442 } 2443 2444 static vnode_t * 2445 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2446 struct vnodeops *vops, 2447 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2448 int (*compar)(const void *, const void *), 2449 int *newnode, cred_t *cr, char *dnm, char *nm) 2450 { 2451 rnode_t *rp; 2452 rnode_t *trp; 2453 vnode_t *vp; 2454 mntinfo_t *mi; 2455 2456 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2457 2458 mi = VFTOMI(vfsp); 2459 start: 2460 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2461 vp = RTOV(rp); 2462 nfs_set_vroot(vp); 2463 *newnode = 0; 2464 return (vp); 2465 } 2466 rw_exit(&rhtp->r_lock); 2467 2468 mutex_enter(&rpfreelist_lock); 2469 if (rpfreelist != NULL && rnew >= nrnode) { 2470 rp = rpfreelist; 2471 rp_rmfree(rp); 2472 mutex_exit(&rpfreelist_lock); 2473 2474 vp = RTOV(rp); 2475 2476 if (rp->r_flags & RHASHED) { 2477 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2478 mutex_enter(&vp->v_lock); 2479 if (vp->v_count > 1) { 2480 vp->v_count--; 2481 mutex_exit(&vp->v_lock); 2482 rw_exit(&rp->r_hashq->r_lock); 2483 rw_enter(&rhtp->r_lock, RW_READER); 2484 goto start; 2485 } 2486 mutex_exit(&vp->v_lock); 2487 rp_rmhash_locked(rp); 2488 rw_exit(&rp->r_hashq->r_lock); 2489 } 2490 2491 rinactive(rp, cr); 2492 2493 mutex_enter(&vp->v_lock); 2494 if (vp->v_count > 1) { 2495 vp->v_count--; 2496 mutex_exit(&vp->v_lock); 2497 rw_enter(&rhtp->r_lock, RW_READER); 2498 goto start; 2499 } 2500 mutex_exit(&vp->v_lock); 2501 vn_invalid(vp); 2502 /* 2503 * destroy old locks before bzero'ing and 2504 * recreating the locks below. 2505 */ 2506 nfs_rw_destroy(&rp->r_rwlock); 2507 nfs_rw_destroy(&rp->r_lkserlock); 2508 mutex_destroy(&rp->r_statelock); 2509 cv_destroy(&rp->r_cv); 2510 cv_destroy(&rp->r_commit.c_cv); 2511 nfs_free_r_path(rp); 2512 avl_destroy(&rp->r_dir); 2513 /* 2514 * Make sure that if rnode is recycled then 2515 * VFS count is decremented properly before 2516 * reuse. 
2517 */ 2518 VFS_RELE(vp->v_vfsp); 2519 vn_reinit(vp); 2520 } else { 2521 vnode_t *new_vp; 2522 2523 mutex_exit(&rpfreelist_lock); 2524 2525 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2526 new_vp = vn_alloc(KM_SLEEP); 2527 2528 atomic_add_long((ulong_t *)&rnew, 1); 2529 #ifdef DEBUG 2530 clstat_debug.nrnode.value.ui64++; 2531 #endif 2532 vp = new_vp; 2533 } 2534 2535 bzero(rp, sizeof (*rp)); 2536 rp->r_vnode = vp; 2537 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2538 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2539 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2540 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2541 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2542 rp->r_fh.fh_len = fh->fh_len; 2543 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2544 rp->r_server = mi->mi_curr_serv; 2545 if (FAILOVER_MOUNT(mi)) { 2546 /* 2547 * If replicated servers, stash pathnames 2548 */ 2549 if (dnm != NULL && nm != NULL) { 2550 char *s, *p; 2551 uint_t len; 2552 2553 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2554 rp->r_path = kmem_alloc(len, KM_SLEEP); 2555 #ifdef DEBUG 2556 clstat_debug.rpath.value.ui64 += len; 2557 #endif 2558 s = rp->r_path; 2559 for (p = dnm; *p; p++) 2560 *s++ = *p; 2561 *s++ = '/'; 2562 for (p = nm; *p; p++) 2563 *s++ = *p; 2564 *s = '\0'; 2565 } else { 2566 /* special case for root */ 2567 rp->r_path = kmem_alloc(2, KM_SLEEP); 2568 #ifdef DEBUG 2569 clstat_debug.rpath.value.ui64 += 2; 2570 #endif 2571 *rp->r_path = '.'; 2572 *(rp->r_path + 1) = '\0'; 2573 } 2574 } 2575 VFS_HOLD(vfsp); 2576 rp->r_putapage = putapage; 2577 rp->r_hashq = rhtp; 2578 rp->r_flags = RREADDIRPLUS; 2579 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2580 offsetof(rddir_cache, tree)); 2581 vn_setops(vp, vops); 2582 vp->v_data = (caddr_t)rp; 2583 vp->v_vfsp = vfsp; 2584 vp->v_type = VNON; 2585 nfs_set_vroot(vp); 2586 2587 /* 2588 * There is a race condition if someone else 2589 * alloc's the rnode while no locks are held, so we 2590 * check again and recover if found. 2591 */ 2592 rw_enter(&rhtp->r_lock, RW_WRITER); 2593 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2594 vp = RTOV(trp); 2595 nfs_set_vroot(vp); 2596 *newnode = 0; 2597 rw_exit(&rhtp->r_lock); 2598 rp_addfree(rp, cr); 2599 rw_enter(&rhtp->r_lock, RW_READER); 2600 return (vp); 2601 } 2602 rp_addhash(rp); 2603 *newnode = 1; 2604 return (vp); 2605 } 2606 2607 static void 2608 nfs_set_vroot(vnode_t *vp) 2609 { 2610 rnode_t *rp; 2611 nfs_fhandle *rootfh; 2612 2613 rp = VTOR(vp); 2614 rootfh = &rp->r_server->sv_fhandle; 2615 if (rootfh->fh_len == rp->r_fh.fh_len && 2616 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2617 if (!(vp->v_flag & VROOT)) { 2618 mutex_enter(&vp->v_lock); 2619 vp->v_flag |= VROOT; 2620 mutex_exit(&vp->v_lock); 2621 } 2622 } 2623 } 2624 2625 static void 2626 nfs_free_r_path(rnode_t *rp) 2627 { 2628 char *path; 2629 size_t len; 2630 2631 path = rp->r_path; 2632 if (path) { 2633 rp->r_path = NULL; 2634 len = strlen(path) + 1; 2635 kmem_free(path, len); 2636 #ifdef DEBUG 2637 clstat_debug.rpath.value.ui64 -= len; 2638 #endif 2639 } 2640 } 2641 2642 /* 2643 * Put an rnode on the free list. 2644 * 2645 * Rnodes which were allocated above and beyond the normal limit 2646 * are immediately freed. 
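/*
 * The replica-pathname construction above (stashing "dnm/nm" in r_path)
 * is a length-plus-two allocation and copy.  A user-level sketch with
 * malloc standing in for kmem_alloc (illustrative names only):
 */
#include <stdlib.h>
#include <string.h>

static char *
join_path(const char *dnm, const char *nm)
{
        size_t len = strlen(dnm) + strlen(nm) + 2;      /* '/' and '\0' */
        char *path = malloc(len);

        if (path != NULL) {
                (void) strcpy(path, dnm);
                (void) strcat(path, "/");
                (void) strcat(path, nm);
        }
        return (path);
}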
2647 */ 2648 void 2649 rp_addfree(rnode_t *rp, cred_t *cr) 2650 { 2651 vnode_t *vp; 2652 struct vfs *vfsp; 2653 2654 vp = RTOV(rp); 2655 ASSERT(vp->v_count >= 1); 2656 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2657 2658 /* 2659 * If we have too many rnodes allocated and there are no 2660 * references to this rnode, or if the rnode is no longer 2661 * accessible by it does not reside in the hash queues, 2662 * or if an i/o error occurred while writing to the file, 2663 * then just free it instead of putting it on the rnode 2664 * freelist. 2665 */ 2666 vfsp = vp->v_vfsp; 2667 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || 2668 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 2669 if (rp->r_flags & RHASHED) { 2670 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2671 mutex_enter(&vp->v_lock); 2672 if (vp->v_count > 1) { 2673 vp->v_count--; 2674 mutex_exit(&vp->v_lock); 2675 rw_exit(&rp->r_hashq->r_lock); 2676 return; 2677 } 2678 mutex_exit(&vp->v_lock); 2679 rp_rmhash_locked(rp); 2680 rw_exit(&rp->r_hashq->r_lock); 2681 } 2682 2683 rinactive(rp, cr); 2684 2685 /* 2686 * Recheck the vnode reference count. We need to 2687 * make sure that another reference has not been 2688 * acquired while we were not holding v_lock. The 2689 * rnode is not in the rnode hash queues, so the 2690 * only way for a reference to have been acquired 2691 * is for a VOP_PUTPAGE because the rnode was marked 2692 * with RDIRTY or for a modified page. This 2693 * reference may have been acquired before our call 2694 * to rinactive. The i/o may have been completed, 2695 * thus allowing rinactive to complete, but the 2696 * reference to the vnode may not have been released 2697 * yet. In any case, the rnode can not be destroyed 2698 * until the other references to this vnode have been 2699 * released. The other references will take care of 2700 * either destroying the rnode or placing it on the 2701 * rnode freelist. If there are no other references, 2702 * then the rnode may be safely destroyed. 2703 */ 2704 mutex_enter(&vp->v_lock); 2705 if (vp->v_count > 1) { 2706 vp->v_count--; 2707 mutex_exit(&vp->v_lock); 2708 return; 2709 } 2710 mutex_exit(&vp->v_lock); 2711 2712 destroy_rnode(rp); 2713 return; 2714 } 2715 2716 /* 2717 * Lock the hash queue and then recheck the reference count 2718 * to ensure that no other threads have acquired a reference 2719 * to indicate that the rnode should not be placed on the 2720 * freelist. If another reference has been acquired, then 2721 * just release this one and let the other thread complete 2722 * the processing of adding this rnode to the freelist. 2723 */ 2724 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2725 2726 mutex_enter(&vp->v_lock); 2727 if (vp->v_count > 1) { 2728 vp->v_count--; 2729 mutex_exit(&vp->v_lock); 2730 rw_exit(&rp->r_hashq->r_lock); 2731 return; 2732 } 2733 mutex_exit(&vp->v_lock); 2734 2735 /* 2736 * If there is no cached data or metadata for this file, then 2737 * put the rnode on the front of the freelist so that it will 2738 * be reused before other rnodes which may have cached data or 2739 * metadata associated with them. 
2740 */ 2741 mutex_enter(&rpfreelist_lock); 2742 if (rpfreelist == NULL) { 2743 rp->r_freef = rp; 2744 rp->r_freeb = rp; 2745 rpfreelist = rp; 2746 } else { 2747 rp->r_freef = rpfreelist; 2748 rp->r_freeb = rpfreelist->r_freeb; 2749 rpfreelist->r_freeb->r_freef = rp; 2750 rpfreelist->r_freeb = rp; 2751 if (!vn_has_cached_data(vp) && 2752 !HAVE_RDDIR_CACHE(rp) && 2753 rp->r_symlink.contents == NULL && 2754 rp->r_secattr == NULL && 2755 rp->r_pathconf == NULL) 2756 rpfreelist = rp; 2757 } 2758 mutex_exit(&rpfreelist_lock); 2759 2760 rw_exit(&rp->r_hashq->r_lock); 2761 } 2762 2763 /* 2764 * Remove an rnode from the free list. 2765 * 2766 * The caller must be holding rpfreelist_lock and the rnode 2767 * must be on the freelist. 2768 */ 2769 static void 2770 rp_rmfree(rnode_t *rp) 2771 { 2772 2773 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2774 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2775 2776 if (rp == rpfreelist) { 2777 rpfreelist = rp->r_freef; 2778 if (rp == rpfreelist) 2779 rpfreelist = NULL; 2780 } 2781 2782 rp->r_freeb->r_freef = rp->r_freef; 2783 rp->r_freef->r_freeb = rp->r_freeb; 2784 2785 rp->r_freef = rp->r_freeb = NULL; 2786 } 2787 2788 /* 2789 * Put a rnode in the hash table. 2790 * 2791 * The caller must be holding the exclusive hash queue lock. 2792 */ 2793 static void 2794 rp_addhash(rnode_t *rp) 2795 { 2796 2797 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2798 ASSERT(!(rp->r_flags & RHASHED)); 2799 2800 rp->r_hashf = rp->r_hashq->r_hashf; 2801 rp->r_hashq->r_hashf = rp; 2802 rp->r_hashb = (rnode_t *)rp->r_hashq; 2803 rp->r_hashf->r_hashb = rp; 2804 2805 mutex_enter(&rp->r_statelock); 2806 rp->r_flags |= RHASHED; 2807 mutex_exit(&rp->r_statelock); 2808 } 2809 2810 /* 2811 * Remove a rnode from the hash table. 2812 * 2813 * The caller must be holding the hash queue lock. 2814 */ 2815 static void 2816 rp_rmhash_locked(rnode_t *rp) 2817 { 2818 2819 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2820 ASSERT(rp->r_flags & RHASHED); 2821 2822 rp->r_hashb->r_hashf = rp->r_hashf; 2823 rp->r_hashf->r_hashb = rp->r_hashb; 2824 2825 mutex_enter(&rp->r_statelock); 2826 rp->r_flags &= ~RHASHED; 2827 mutex_exit(&rp->r_statelock); 2828 } 2829 2830 /* 2831 * Remove a rnode from the hash table. 2832 * 2833 * The caller must not be holding the hash queue lock. 2834 */ 2835 void 2836 rp_rmhash(rnode_t *rp) 2837 { 2838 2839 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2840 rp_rmhash_locked(rp); 2841 rw_exit(&rp->r_hashq->r_lock); 2842 } 2843 2844 /* 2845 * Lookup a rnode by fhandle. 2846 * 2847 * The caller must be holding the hash queue lock, either shared or exclusive. 2848 */ 2849 static rnode_t * 2850 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2851 { 2852 rnode_t *rp; 2853 vnode_t *vp; 2854 2855 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2856 2857 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2858 vp = RTOV(rp); 2859 if (vp->v_vfsp == vfsp && 2860 rp->r_fh.fh_len == fh->fh_len && 2861 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2862 /* 2863 * remove rnode from free list, if necessary. 2864 */ 2865 if (rp->r_freef != NULL) { 2866 mutex_enter(&rpfreelist_lock); 2867 /* 2868 * If the rnode is on the freelist, 2869 * then remove it and use that reference 2870 * as the new reference. Otherwise, 2871 * need to increment the reference count. 
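/*
 * The freelist manipulation in rp_addfree() above can be modeled in a few
 * lines.  A stand-alone sketch (assumed types) of the circular, doubly
 * linked list, including the head promotion for nodes that cache nothing
 * so that they are recycled first:
 */
#include <stdbool.h>
#include <stddef.h>

struct fnode {
        struct fnode *freef;            /* forward link */
        struct fnode *freeb;            /* backward link */
        bool has_cached_data;
};

static struct fnode *freelist;

static void
freelist_add(struct fnode *np)
{
        if (freelist == NULL) {
                np->freef = np->freeb = np;
                freelist = np;
        } else {
                np->freef = freelist;
                np->freeb = freelist->freeb;
                freelist->freeb->freef = np;
                freelist->freeb = np;
                if (!np->has_cached_data)
                        freelist = np;  /* reuse this one first */
        }
}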
2872 */ 2873 if (rp->r_freef != NULL) { 2874 rp_rmfree(rp); 2875 mutex_exit(&rpfreelist_lock); 2876 } else { 2877 mutex_exit(&rpfreelist_lock); 2878 VN_HOLD(vp); 2879 } 2880 } else 2881 VN_HOLD(vp); 2882 return (rp); 2883 } 2884 } 2885 return (NULL); 2886 } 2887 2888 /* 2889 * Return 1 if there is a active vnode belonging to this vfs in the 2890 * rtable cache. 2891 * 2892 * Several of these checks are done without holding the usual 2893 * locks. This is safe because destroy_rtable(), rp_addfree(), 2894 * etc. will redo the necessary checks before actually destroying 2895 * any rnodes. 2896 */ 2897 int 2898 check_rtable(struct vfs *vfsp) 2899 { 2900 int index; 2901 rnode_t *rp; 2902 vnode_t *vp; 2903 2904 for (index = 0; index < rtablesize; index++) { 2905 rw_enter(&rtable[index].r_lock, RW_READER); 2906 for (rp = rtable[index].r_hashf; 2907 rp != (rnode_t *)(&rtable[index]); 2908 rp = rp->r_hashf) { 2909 vp = RTOV(rp); 2910 if (vp->v_vfsp == vfsp) { 2911 if (rp->r_freef == NULL || 2912 (vn_has_cached_data(vp) && 2913 (rp->r_flags & RDIRTY)) || 2914 rp->r_count > 0) { 2915 rw_exit(&rtable[index].r_lock); 2916 return (1); 2917 } 2918 } 2919 } 2920 rw_exit(&rtable[index].r_lock); 2921 } 2922 return (0); 2923 } 2924 2925 /* 2926 * Destroy inactive vnodes from the hash queues which belong to this 2927 * vfs. It is essential that we destroy all inactive vnodes during a 2928 * forced unmount as well as during a normal unmount. 2929 */ 2930 void 2931 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2932 { 2933 int index; 2934 rnode_t *rp; 2935 rnode_t *rlist; 2936 rnode_t *r_hashf; 2937 vnode_t *vp; 2938 2939 rlist = NULL; 2940 2941 for (index = 0; index < rtablesize; index++) { 2942 rw_enter(&rtable[index].r_lock, RW_WRITER); 2943 for (rp = rtable[index].r_hashf; 2944 rp != (rnode_t *)(&rtable[index]); 2945 rp = r_hashf) { 2946 /* save the hash pointer before destroying */ 2947 r_hashf = rp->r_hashf; 2948 vp = RTOV(rp); 2949 if (vp->v_vfsp == vfsp) { 2950 mutex_enter(&rpfreelist_lock); 2951 if (rp->r_freef != NULL) { 2952 rp_rmfree(rp); 2953 mutex_exit(&rpfreelist_lock); 2954 rp_rmhash_locked(rp); 2955 rp->r_hashf = rlist; 2956 rlist = rp; 2957 } else 2958 mutex_exit(&rpfreelist_lock); 2959 } 2960 } 2961 rw_exit(&rtable[index].r_lock); 2962 } 2963 2964 for (rp = rlist; rp != NULL; rp = rlist) { 2965 rlist = rp->r_hashf; 2966 /* 2967 * This call to rp_addfree will end up destroying the 2968 * rnode, but in a safe way with the appropriate set 2969 * of checks done. 2970 */ 2971 rp_addfree(rp, cr); 2972 } 2973 2974 } 2975 2976 /* 2977 * This routine destroys all the resources associated with the rnode 2978 * and then the rnode itself. 
2979 */ 2980 static void 2981 destroy_rnode(rnode_t *rp) 2982 { 2983 vnode_t *vp; 2984 vfs_t *vfsp; 2985 2986 vp = RTOV(rp); 2987 vfsp = vp->v_vfsp; 2988 2989 ASSERT(vp->v_count == 1); 2990 ASSERT(rp->r_count == 0); 2991 ASSERT(rp->r_lmpl == NULL); 2992 ASSERT(rp->r_mapcnt == 0); 2993 ASSERT(!(rp->r_flags & RHASHED)); 2994 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2995 atomic_add_long((ulong_t *)&rnew, -1); 2996 #ifdef DEBUG 2997 clstat_debug.nrnode.value.ui64--; 2998 #endif 2999 nfs_rw_destroy(&rp->r_rwlock); 3000 nfs_rw_destroy(&rp->r_lkserlock); 3001 mutex_destroy(&rp->r_statelock); 3002 cv_destroy(&rp->r_cv); 3003 cv_destroy(&rp->r_commit.c_cv); 3004 if (rp->r_flags & RDELMAPLIST) 3005 list_destroy(&rp->r_indelmap); 3006 nfs_free_r_path(rp); 3007 avl_destroy(&rp->r_dir); 3008 vn_invalid(vp); 3009 vn_free(vp); 3010 kmem_cache_free(rnode_cache, rp); 3011 VFS_RELE(vfsp); 3012 } 3013 3014 /* 3015 * Flush all vnodes in this (or every) vfs. 3016 * Used by nfs_sync and by nfs_unmount. 3017 */ 3018 void 3019 rflush(struct vfs *vfsp, cred_t *cr) 3020 { 3021 int index; 3022 rnode_t *rp; 3023 vnode_t *vp, **vplist; 3024 long num, cnt; 3025 3026 /* 3027 * Check to see whether there is anything to do. 3028 */ 3029 num = rnew; 3030 if (num == 0) 3031 return; 3032 3033 /* 3034 * Allocate a slot for all currently active rnodes on the 3035 * supposition that they all may need flushing. 3036 */ 3037 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3038 cnt = 0; 3039 3040 /* 3041 * Walk the hash queues looking for rnodes with page 3042 * lists associated with them. Make a list of these 3043 * files. 3044 */ 3045 for (index = 0; index < rtablesize; index++) { 3046 rw_enter(&rtable[index].r_lock, RW_READER); 3047 for (rp = rtable[index].r_hashf; 3048 rp != (rnode_t *)(&rtable[index]); 3049 rp = rp->r_hashf) { 3050 vp = RTOV(rp); 3051 /* 3052 * Don't bother sync'ing a vp if it 3053 * is part of virtual swap device or 3054 * if VFS is read-only 3055 */ 3056 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3057 continue; 3058 /* 3059 * If flushing all mounted file systems or 3060 * the vnode belongs to this vfs, has pages 3061 * and is marked as either dirty or mmap'd, 3062 * hold and add this vnode to the list of 3063 * vnodes to flush. 3064 */ 3065 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3066 vn_has_cached_data(vp) && 3067 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3068 VN_HOLD(vp); 3069 vplist[cnt++] = vp; 3070 if (cnt == num) { 3071 rw_exit(&rtable[index].r_lock); 3072 goto toomany; 3073 } 3074 } 3075 } 3076 rw_exit(&rtable[index].r_lock); 3077 } 3078 toomany: 3079 3080 /* 3081 * Flush and release all of the files on the list. 3082 */ 3083 while (cnt-- > 0) { 3084 vp = vplist[cnt]; 3085 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3086 VN_RELE(vp); 3087 } 3088 3089 /* 3090 * Free the space allocated to hold the list. 3091 */ 3092 kmem_free(vplist, num * sizeof (*vplist)); 3093 } 3094 3095 /* 3096 * This probably needs to be larger than or equal to 3097 * log2(sizeof (struct rnode)) due to the way that rnodes are 3098 * allocated. 
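/*
 * The sizing rule in the comment above can be checked mechanically: the
 * shift must discard at least log2 of the object size, since cache
 * allocated objects are spaced at least that far apart and their
 * low-order address bits carry no information.  A small user-level
 * illustration (the structure size below is a stand-in, not the real
 * struct rnode):
 */
#include <stdio.h>

struct obj { char pad[520]; };          /* stand-in for struct rnode */

int
main(void)
{
        unsigned int shift = 0;
        size_t sz = sizeof (struct obj);

        while (((size_t)1 << shift) < sz)
                shift++;
        (void) printf("object size %zu -> need shift >= %u\n", sz, shift);
        return (0);
}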
3099 */ 3100 #define ACACHE_SHIFT_BITS 9 3101 3102 static int 3103 acachehash(rnode_t *rp, cred_t *cr) 3104 { 3105 3106 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3107 acachemask); 3108 } 3109 3110 #ifdef DEBUG 3111 static long nfs_access_cache_hits = 0; 3112 static long nfs_access_cache_misses = 0; 3113 #endif 3114 3115 nfs_access_type_t 3116 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3117 { 3118 vnode_t *vp; 3119 acache_t *ap; 3120 acache_hash_t *hp; 3121 nfs_access_type_t all; 3122 3123 vp = RTOV(rp); 3124 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3125 return (NFS_ACCESS_UNKNOWN); 3126 3127 if (rp->r_acache != NULL) { 3128 hp = &acache[acachehash(rp, cr)]; 3129 rw_enter(&hp->lock, RW_READER); 3130 ap = hp->next; 3131 while (ap != (acache_t *)hp) { 3132 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3133 if ((ap->known & acc) == acc) { 3134 #ifdef DEBUG 3135 nfs_access_cache_hits++; 3136 #endif 3137 if ((ap->allowed & acc) == acc) 3138 all = NFS_ACCESS_ALLOWED; 3139 else 3140 all = NFS_ACCESS_DENIED; 3141 } else { 3142 #ifdef DEBUG 3143 nfs_access_cache_misses++; 3144 #endif 3145 all = NFS_ACCESS_UNKNOWN; 3146 } 3147 rw_exit(&hp->lock); 3148 return (all); 3149 } 3150 ap = ap->next; 3151 } 3152 rw_exit(&hp->lock); 3153 } 3154 3155 #ifdef DEBUG 3156 nfs_access_cache_misses++; 3157 #endif 3158 return (NFS_ACCESS_UNKNOWN); 3159 } 3160 3161 void 3162 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3163 { 3164 acache_t *ap; 3165 acache_t *nap; 3166 acache_hash_t *hp; 3167 3168 hp = &acache[acachehash(rp, cr)]; 3169 3170 /* 3171 * Allocate now assuming that mostly an allocation will be 3172 * required. This allows the allocation to happen without 3173 * holding the hash bucket locked. 3174 */ 3175 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3176 if (nap != NULL) { 3177 nap->known = acc; 3178 nap->allowed = resacc; 3179 nap->rnode = rp; 3180 crhold(cr); 3181 nap->cred = cr; 3182 nap->hashq = hp; 3183 } 3184 3185 rw_enter(&hp->lock, RW_WRITER); 3186 3187 if (rp->r_acache != NULL) { 3188 ap = hp->next; 3189 while (ap != (acache_t *)hp) { 3190 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3191 ap->known |= acc; 3192 ap->allowed &= ~acc; 3193 ap->allowed |= resacc; 3194 rw_exit(&hp->lock); 3195 if (nap != NULL) { 3196 crfree(nap->cred); 3197 kmem_cache_free(acache_cache, nap); 3198 } 3199 return; 3200 } 3201 ap = ap->next; 3202 } 3203 } 3204 3205 if (nap != NULL) { 3206 #ifdef DEBUG 3207 clstat_debug.access.value.ui64++; 3208 #endif 3209 nap->next = hp->next; 3210 hp->next = nap; 3211 nap->next->prev = nap; 3212 nap->prev = (acache_t *)hp; 3213 3214 mutex_enter(&rp->r_statelock); 3215 nap->list = rp->r_acache; 3216 rp->r_acache = nap; 3217 mutex_exit(&rp->r_statelock); 3218 } 3219 3220 rw_exit(&hp->lock); 3221 } 3222 3223 int 3224 nfs_access_purge_rp(rnode_t *rp) 3225 { 3226 acache_t *ap; 3227 acache_t *tmpap; 3228 acache_t *rplist; 3229 3230 /* 3231 * If there aren't any cached entries, then there is nothing 3232 * to free. 3233 */ 3234 if (rp->r_acache == NULL) 3235 return (0); 3236 3237 mutex_enter(&rp->r_statelock); 3238 rplist = rp->r_acache; 3239 rp->r_acache = NULL; 3240 mutex_exit(&rp->r_statelock); 3241 3242 /* 3243 * Loop through each entry in the list pointed to in the 3244 * rnode. Remove each of these entries from the hash 3245 * queue that it is on and remove it from the list in 3246 * the rnode. 
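/*
 * nfs_access_cache() above allocates the new entry before taking the hash
 * bucket lock, on the assumption that an insert is the common case, and
 * throws the allocation away if an existing entry turns up.  The same
 * idiom in a self-contained pthreads sketch (assumed names):
 */
#include <pthread.h>
#include <stdlib.h>

struct entry { int key; int val; struct entry *next; };
static struct entry *head;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static void
cache_insert(int key, int val)
{
        struct entry *nep = malloc(sizeof (*nep));      /* optimistic */
        struct entry *ep;

        pthread_mutex_lock(&table_lock);
        for (ep = head; ep != NULL; ep = ep->next) {
                if (ep->key == key) {
                        ep->val = val;                  /* update in place */
                        pthread_mutex_unlock(&table_lock);
                        free(nep);                      /* not needed */
                        return;
                }
        }
        if (nep != NULL) {
                nep->key = key;
                nep->val = val;
                nep->next = head;
                head = nep;
        }
        pthread_mutex_unlock(&table_lock);
}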
3247 */ 3248 for (ap = rplist; ap != NULL; ap = tmpap) { 3249 rw_enter(&ap->hashq->lock, RW_WRITER); 3250 ap->prev->next = ap->next; 3251 ap->next->prev = ap->prev; 3252 rw_exit(&ap->hashq->lock); 3253 3254 tmpap = ap->list; 3255 crfree(ap->cred); 3256 kmem_cache_free(acache_cache, ap); 3257 #ifdef DEBUG 3258 clstat_debug.access.value.ui64--; 3259 #endif 3260 } 3261 3262 return (1); 3263 } 3264 3265 static const char prefix[] = ".nfs"; 3266 3267 static kmutex_t newnum_lock; 3268 3269 int 3270 newnum(void) 3271 { 3272 static uint_t newnum = 0; 3273 uint_t id; 3274 3275 mutex_enter(&newnum_lock); 3276 if (newnum == 0) 3277 newnum = gethrestime_sec() & 0xffff; 3278 id = newnum++; 3279 mutex_exit(&newnum_lock); 3280 return (id); 3281 } 3282 3283 char * 3284 newname(void) 3285 { 3286 char *news; 3287 char *s; 3288 const char *p; 3289 uint_t id; 3290 3291 id = newnum(); 3292 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3293 s = news; 3294 p = prefix; 3295 while (*p != '\0') 3296 *s++ = *p++; 3297 while (id != 0) { 3298 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3299 id >>= 4; 3300 } 3301 *s = '\0'; 3302 return (news); 3303 } 3304 3305 int 3306 nfs_atoi(char *cp) 3307 { 3308 int n; 3309 3310 n = 0; 3311 while (*cp != '\0') { 3312 n = n * 10 + (*cp - '0'); 3313 cp++; 3314 } 3315 3316 return (n); 3317 } 3318 3319 /* 3320 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3321 * framework. 3322 */ 3323 static int 3324 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3325 { 3326 ksp->ks_snaptime = gethrtime(); 3327 if (rw == KSTAT_WRITE) { 3328 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3329 #ifdef DEBUG 3330 /* 3331 * Currently only the global zone can write to kstats, but we 3332 * add the check just for paranoia. 3333 */ 3334 if (INGLOBALZONE(curproc)) 3335 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3336 sizeof (clstat_debug)); 3337 #endif 3338 } else { 3339 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3340 #ifdef DEBUG 3341 /* 3342 * If we're displaying the "global" debug kstat values, we 3343 * display them as-is to all zones since in fact they apply to 3344 * the system as a whole. 
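/*
 * newname() above builds the familiar ".nfsXXXX" silly-rename names by
 * appending the id's hex digits low nibble first.  A user-level rendering
 * (assumed names, malloc in place of kmem_alloc):
 */
#include <stdlib.h>
#include <string.h>

static char *
silly_name(unsigned int id)
{
        static const char prefix[] = ".nfs";
        char *name = malloc(sizeof (prefix) + 2 * sizeof (id));
        char *s;

        if (name == NULL)
                return (NULL);
        (void) strcpy(name, prefix);
        s = name + strlen(prefix);
        while (id != 0) {
                *s++ = "0123456789ABCDEF"[id & 0x0f];   /* low nibble first */
                id >>= 4;
        }
        *s = '\0';
        return (name);
}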
3345 */ 3346 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3347 sizeof (clstat_debug)); 3348 #endif 3349 } 3350 return (0); 3351 } 3352 3353 static void * 3354 clinit_zone(zoneid_t zoneid) 3355 { 3356 kstat_t *nfs_client_kstat; 3357 struct nfs_clnt *nfscl; 3358 uint_t ndata; 3359 3360 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3361 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3362 nfscl->nfscl_chtable = NULL; 3363 nfscl->nfscl_zoneid = zoneid; 3364 3365 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3366 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3367 #ifdef DEBUG 3368 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3369 #endif 3370 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3371 "misc", KSTAT_TYPE_NAMED, ndata, 3372 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3373 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3374 nfs_client_kstat->ks_snapshot = cl_snapshot; 3375 kstat_install(nfs_client_kstat); 3376 } 3377 mutex_enter(&nfs_clnt_list_lock); 3378 list_insert_head(&nfs_clnt_list, nfscl); 3379 mutex_exit(&nfs_clnt_list_lock); 3380 return (nfscl); 3381 } 3382 3383 /*ARGSUSED*/ 3384 static void 3385 clfini_zone(zoneid_t zoneid, void *arg) 3386 { 3387 struct nfs_clnt *nfscl = arg; 3388 chhead_t *chp, *next; 3389 3390 if (nfscl == NULL) 3391 return; 3392 mutex_enter(&nfs_clnt_list_lock); 3393 list_remove(&nfs_clnt_list, nfscl); 3394 mutex_exit(&nfs_clnt_list_lock); 3395 clreclaim_zone(nfscl, 0); 3396 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3397 ASSERT(chp->ch_list == NULL); 3398 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3399 next = chp->ch_next; 3400 kmem_free(chp, sizeof (*chp)); 3401 } 3402 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3403 mutex_destroy(&nfscl->nfscl_chtable_lock); 3404 kmem_free(nfscl, sizeof (*nfscl)); 3405 } 3406 3407 /* 3408 * Called by endpnt_destructor to make sure the client handles are 3409 * cleaned up before the RPC endpoints. This becomes a no-op if 3410 * clfini_zone (above) is called first. This function is needed 3411 * (rather than relying on clfini_zone to clean up) because the ZSD 3412 * callbacks have no ordering mechanism, so we have no way to ensure 3413 * that clfini_zone is called before endpnt_destructor. 
3414 */ 3415 void 3416 clcleanup_zone(zoneid_t zoneid) 3417 { 3418 struct nfs_clnt *nfscl; 3419 3420 mutex_enter(&nfs_clnt_list_lock); 3421 nfscl = list_head(&nfs_clnt_list); 3422 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3423 if (nfscl->nfscl_zoneid == zoneid) { 3424 clreclaim_zone(nfscl, 0); 3425 break; 3426 } 3427 } 3428 mutex_exit(&nfs_clnt_list_lock); 3429 } 3430 3431 int 3432 nfs_subrinit(void) 3433 { 3434 int i; 3435 ulong_t nrnode_max; 3436 3437 /* 3438 * Allocate and initialize the rnode hash queues 3439 */ 3440 if (nrnode <= 0) 3441 nrnode = ncsize; 3442 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3443 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3444 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3445 "setting nrnode to max value of %ld", nrnode_max); 3446 nrnode = nrnode_max; 3447 } 3448 3449 rtablesize = 1 << highbit(nrnode / hashlen); 3450 rtablemask = rtablesize - 1; 3451 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3452 for (i = 0; i < rtablesize; i++) { 3453 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3454 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3455 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3456 } 3457 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3458 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3459 3460 /* 3461 * Allocate and initialize the access cache 3462 */ 3463 3464 /* 3465 * Initial guess is one access cache entry per rnode unless 3466 * nacache is set to a non-zero value and then it is used to 3467 * indicate a guess at the number of access cache entries. 3468 */ 3469 if (nacache > 0) 3470 acachesize = 1 << highbit(nacache / hashlen); 3471 else 3472 acachesize = rtablesize; 3473 acachemask = acachesize - 1; 3474 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3475 for (i = 0; i < acachesize; i++) { 3476 acache[i].next = (acache_t *)&acache[i]; 3477 acache[i].prev = (acache_t *)&acache[i]; 3478 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3479 } 3480 acache_cache = kmem_cache_create("nfs_access_cache", 3481 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3482 /* 3483 * Allocate and initialize the client handle cache 3484 */ 3485 chtab_cache = kmem_cache_create("client_handle_cache", 3486 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3487 /* 3488 * Initialize the list of per-zone client handles (and associated data). 3489 * This needs to be done before we call zone_key_create(). 3490 */ 3491 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3492 offsetof(struct nfs_clnt, nfscl_node)); 3493 /* 3494 * Initialize the zone_key for per-zone client handle lists. 
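/*
 * The rnode hash table setup in nfs_subrinit() above -- a power-of-two
 * bucket count, a mask used in place of a modulus, and empty buckets
 * whose links point back at the bucket header itself -- looks like this
 * in a stand-alone, user-level form (assumed names; only a sketch):
 */
#include <stdlib.h>

struct bucket { struct bucket *hashf, *hashb; };

static struct bucket *
make_table(unsigned long nentries, unsigned long per_bucket,
    unsigned long *maskp)
{
        unsigned long want = nentries / per_bucket, size = 1, i;
        struct bucket *tbl;

        while (size < want)             /* round up to a power of two */
                size <<= 1;
        tbl = malloc(size * sizeof (*tbl));
        if (tbl == NULL)
                return (NULL);
        for (i = 0; i < size; i++)      /* empty bucket points at itself */
                tbl[i].hashf = tbl[i].hashb = &tbl[i];
        *maskp = size - 1;
        return (tbl);
}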
3495 */ 3496 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3497 /* 3498 * Initialize the various mutexes and reader/writer locks 3499 */ 3500 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3501 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3502 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3503 3504 /* 3505 * Assign unique major number for all nfs mounts 3506 */ 3507 if ((nfs_major = getudev()) == -1) { 3508 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3509 "nfs: init: can't get unique device number"); 3510 nfs_major = 0; 3511 } 3512 nfs_minor = 0; 3513 3514 if (nfs3_jukebox_delay == 0) 3515 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3516 3517 return (0); 3518 } 3519 3520 void 3521 nfs_subrfini(void) 3522 { 3523 int i; 3524 3525 /* 3526 * Deallocate the rnode hash queues 3527 */ 3528 kmem_cache_destroy(rnode_cache); 3529 3530 for (i = 0; i < rtablesize; i++) 3531 rw_destroy(&rtable[i].r_lock); 3532 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3533 3534 /* 3535 * Deallocated the access cache 3536 */ 3537 kmem_cache_destroy(acache_cache); 3538 3539 for (i = 0; i < acachesize; i++) 3540 rw_destroy(&acache[i].lock); 3541 kmem_free(acache, acachesize * sizeof (*acache)); 3542 3543 /* 3544 * Deallocate the client handle cache 3545 */ 3546 kmem_cache_destroy(chtab_cache); 3547 3548 /* 3549 * Destroy the various mutexes and reader/writer locks 3550 */ 3551 mutex_destroy(&rpfreelist_lock); 3552 mutex_destroy(&newnum_lock); 3553 mutex_destroy(&nfs_minor_lock); 3554 (void) zone_key_delete(nfsclnt_zone_key); 3555 } 3556 3557 enum nfsstat 3558 puterrno(int error) 3559 { 3560 3561 switch (error) { 3562 case EOPNOTSUPP: 3563 return (NFSERR_OPNOTSUPP); 3564 case ENAMETOOLONG: 3565 return (NFSERR_NAMETOOLONG); 3566 case ENOTEMPTY: 3567 return (NFSERR_NOTEMPTY); 3568 case EDQUOT: 3569 return (NFSERR_DQUOT); 3570 case ESTALE: 3571 return (NFSERR_STALE); 3572 case EREMOTE: 3573 return (NFSERR_REMOTE); 3574 case ENOSYS: 3575 return (NFSERR_OPNOTSUPP); 3576 case EOVERFLOW: 3577 return (NFSERR_INVAL); 3578 default: 3579 return ((enum nfsstat)error); 3580 } 3581 /* NOTREACHED */ 3582 } 3583 3584 int 3585 geterrno(enum nfsstat status) 3586 { 3587 3588 switch (status) { 3589 case NFSERR_OPNOTSUPP: 3590 return (EOPNOTSUPP); 3591 case NFSERR_NAMETOOLONG: 3592 return (ENAMETOOLONG); 3593 case NFSERR_NOTEMPTY: 3594 return (ENOTEMPTY); 3595 case NFSERR_DQUOT: 3596 return (EDQUOT); 3597 case NFSERR_STALE: 3598 return (ESTALE); 3599 case NFSERR_REMOTE: 3600 return (EREMOTE); 3601 case NFSERR_WFLUSH: 3602 return (EIO); 3603 default: 3604 return ((int)status); 3605 } 3606 /* NOTREACHED */ 3607 } 3608 3609 enum nfsstat3 3610 puterrno3(int error) 3611 { 3612 3613 #ifdef DEBUG 3614 switch (error) { 3615 case 0: 3616 return (NFS3_OK); 3617 case EPERM: 3618 return (NFS3ERR_PERM); 3619 case ENOENT: 3620 return (NFS3ERR_NOENT); 3621 case EIO: 3622 return (NFS3ERR_IO); 3623 case ENXIO: 3624 return (NFS3ERR_NXIO); 3625 case EACCES: 3626 return (NFS3ERR_ACCES); 3627 case EEXIST: 3628 return (NFS3ERR_EXIST); 3629 case EXDEV: 3630 return (NFS3ERR_XDEV); 3631 case ENODEV: 3632 return (NFS3ERR_NODEV); 3633 case ENOTDIR: 3634 return (NFS3ERR_NOTDIR); 3635 case EISDIR: 3636 return (NFS3ERR_ISDIR); 3637 case EINVAL: 3638 return (NFS3ERR_INVAL); 3639 case EFBIG: 3640 return (NFS3ERR_FBIG); 3641 case ENOSPC: 3642 return (NFS3ERR_NOSPC); 3643 case EROFS: 3644 return (NFS3ERR_ROFS); 3645 case EMLINK: 3646 return (NFS3ERR_MLINK); 3647 case ENAMETOOLONG: 3648 return (NFS3ERR_NAMETOOLONG); 3649 case 
ENOTEMPTY: 3650 return (NFS3ERR_NOTEMPTY); 3651 case EDQUOT: 3652 return (NFS3ERR_DQUOT); 3653 case ESTALE: 3654 return (NFS3ERR_STALE); 3655 case EREMOTE: 3656 return (NFS3ERR_REMOTE); 3657 case ENOSYS: 3658 case EOPNOTSUPP: 3659 return (NFS3ERR_NOTSUPP); 3660 case EOVERFLOW: 3661 return (NFS3ERR_INVAL); 3662 default: 3663 zcmn_err(getzoneid(), CE_WARN, 3664 "puterrno3: got error %d", error); 3665 return ((enum nfsstat3)error); 3666 } 3667 #else 3668 switch (error) { 3669 case ENAMETOOLONG: 3670 return (NFS3ERR_NAMETOOLONG); 3671 case ENOTEMPTY: 3672 return (NFS3ERR_NOTEMPTY); 3673 case EDQUOT: 3674 return (NFS3ERR_DQUOT); 3675 case ESTALE: 3676 return (NFS3ERR_STALE); 3677 case ENOSYS: 3678 case EOPNOTSUPP: 3679 return (NFS3ERR_NOTSUPP); 3680 case EREMOTE: 3681 return (NFS3ERR_REMOTE); 3682 case EOVERFLOW: 3683 return (NFS3ERR_INVAL); 3684 default: 3685 return ((enum nfsstat3)error); 3686 } 3687 #endif 3688 } 3689 3690 int 3691 geterrno3(enum nfsstat3 status) 3692 { 3693 3694 #ifdef DEBUG 3695 switch (status) { 3696 case NFS3_OK: 3697 return (0); 3698 case NFS3ERR_PERM: 3699 return (EPERM); 3700 case NFS3ERR_NOENT: 3701 return (ENOENT); 3702 case NFS3ERR_IO: 3703 return (EIO); 3704 case NFS3ERR_NXIO: 3705 return (ENXIO); 3706 case NFS3ERR_ACCES: 3707 return (EACCES); 3708 case NFS3ERR_EXIST: 3709 return (EEXIST); 3710 case NFS3ERR_XDEV: 3711 return (EXDEV); 3712 case NFS3ERR_NODEV: 3713 return (ENODEV); 3714 case NFS3ERR_NOTDIR: 3715 return (ENOTDIR); 3716 case NFS3ERR_ISDIR: 3717 return (EISDIR); 3718 case NFS3ERR_INVAL: 3719 return (EINVAL); 3720 case NFS3ERR_FBIG: 3721 return (EFBIG); 3722 case NFS3ERR_NOSPC: 3723 return (ENOSPC); 3724 case NFS3ERR_ROFS: 3725 return (EROFS); 3726 case NFS3ERR_MLINK: 3727 return (EMLINK); 3728 case NFS3ERR_NAMETOOLONG: 3729 return (ENAMETOOLONG); 3730 case NFS3ERR_NOTEMPTY: 3731 return (ENOTEMPTY); 3732 case NFS3ERR_DQUOT: 3733 return (EDQUOT); 3734 case NFS3ERR_STALE: 3735 return (ESTALE); 3736 case NFS3ERR_REMOTE: 3737 return (EREMOTE); 3738 case NFS3ERR_BADHANDLE: 3739 return (ESTALE); 3740 case NFS3ERR_NOT_SYNC: 3741 return (EINVAL); 3742 case NFS3ERR_BAD_COOKIE: 3743 return (ENOENT); 3744 case NFS3ERR_NOTSUPP: 3745 return (EOPNOTSUPP); 3746 case NFS3ERR_TOOSMALL: 3747 return (EINVAL); 3748 case NFS3ERR_SERVERFAULT: 3749 return (EIO); 3750 case NFS3ERR_BADTYPE: 3751 return (EINVAL); 3752 case NFS3ERR_JUKEBOX: 3753 return (ENXIO); 3754 default: 3755 zcmn_err(getzoneid(), CE_WARN, 3756 "geterrno3: got status %d", status); 3757 return ((int)status); 3758 } 3759 #else 3760 switch (status) { 3761 case NFS3ERR_NAMETOOLONG: 3762 return (ENAMETOOLONG); 3763 case NFS3ERR_NOTEMPTY: 3764 return (ENOTEMPTY); 3765 case NFS3ERR_DQUOT: 3766 return (EDQUOT); 3767 case NFS3ERR_STALE: 3768 case NFS3ERR_BADHANDLE: 3769 return (ESTALE); 3770 case NFS3ERR_NOTSUPP: 3771 return (EOPNOTSUPP); 3772 case NFS3ERR_REMOTE: 3773 return (EREMOTE); 3774 case NFS3ERR_NOT_SYNC: 3775 case NFS3ERR_TOOSMALL: 3776 case NFS3ERR_BADTYPE: 3777 return (EINVAL); 3778 case NFS3ERR_BAD_COOKIE: 3779 return (ENOENT); 3780 case NFS3ERR_SERVERFAULT: 3781 return (EIO); 3782 case NFS3ERR_JUKEBOX: 3783 return (ENXIO); 3784 default: 3785 return ((int)status); 3786 } 3787 #endif 3788 } 3789 3790 rddir_cache * 3791 rddir_cache_alloc(int flags) 3792 { 3793 rddir_cache *rc; 3794 3795 rc = kmem_alloc(sizeof (*rc), flags); 3796 if (rc != NULL) { 3797 rc->entries = NULL; 3798 rc->flags = RDDIR; 3799 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3800 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3801 
rc->count = 1; 3802 #ifdef DEBUG 3803 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3804 #endif 3805 } 3806 return (rc); 3807 } 3808 3809 static void 3810 rddir_cache_free(rddir_cache *rc) 3811 { 3812 3813 #ifdef DEBUG 3814 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3815 #endif 3816 if (rc->entries != NULL) { 3817 #ifdef DEBUG 3818 rddir_cache_buf_free(rc->entries, rc->buflen); 3819 #else 3820 kmem_free(rc->entries, rc->buflen); 3821 #endif 3822 } 3823 cv_destroy(&rc->cv); 3824 mutex_destroy(&rc->lock); 3825 kmem_free(rc, sizeof (*rc)); 3826 } 3827 3828 void 3829 rddir_cache_hold(rddir_cache *rc) 3830 { 3831 3832 mutex_enter(&rc->lock); 3833 rc->count++; 3834 mutex_exit(&rc->lock); 3835 } 3836 3837 void 3838 rddir_cache_rele(rddir_cache *rc) 3839 { 3840 3841 mutex_enter(&rc->lock); 3842 ASSERT(rc->count > 0); 3843 if (--rc->count == 0) { 3844 mutex_exit(&rc->lock); 3845 rddir_cache_free(rc); 3846 } else 3847 mutex_exit(&rc->lock); 3848 } 3849 3850 #ifdef DEBUG 3851 char * 3852 rddir_cache_buf_alloc(size_t size, int flags) 3853 { 3854 char *rc; 3855 3856 rc = kmem_alloc(size, flags); 3857 if (rc != NULL) 3858 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3859 return (rc); 3860 } 3861 3862 void 3863 rddir_cache_buf_free(void *addr, size_t size) 3864 { 3865 3866 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3867 kmem_free(addr, size); 3868 } 3869 #endif 3870 3871 static int 3872 nfs_free_data_reclaim(rnode_t *rp) 3873 { 3874 char *contents; 3875 int size; 3876 vsecattr_t *vsp; 3877 nfs3_pathconf_info *info; 3878 int freed; 3879 cred_t *cred; 3880 3881 /* 3882 * Free any held credentials and caches which 3883 * may be associated with this rnode. 3884 */ 3885 mutex_enter(&rp->r_statelock); 3886 cred = rp->r_cred; 3887 rp->r_cred = NULL; 3888 contents = rp->r_symlink.contents; 3889 size = rp->r_symlink.size; 3890 rp->r_symlink.contents = NULL; 3891 vsp = rp->r_secattr; 3892 rp->r_secattr = NULL; 3893 info = rp->r_pathconf; 3894 rp->r_pathconf = NULL; 3895 mutex_exit(&rp->r_statelock); 3896 3897 if (cred != NULL) 3898 crfree(cred); 3899 3900 /* 3901 * Free the access cache entries. 3902 */ 3903 freed = nfs_access_purge_rp(rp); 3904 3905 if (!HAVE_RDDIR_CACHE(rp) && 3906 contents == NULL && 3907 vsp == NULL && 3908 info == NULL) 3909 return (freed); 3910 3911 /* 3912 * Free the readdir cache entries 3913 */ 3914 if (HAVE_RDDIR_CACHE(rp)) 3915 nfs_purge_rddir_cache(RTOV(rp)); 3916 3917 /* 3918 * Free the symbolic link cache. 3919 */ 3920 if (contents != NULL) { 3921 3922 kmem_free((void *)contents, size); 3923 } 3924 3925 /* 3926 * Free any cached ACL. 3927 */ 3928 if (vsp != NULL) 3929 nfs_acl_free(vsp); 3930 3931 /* 3932 * Free any cached pathconf information. 3933 */ 3934 if (info != NULL) 3935 kmem_free(info, sizeof (*info)); 3936 3937 return (1); 3938 } 3939 3940 static int 3941 nfs_active_data_reclaim(rnode_t *rp) 3942 { 3943 char *contents; 3944 int size; 3945 vsecattr_t *vsp; 3946 nfs3_pathconf_info *info; 3947 int freed; 3948 3949 /* 3950 * Free any held credentials and caches which 3951 * may be associated with this rnode. 3952 */ 3953 if (!mutex_tryenter(&rp->r_statelock)) 3954 return (0); 3955 contents = rp->r_symlink.contents; 3956 size = rp->r_symlink.size; 3957 rp->r_symlink.contents = NULL; 3958 vsp = rp->r_secattr; 3959 rp->r_secattr = NULL; 3960 info = rp->r_pathconf; 3961 rp->r_pathconf = NULL; 3962 mutex_exit(&rp->r_statelock); 3963 3964 /* 3965 * Free the access cache entries. 
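/*
 * A user-level sketch (pthreads, assumed names) of the hold/release
 * counting used by rddir_cache_hold()/rddir_cache_rele() above: the entry
 * is freed by whichever caller drops the last reference.
 */
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>

struct rdc {
        pthread_mutex_t lock;
        int             count;
};

static void
rdc_hold(struct rdc *rc)
{
        pthread_mutex_lock(&rc->lock);
        rc->count++;
        pthread_mutex_unlock(&rc->lock);
}

static void
rdc_rele(struct rdc *rc)
{
        pthread_mutex_lock(&rc->lock);
        assert(rc->count > 0);
        if (--rc->count == 0) {
                pthread_mutex_unlock(&rc->lock);
                pthread_mutex_destroy(&rc->lock);
                free(rc);
        } else {
                pthread_mutex_unlock(&rc->lock);
        }
}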
3966 */ 3967 freed = nfs_access_purge_rp(rp); 3968 3969 if (!HAVE_RDDIR_CACHE(rp) && 3970 contents == NULL && 3971 vsp == NULL && 3972 info == NULL) 3973 return (freed); 3974 3975 /* 3976 * Free the readdir cache entries 3977 */ 3978 if (HAVE_RDDIR_CACHE(rp)) 3979 nfs_purge_rddir_cache(RTOV(rp)); 3980 3981 /* 3982 * Free the symbolic link cache. 3983 */ 3984 if (contents != NULL) { 3985 3986 kmem_free((void *)contents, size); 3987 } 3988 3989 /* 3990 * Free any cached ACL. 3991 */ 3992 if (vsp != NULL) 3993 nfs_acl_free(vsp); 3994 3995 /* 3996 * Free any cached pathconf information. 3997 */ 3998 if (info != NULL) 3999 kmem_free(info, sizeof (*info)); 4000 4001 return (1); 4002 } 4003 4004 static int 4005 nfs_free_reclaim(void) 4006 { 4007 int freed; 4008 rnode_t *rp; 4009 4010 #ifdef DEBUG 4011 clstat_debug.f_reclaim.value.ui64++; 4012 #endif 4013 freed = 0; 4014 mutex_enter(&rpfreelist_lock); 4015 rp = rpfreelist; 4016 if (rp != NULL) { 4017 do { 4018 if (nfs_free_data_reclaim(rp)) 4019 freed = 1; 4020 } while ((rp = rp->r_freef) != rpfreelist); 4021 } 4022 mutex_exit(&rpfreelist_lock); 4023 return (freed); 4024 } 4025 4026 static int 4027 nfs_active_reclaim(void) 4028 { 4029 int freed; 4030 int index; 4031 rnode_t *rp; 4032 4033 #ifdef DEBUG 4034 clstat_debug.a_reclaim.value.ui64++; 4035 #endif 4036 freed = 0; 4037 for (index = 0; index < rtablesize; index++) { 4038 rw_enter(&rtable[index].r_lock, RW_READER); 4039 for (rp = rtable[index].r_hashf; 4040 rp != (rnode_t *)(&rtable[index]); 4041 rp = rp->r_hashf) { 4042 if (nfs_active_data_reclaim(rp)) 4043 freed = 1; 4044 } 4045 rw_exit(&rtable[index].r_lock); 4046 } 4047 return (freed); 4048 } 4049 4050 static int 4051 nfs_rnode_reclaim(void) 4052 { 4053 int freed; 4054 rnode_t *rp; 4055 vnode_t *vp; 4056 4057 #ifdef DEBUG 4058 clstat_debug.r_reclaim.value.ui64++; 4059 #endif 4060 freed = 0; 4061 mutex_enter(&rpfreelist_lock); 4062 while ((rp = rpfreelist) != NULL) { 4063 rp_rmfree(rp); 4064 mutex_exit(&rpfreelist_lock); 4065 if (rp->r_flags & RHASHED) { 4066 vp = RTOV(rp); 4067 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4068 mutex_enter(&vp->v_lock); 4069 if (vp->v_count > 1) { 4070 vp->v_count--; 4071 mutex_exit(&vp->v_lock); 4072 rw_exit(&rp->r_hashq->r_lock); 4073 mutex_enter(&rpfreelist_lock); 4074 continue; 4075 } 4076 mutex_exit(&vp->v_lock); 4077 rp_rmhash_locked(rp); 4078 rw_exit(&rp->r_hashq->r_lock); 4079 } 4080 /* 4081 * This call to rp_addfree will end up destroying the 4082 * rnode, but in a safe way with the appropriate set 4083 * of checks done. 
4084 */ 4085 rp_addfree(rp, CRED()); 4086 mutex_enter(&rpfreelist_lock); 4087 } 4088 mutex_exit(&rpfreelist_lock); 4089 return (freed); 4090 } 4091 4092 /*ARGSUSED*/ 4093 static void 4094 nfs_reclaim(void *cdrarg) 4095 { 4096 4097 #ifdef DEBUG 4098 clstat_debug.reclaim.value.ui64++; 4099 #endif 4100 if (nfs_free_reclaim()) 4101 return; 4102 4103 if (nfs_active_reclaim()) 4104 return; 4105 4106 (void) nfs_rnode_reclaim(); 4107 } 4108 4109 /* 4110 * NFS client failover support 4111 * 4112 * Routines to copy filehandles 4113 */ 4114 void 4115 nfscopyfh(caddr_t fhp, vnode_t *vp) 4116 { 4117 fhandle_t *dest = (fhandle_t *)fhp; 4118 4119 if (dest != NULL) 4120 *dest = *VTOFH(vp); 4121 } 4122 4123 void 4124 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4125 { 4126 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4127 4128 if (dest != NULL) 4129 *dest = *VTOFH3(vp); 4130 } 4131 4132 /* 4133 * NFS client failover support 4134 * 4135 * failover_safe() will test various conditions to ensure that 4136 * failover is permitted for this vnode. It will be denied 4137 * if: 4138 * 1) the operation in progress does not support failover (NULL fi) 4139 * 2) there are no available replicas (NULL mi_servers->sv_next) 4140 * 3) any locks are outstanding on this file 4141 */ 4142 static int 4143 failover_safe(failinfo_t *fi) 4144 { 4145 4146 /* 4147 * Does this op permit failover? 4148 */ 4149 if (fi == NULL || fi->vp == NULL) 4150 return (0); 4151 4152 /* 4153 * Are there any alternates to failover to? 4154 */ 4155 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4156 return (0); 4157 4158 /* 4159 * Disable check; we've forced local locking 4160 * 4161 * if (flk_has_remote_locks(fi->vp)) 4162 * return (0); 4163 */ 4164 4165 /* 4166 * If we have no partial path, we can't do anything 4167 */ 4168 if (VTOR(fi->vp)->r_path == NULL) 4169 return (0); 4170 4171 return (1); 4172 } 4173 4174 #include <sys/thread.h> 4175 4176 /* 4177 * NFS client failover support 4178 * 4179 * failover_newserver() will start a search for a new server, 4180 * preferably by starting an async thread to do the work. If 4181 * someone is already doing this (recognizable by MI_BINDINPROG 4182 * being set), it will simply return and the calling thread 4183 * will queue on the mi_failover_cv condition variable. 4184 */ 4185 static void 4186 failover_newserver(mntinfo_t *mi) 4187 { 4188 /* 4189 * Check if someone else is doing this already 4190 */ 4191 mutex_enter(&mi->mi_lock); 4192 if (mi->mi_flags & MI_BINDINPROG) { 4193 mutex_exit(&mi->mi_lock); 4194 return; 4195 } 4196 mi->mi_flags |= MI_BINDINPROG; 4197 4198 /* 4199 * Need to hold the vfs struct so that it can't be released 4200 * while the failover thread is selecting a new server. 4201 */ 4202 VFS_HOLD(mi->mi_vfsp); 4203 4204 /* 4205 * Start a thread to do the real searching. 4206 */ 4207 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4208 4209 mutex_exit(&mi->mi_lock); 4210 } 4211 4212 /* 4213 * NFS client failover support 4214 * 4215 * failover_thread() will find a new server to replace the one 4216 * currently in use, wake up other threads waiting on this mount 4217 * point, and die. It will start at the head of the server list 4218 * and poll servers until it finds one with an NFS server which is 4219 * registered and responds to a NULL procedure ping. 4220 * 4221 * XXX failover_thread is unsafe within the scope of the 4222 * present model defined for cpr to suspend the system. 4223 * Specifically, over-the-wire calls made by the thread 4224 * are unsafe. 
The thread needs to be reevaluated in case of 4225 * future updates to the cpr suspend model. 4226 */ 4227 static void 4228 failover_thread(mntinfo_t *mi) 4229 { 4230 servinfo_t *svp = NULL; 4231 CLIENT *cl; 4232 enum clnt_stat status; 4233 struct timeval tv; 4234 int error; 4235 int oncethru = 0; 4236 callb_cpr_t cprinfo; 4237 rnode_t *rp; 4238 int index; 4239 char *srvnames; 4240 size_t srvnames_len; 4241 struct nfs_clnt *nfscl = NULL; 4242 zoneid_t zoneid = getzoneid(); 4243 4244 #ifdef DEBUG 4245 /* 4246 * This is currently only needed to access counters which exist on 4247 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4248 * on non-DEBUG kernels. 4249 */ 4250 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4251 ASSERT(nfscl != NULL); 4252 #endif 4253 4254 /* 4255 * It's safe to piggyback on the mi_lock since the failover_newserver() 4256 * code guarantees that there will be only one failover thread 4257 * per mountinfo at any given time. 4258 */ 4259 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4260 "failover_thread"); 4261 4262 mutex_enter(&mi->mi_lock); 4263 while (mi->mi_readers) { 4264 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4265 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4266 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4267 } 4268 mutex_exit(&mi->mi_lock); 4269 4270 tv.tv_sec = 2; 4271 tv.tv_usec = 0; 4272 4273 /* 4274 * Ping the null NFS procedure of every server in 4275 * the list until one responds. We always start 4276 * at the head of the list and always skip the one 4277 * that is current, since it's caused us a problem. 4278 */ 4279 while (svp == NULL) { 4280 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4281 if (!oncethru && svp == mi->mi_curr_serv) 4282 continue; 4283 4284 /* 4285 * If the file system was forcibly umounted 4286 * while trying to do a failover, then just 4287 * give up on the failover. It won't matter 4288 * what the server is.
4289 */ 4290 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4291 svp = NULL; 4292 goto done; 4293 } 4294 4295 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4296 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4297 if (error) 4298 continue; 4299 4300 if (!(mi->mi_flags & MI_INT)) 4301 cl->cl_nosignal = TRUE; 4302 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4303 xdr_void, NULL, tv); 4304 if (!(mi->mi_flags & MI_INT)) 4305 cl->cl_nosignal = FALSE; 4306 AUTH_DESTROY(cl->cl_auth); 4307 CLNT_DESTROY(cl); 4308 if (status == RPC_SUCCESS) { 4309 if (svp == mi->mi_curr_serv) { 4310 #ifdef DEBUG 4311 zcmn_err(zoneid, CE_NOTE, 4312 "NFS%d: failing over: selecting original server %s", 4313 mi->mi_vers, svp->sv_hostname); 4314 #else 4315 zcmn_err(zoneid, CE_NOTE, 4316 "NFS: failing over: selecting original server %s", 4317 svp->sv_hostname); 4318 #endif 4319 } else { 4320 #ifdef DEBUG 4321 zcmn_err(zoneid, CE_NOTE, 4322 "NFS%d: failing over from %s to %s", 4323 mi->mi_vers, 4324 mi->mi_curr_serv->sv_hostname, 4325 svp->sv_hostname); 4326 #else 4327 zcmn_err(zoneid, CE_NOTE, 4328 "NFS: failing over from %s to %s", 4329 mi->mi_curr_serv->sv_hostname, 4330 svp->sv_hostname); 4331 #endif 4332 } 4333 break; 4334 } 4335 } 4336 4337 if (svp == NULL) { 4338 if (!oncethru) { 4339 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4340 #ifdef DEBUG 4341 zprintf(zoneid, 4342 "NFS%d servers %s not responding " 4343 "still trying\n", mi->mi_vers, srvnames); 4344 #else 4345 zprintf(zoneid, "NFS servers %s not responding " 4346 "still trying\n", srvnames); 4347 #endif 4348 oncethru = 1; 4349 } 4350 mutex_enter(&mi->mi_lock); 4351 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4352 mutex_exit(&mi->mi_lock); 4353 delay(hz); 4354 mutex_enter(&mi->mi_lock); 4355 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4356 mutex_exit(&mi->mi_lock); 4357 } 4358 } 4359 4360 if (oncethru) { 4361 #ifdef DEBUG 4362 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4363 #else 4364 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4365 #endif 4366 } 4367 4368 if (svp != mi->mi_curr_serv) { 4369 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4370 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4371 rw_enter(&rtable[index].r_lock, RW_WRITER); 4372 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4373 mi->mi_vfsp); 4374 if (rp != NULL) { 4375 if (rp->r_flags & RHASHED) 4376 rp_rmhash_locked(rp); 4377 rw_exit(&rtable[index].r_lock); 4378 rp->r_server = svp; 4379 rp->r_fh = svp->sv_fhandle; 4380 (void) nfs_free_data_reclaim(rp); 4381 index = rtablehash(&rp->r_fh); 4382 rp->r_hashq = &rtable[index]; 4383 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4384 vn_exists(RTOV(rp)); 4385 rp_addhash(rp); 4386 rw_exit(&rp->r_hashq->r_lock); 4387 VN_RELE(RTOV(rp)); 4388 } else 4389 rw_exit(&rtable[index].r_lock); 4390 } 4391 4392 done: 4393 if (oncethru) 4394 kmem_free(srvnames, srvnames_len); 4395 mutex_enter(&mi->mi_lock); 4396 mi->mi_flags &= ~MI_BINDINPROG; 4397 if (svp != NULL) { 4398 mi->mi_curr_serv = svp; 4399 mi->mi_failover++; 4400 #ifdef DEBUG 4401 nfscl->nfscl_stat.failover.value.ui64++; 4402 #endif 4403 } 4404 cv_broadcast(&mi->mi_failover_cv); 4405 CALLB_CPR_EXIT(&cprinfo); 4406 VFS_RELE(mi->mi_vfsp); 4407 zthread_exit(); 4408 /* NOTREACHED */ 4409 } 4410 4411 /* 4412 * NFS client failover support 4413 * 4414 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4415 * is cleared, meaning that failover is complete. Called with 4416 * mi_lock mutex held. 
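/*
 * The liveness probe above -- create a transient client handle, call the
 * NULL procedure with a short timeout, and judge the server by the RPC
 * status -- has a direct user-level analog in the ONC RPC library.  A
 * sketch only; the program/version numbers and the "udp" nettype are
 * assumptions for illustration:
 */
#include <rpc/rpc.h>

#define NFS_PROG        100003
#define NFS_VERS        3

static int
server_responds(const char *host)
{
        CLIENT *cl;
        struct timeval tv = { 2, 0 };   /* same two-second wait as above */
        enum clnt_stat status;

        cl = clnt_create(host, NFS_PROG, NFS_VERS, "udp");
        if (cl == NULL)
                return (0);
        status = clnt_call(cl, NULLPROC, (xdrproc_t)xdr_void, NULL,
            (xdrproc_t)xdr_void, NULL, tv);
        clnt_destroy(cl);
        return (status == RPC_SUCCESS);
}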
4417 */ 4418 static int 4419 failover_wait(mntinfo_t *mi) 4420 { 4421 k_sigset_t smask; 4422
4423 /* 4424 * If someone else is hunting for a living server, 4425 * sleep until it's done. After our sleep, we may 4426 * be bound to the right server and get off cheaply. 4427 */
4428 while (mi->mi_flags & MI_BINDINPROG) { 4429 /* 4430 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 4431 * and SIGTERM (preserving the existing masks).
4432 * Mask out SIGINT if the mount option nointr is specified. 4433 */ 4434 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4435 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) { 4436 /* 4437 * restore original signal mask 4438 */ 4439 sigunintr(&smask); 4440 return (EINTR); 4441 }
4442 /* 4443 * restore original signal mask 4444 */ 4445 sigunintr(&smask); 4446 } 4447 return (0); 4448 } 4449
4450 /* 4451 * NFS client failover support 4452 * 4453 * failover_remap() will do a partial pathname lookup and find the 4454 * desired vnode on the current server. The interim vnode will be
4455 * discarded after we pilfer the new filehandle. 4456 * 4457 * Side effects: 4458 * - This routine will also update the filehandle in the args structure
4459 * pointed to by the fi->fhp pointer if it is non-NULL. 4460 */ 4461 4462 static int 4463 failover_remap(failinfo_t *fi) 4464 {
4465 vnode_t *vp, *nvp, *rootvp; 4466 rnode_t *rp, *nrp; 4467 mntinfo_t *mi; 4468 int error; 4469 #ifdef DEBUG 4470 struct nfs_clnt *nfscl; 4471
4472 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4473 ASSERT(nfscl != NULL); 4474 #endif 4475 /* 4476 * Sanity check 4477 */
4478 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL) 4479 return (EINVAL); 4480 vp = fi->vp; 4481 rp = VTOR(vp); 4482 mi = VTOMI(vp); 4483
4484 if (!(vp->v_flag & VROOT)) { 4485 /* 4486 * Given the root fh, use the path stored in 4487 * the rnode to find the fh for the new server. 4488 */
4489 error = VFS_ROOT(mi->mi_vfsp, &rootvp); 4490 if (error) 4491 return (error); 4492
4493 error = failover_lookup(rp->r_path, rootvp, 4494 fi->lookupproc, fi->xattrdirproc, &nvp); 4495 4496 VN_RELE(rootvp); 4497
4498 if (error) 4499 return (error); 4500 4501 /* 4502 * If we found the same rnode, we're done now 4503 */ 4504 if (nvp == vp) {
4505 /* 4506 * Failover occurred, and the new server may physically be the 4507 * same machine or may share the same disk subsystem. In that
4508 * case the file handle for a particular file path is not going 4509 * to change, so a lookup with the same filehandle will
4510 * always locate the same rnode as the existing one. 4511 * All we might need to do is to update the r_server 4512 * with the current servinfo. 4513 */
4514 if (!VALID_FH(fi)) { 4515 rp->r_server = mi->mi_curr_serv; 4516 } 4517 VN_RELE(nvp); 4518 return (0); 4519 } 4520
4521 /* 4522 * Try to make it so that no one else will find this 4523 * vnode because it is just a temporary to hold the 4524 * new file handle until that file handle can be
4525 * copied to the original vnode/rnode. 4526 */ 4527 nrp = VTOR(nvp); 4528 mutex_enter(&mi->mi_remap_lock);
4529 /* 4530 * Some other thread could have raced in here and could 4531 * have done the remap for this particular rnode before
4532 * this thread got here. Compare rp->r_server with 4533 * mi->mi_curr_serv and return if they are the same.
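 * (The VALID_FH() check below is assumed to capture exactly this
 * rp->r_server == mi->mi_curr_serv comparison; if it holds, another
 * thread already completed the remap, so we just drop the temporary
 * vnode and return.)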
4534 */ 4535 if (VALID_FH(fi)) { 4536 mutex_exit(&mi->mi_remap_lock); 4537 VN_RELE(nvp); 4538 return (0); 4539 } 4540
4541 if (nrp->r_flags & RHASHED) 4542 rp_rmhash(nrp); 4543
4544 /* 4545 * As a heuristic check on the validity of the new 4546 * file, check that the size and type match against 4547 * what we remember from the old version. 4548 */
4549 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) { 4550 mutex_exit(&mi->mi_remap_lock); 4551 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4552 "NFS replicas %s and %s: file %s not same.", 4553 rp->r_server->sv_hostname, 4554 nrp->r_server->sv_hostname, rp->r_path); 4555 VN_RELE(nvp); 4556 return (EINVAL); 4557 } 4558
4559 /* 4560 * Snarf the filehandle from the new rnode, 4561 * then release it, updating the 4562 * hash queues for the rnode as we go. 4563 */
4564 if (rp->r_flags & RHASHED) 4565 rp_rmhash(rp); 4566 rp->r_server = mi->mi_curr_serv; 4567 rp->r_fh = nrp->r_fh; 4568 rp->r_hashq = nrp->r_hashq;
4569 /* 4570 * Copy the attributes from the new rnode to the old 4571 * rnode. This will help to reduce unnecessary page 4572 * cache flushes. 4573 */
4574 rp->r_attr = nrp->r_attr; 4575 rp->r_attrtime = nrp->r_attrtime; 4576 rp->r_mtime = nrp->r_mtime; 4577 (void) nfs_free_data_reclaim(rp);
4578 nfs_setswaplike(vp, &rp->r_attr); 4579 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4580 rp_addhash(rp); 4581 rw_exit(&rp->r_hashq->r_lock);
4582 mutex_exit(&mi->mi_remap_lock); 4583 VN_RELE(nvp); 4584 } 4585
4586 /* 4587 * Update successful failover remap count 4588 */ 4589 mutex_enter(&mi->mi_lock); 4590 mi->mi_remap++; 4591 mutex_exit(&mi->mi_lock);
4592 #ifdef DEBUG 4593 nfscl->nfscl_stat.remap.value.ui64++; 4594 #endif 4595
4596 /* 4597 * If we have a copied filehandle to update, do it now. 4598 */ 4599 if (fi->fhp != NULL && fi->copyproc != NULL) 4600 (*fi->copyproc)(fi->fhp, vp); 4601
4602 return (0); 4603 } 4604
4605 /* 4606 * NFS client failover support 4607 * 4608 * We want a simple pathname lookup routine to parse the pieces 4609 * of path in rp->r_path. We know that the path was created
4610 * as rnodes were made, so we know we have only to deal with 4611 * paths that look like: 4612 * dir1/dir2/dir3/file
4613 * Any evidence of anything like .., symlinks, or ENOTDIR 4614 * is a hard error, because it means something in this filesystem
4615 * is different from the one we came from, or has changed under 4616 * us in some way. If this is true, we want the failure. 4617 *
4618 * Extended attributes: if the filesystem is mounted with extended 4619 * attributes enabled (-o xattr), the attribute directory will be
4620 * represented in the r_path as the magic name XATTR_RPATH. So if 4621 * we see that name in the pathname, it must be because this node 4622 * is an extended attribute. Therefore, look it up that way.
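 *
 * For example, an r_path of "dir1/dir2/file" is resolved one
 * component at a time: "dir1" is looked up relative to the root
 * vnode passed in, "dir2" relative to that result, and so on, with
 * each intermediate vnode released as the walk moves forward. The
 * vnode of the final component is what gets returned through *new.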
4623 */ 4624 static int 4625 failover_lookup(char *path, vnode_t *root, 4626 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4627 vnode_t *, cred_t *, int), 4628 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4629 vnode_t **new) 4630 { 4631 vnode_t *dvp, *nvp; 4632 int error = EINVAL; 4633 char *s, *p, *tmppath; 4634 size_t len; 4635 mntinfo_t *mi; 4636 bool_t xattr; 4637 4638 /* Make local copy of path */ 4639 len = strlen(path) + 1; 4640 tmppath = kmem_alloc(len, KM_SLEEP); 4641 (void) strcpy(tmppath, path); 4642 s = tmppath; 4643 4644 dvp = root; 4645 VN_HOLD(dvp); 4646 mi = VTOMI(root); 4647 xattr = mi->mi_flags & MI_EXTATTR; 4648 4649 do { 4650 p = strchr(s, '/'); 4651 if (p != NULL) 4652 *p = '\0'; 4653 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4654 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4655 RFSCALL_SOFT); 4656 } else { 4657 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4658 CRED(), RFSCALL_SOFT); 4659 } 4660 if (p != NULL) 4661 *p++ = '/'; 4662 if (error) { 4663 VN_RELE(dvp); 4664 kmem_free(tmppath, len); 4665 return (error); 4666 } 4667 s = p; 4668 VN_RELE(dvp); 4669 dvp = nvp; 4670 } while (p != NULL); 4671 4672 if (nvp != NULL && new != NULL) 4673 *new = nvp; 4674 kmem_free(tmppath, len); 4675 return (0); 4676 } 4677 4678 /* 4679 * NFS client failover support 4680 * 4681 * sv_free() frees the malloc'd portion of a "servinfo_t". 4682 */ 4683 void 4684 sv_free(servinfo_t *svp) 4685 { 4686 servinfo_t *next; 4687 struct knetconfig *knconf; 4688 4689 while (svp != NULL) { 4690 next = svp->sv_next; 4691 if (svp->sv_secdata) 4692 sec_clnt_freeinfo(svp->sv_secdata); 4693 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4694 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4695 knconf = svp->sv_knconf; 4696 if (knconf != NULL) { 4697 if (knconf->knc_protofmly != NULL) 4698 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4699 if (knconf->knc_proto != NULL) 4700 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4701 kmem_free(knconf, sizeof (*knconf)); 4702 } 4703 knconf = svp->sv_origknconf; 4704 if (knconf != NULL) { 4705 if (knconf->knc_protofmly != NULL) 4706 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4707 if (knconf->knc_proto != NULL) 4708 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4709 kmem_free(knconf, sizeof (*knconf)); 4710 } 4711 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4712 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4713 mutex_destroy(&svp->sv_lock); 4714 kmem_free(svp, sizeof (*svp)); 4715 svp = next; 4716 } 4717 } 4718 4719 /* 4720 * Only can return non-zero if intr != 0. 4721 */ 4722 int 4723 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4724 { 4725 4726 mutex_enter(&l->lock); 4727 4728 /* 4729 * If this is a nested enter, then allow it. There 4730 * must be as many exits as enters through. 4731 */ 4732 if (l->owner == curthread) { 4733 /* lock is held for writing by current thread */ 4734 ASSERT(rw == RW_READER || rw == RW_WRITER); 4735 l->count--; 4736 } else if (rw == RW_READER) { 4737 /* 4738 * While there is a writer active or writers waiting, 4739 * then wait for them to finish up and move on. Then, 4740 * increment the count to indicate that a reader is 4741 * active. 
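 *
 * (Convention used throughout this lock: a positive count is the
 * number of active readers, a negative count is the nesting depth
 * of the single active writer identified by owner, and waiters is
 * the number of threads waiting for write access.)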
4742 */ 4743 while (l->count < 0 || l->waiters > 0) { 4744 if (intr) { 4745 klwp_t *lwp = ttolwp(curthread); 4746
4747 if (lwp != NULL) 4748 lwp->lwp_nostop++; 4749 if (!cv_wait_sig(&l->cv, &l->lock)) { 4750 if (lwp != NULL) 4751 lwp->lwp_nostop--;
4752 mutex_exit(&l->lock); 4753 return (EINTR); 4754 } 4755 if (lwp != NULL) 4756 lwp->lwp_nostop--; 4757 } else 4758 cv_wait(&l->cv, &l->lock); 4759 }
4760 ASSERT(l->count < INT_MAX); 4761 #ifdef DEBUG 4762 if ((l->count % 10000) == 9999) 4763 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4764 "rwlock @ %p\n", l->count, (void *)l); 4765 #endif 4766 l->count++; 4767 } else { 4768 ASSERT(rw == RW_WRITER);
4769 /* 4770 * While there are readers active or a writer 4771 * active, then wait for all of the readers 4772 * to finish or for the writer to finish.
4773 * Then, set the owner field to curthread and 4774 * decrement count to indicate that a writer 4775 * is active. 4776 */
4777 while (l->count > 0 || l->owner != NULL) { 4778 l->waiters++; 4779 if (intr) { 4780 klwp_t *lwp = ttolwp(curthread); 4781
4782 if (lwp != NULL) 4783 lwp->lwp_nostop++; 4784 if (!cv_wait_sig(&l->cv, &l->lock)) { 4785 if (lwp != NULL) 4786 lwp->lwp_nostop--;
4787 l->waiters--; 4788 cv_broadcast(&l->cv); 4789 mutex_exit(&l->lock); 4790 return (EINTR); 4791 }
4792 if (lwp != NULL) 4793 lwp->lwp_nostop--; 4794 } else 4795 cv_wait(&l->cv, &l->lock); 4796 l->waiters--; 4797 }
4798 l->owner = curthread; 4799 l->count--; 4800 } 4801 4802 mutex_exit(&l->lock); 4803 4804 return (0); 4805 } 4806
4807 /* 4808 * If the lock is available, obtain it and return non-zero. If there is 4809 * already a conflicting lock, return 0 immediately. 4810 */ 4811
4812 int 4813 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4814 { 4815 mutex_enter(&l->lock); 4816
4817 /* 4818 * If this is a nested enter, then allow it. There 4819 * must be as many exits as enters through. 4820 */
4821 if (l->owner == curthread) { 4822 /* lock is held for writing by current thread */ 4823 ASSERT(rw == RW_READER || rw == RW_WRITER); 4824 l->count--;
4825 } else if (rw == RW_READER) { 4826 /* 4827 * If there is a writer active or writers waiting, deny the 4828 * lock. Otherwise, bump the count of readers. 4829 */
4830 if (l->count < 0 || l->waiters > 0) { 4831 mutex_exit(&l->lock); 4832 return (0); 4833 } 4834 l->count++;
4835 } else { 4836 ASSERT(rw == RW_WRITER); 4837 /* 4838 * If there are readers active or a writer active, deny the
4839 * lock. Otherwise, set the owner field to curthread and 4840 * decrement count to indicate that a writer is active. 4841 */
4842 if (l->count > 0 || l->owner != NULL) { 4843 mutex_exit(&l->lock); 4844 return (0); 4845 } 4846 l->owner = curthread; 4847 l->count--; 4848 } 4849
4850 mutex_exit(&l->lock); 4851 4852 return (1); 4853 } 4854
4855 void 4856 nfs_rw_exit(nfs_rwlock_t *l) 4857 { 4858 4859 mutex_enter(&l->lock);
4860 /* 4861 * If this is releasing a writer lock, then increment count to 4862 * indicate that there is one less writer active. If this was
4863 * the last of possibly nested writer locks, then clear the owner 4864 * field as well to indicate that there is no writer active
4865 * and wake up any waiting writers or readers. 4866 *
4867 * If releasing a reader lock, then just decrement count to 4868 * indicate that there is one less reader active. If this was
4869 * the last active reader and there are writer(s) waiting, 4870 * then wake up the first.
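 *
 * (Both wakeups below use cv_broadcast(); every waiter is awakened
 * and re-evaluates the lock state under l->lock, so whichever
 * waiters the new lock state permits get to proceed.)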
4871 */ 4872 if (l->owner != NULL) { 4873 ASSERT(l->owner == curthread); 4874 l->count++; 4875 if (l->count == 0) { 4876 l->owner = NULL; 4877 cv_broadcast(&l->cv); 4878 } 4879 } else { 4880 ASSERT(l->count > 0); 4881 l->count--; 4882 if (l->count == 0 && l->waiters > 0) 4883 cv_broadcast(&l->cv); 4884 } 4885 mutex_exit(&l->lock); 4886 } 4887 4888 int 4889 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4890 { 4891 4892 if (rw == RW_READER) 4893 return (l->count > 0); 4894 ASSERT(rw == RW_WRITER); 4895 return (l->count < 0); 4896 } 4897 4898 /* ARGSUSED */ 4899 void 4900 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4901 { 4902 4903 l->count = 0; 4904 l->waiters = 0; 4905 l->owner = NULL; 4906 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4907 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4908 } 4909 4910 void 4911 nfs_rw_destroy(nfs_rwlock_t *l) 4912 { 4913 4914 mutex_destroy(&l->lock); 4915 cv_destroy(&l->cv); 4916 } 4917 4918 int 4919 nfs3_rddir_compar(const void *x, const void *y) 4920 { 4921 rddir_cache *a = (rddir_cache *)x; 4922 rddir_cache *b = (rddir_cache *)y; 4923 4924 if (a->nfs3_cookie == b->nfs3_cookie) { 4925 if (a->buflen == b->buflen) 4926 return (0); 4927 if (a->buflen < b->buflen) 4928 return (-1); 4929 return (1); 4930 } 4931 4932 if (a->nfs3_cookie < b->nfs3_cookie) 4933 return (-1); 4934 4935 return (1); 4936 } 4937 4938 int 4939 nfs_rddir_compar(const void *x, const void *y) 4940 { 4941 rddir_cache *a = (rddir_cache *)x; 4942 rddir_cache *b = (rddir_cache *)y; 4943 4944 if (a->nfs_cookie == b->nfs_cookie) { 4945 if (a->buflen == b->buflen) 4946 return (0); 4947 if (a->buflen < b->buflen) 4948 return (-1); 4949 return (1); 4950 } 4951 4952 if (a->nfs_cookie < b->nfs_cookie) 4953 return (-1); 4954 4955 return (1); 4956 } 4957 4958 static char * 4959 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 4960 { 4961 servinfo_t *s; 4962 char *srvnames; 4963 char *namep; 4964 size_t length; 4965 4966 /* 4967 * Calculate the length of the string required to hold all 4968 * of the server names plus either a comma or a null 4969 * character following each individual one. 4970 */ 4971 length = 0; 4972 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 4973 length += s->sv_hostnamelen; 4974 4975 srvnames = kmem_alloc(length, KM_SLEEP); 4976 4977 namep = srvnames; 4978 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 4979 (void) strcpy(namep, s->sv_hostname); 4980 namep += s->sv_hostnamelen - 1; 4981 *namep++ = ','; 4982 } 4983 *--namep = '\0'; 4984 4985 *len = length; 4986 4987 return (srvnames); 4988 } 4989 4990 /* 4991 * These two functions are temporary and designed for the upgrade-workaround 4992 * only. They cannot be used for general zone-crossing NFS client support, and 4993 * will be removed shortly. 4994 * 4995 * When the workaround is enabled, all NFS traffic is forced into the global 4996 * zone. These functions are called when the code needs to refer to the state 4997 * of the underlying network connection. They're not called when the function 4998 * needs to refer to the state of the process that invoked the system call. 4999 * (E.g., when checking whether the zone is shutting down during the mount() 5000 * call.) 5001 */ 5002 5003 struct zone * 5004 nfs_zone(void) 5005 { 5006 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5007 } 5008 5009 zoneid_t 5010 nfs_zoneid(void) 5011 { 5012 return (nfs_global_client_only != 0 ? 
GLOBAL_ZONEID : getzoneid()); 5013 } 5014 5015 /* 5016 * nfs_mount_label_policy: 5017 * Determine whether the mount is allowed according to MAC check, 5018 * by comparing (where appropriate) label of the remote server 5019 * against the label of the zone being mounted into. 5020 * 5021 * Returns: 5022 * 0 : access allowed 5023 * -1 : read-only access allowed (i.e., read-down) 5024 * >0 : error code, such as EACCES 5025 */ 5026 int 5027 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5028 struct knetconfig *knconf, cred_t *cr) 5029 { 5030 int addr_type; 5031 void *ipaddr; 5032 bslabel_t *server_sl, *mntlabel; 5033 zone_t *mntzone = NULL; 5034 ts_label_t *zlabel; 5035 tsol_tpc_t *tp; 5036 ts_label_t *tsl = NULL; 5037 int retv; 5038 5039 /* 5040 * Get the zone's label. Each zone on a labeled system has a label. 5041 */ 5042 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5043 zlabel = mntzone->zone_slabel; 5044 ASSERT(zlabel != NULL); 5045 label_hold(zlabel); 5046 5047 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5048 addr_type = IPV4_VERSION; 5049 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5050 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5051 addr_type = IPV6_VERSION; 5052 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5053 } else { 5054 retv = 0; 5055 goto out; 5056 } 5057 5058 retv = EACCES; /* assume the worst */ 5059 5060 /* 5061 * Next, get the assigned label of the remote server. 5062 */ 5063 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5064 if (tp == NULL) 5065 goto out; /* error getting host entry */ 5066 5067 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5068 goto rel_tpc; /* invalid domain */ 5069 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5070 (tp->tpc_tp.host_type != UNLABELED)) 5071 goto rel_tpc; /* invalid hosttype */ 5072 5073 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5074 tsl = getflabel_cipso(vfsp); 5075 if (tsl == NULL) 5076 goto rel_tpc; /* error getting server lbl */ 5077 5078 server_sl = label2bslabel(tsl); 5079 } else { /* UNLABELED */ 5080 server_sl = &tp->tpc_tp.tp_def_label; 5081 } 5082 5083 mntlabel = label2bslabel(zlabel); 5084 5085 /* 5086 * Now compare labels to complete the MAC check. If the labels 5087 * are equal or if the requestor is in the global zone and has 5088 * NET_MAC_AWARE, then allow read-write access. (Except for 5089 * mounts into the global zone itself; restrict these to 5090 * read-only.) 5091 * 5092 * If the requestor is in some other zone, but his label 5093 * dominates the server, then allow read-down. 5094 * 5095 * Otherwise, access is denied. 5096 */ 5097 if (blequal(mntlabel, server_sl) || 5098 (crgetzoneid(cr) == GLOBAL_ZONEID && 5099 getpflags(NET_MAC_AWARE, cr) != 0)) { 5100 if ((mntzone == global_zone) || 5101 !blequal(mntlabel, server_sl)) 5102 retv = -1; /* read-only */ 5103 else 5104 retv = 0; /* access OK */ 5105 } else if (bldominates(mntlabel, server_sl)) { 5106 retv = -1; /* read-only */ 5107 } else { 5108 retv = EACCES; 5109 } 5110 5111 if (tsl != NULL) 5112 label_rele(tsl); 5113 5114 rel_tpc: 5115 TPC_RELE(tp); 5116 out: 5117 if (mntzone) 5118 zone_rele(mntzone); 5119 label_rele(zlabel); 5120 return (retv); 5121 } 5122 5123 boolean_t 5124 nfs_has_ctty(void) 5125 { 5126 boolean_t rv; 5127 mutex_enter(&curproc->p_splock); 5128 rv = (curproc->p_sessp->s_vp != NULL); 5129 mutex_exit(&curproc->p_splock); 5130 return (rv); 5131 } 5132 5133 /* 5134 * TX NFS routine used by NFSv3 and NFSv4 to do label check 5135 * on client label and server's file object lable. 
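 *
 * When flag is EQUALITY_CHECK the client's label must be equal to
 * the file object's label; otherwise the client's label must
 * dominate it. B_FALSE is returned if the file's label cannot be
 * obtained from nfs_getflabel().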
5136 */ 5137 boolean_t 5138 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag) 5139 { 5140 bslabel_t *slabel; 5141 ts_label_t *tslabel; 5142 boolean_t result; 5143
5144 if ((tslabel = nfs_getflabel(vp)) == NULL) { 5145 return (B_FALSE); 5146 } 5147 slabel = label2bslabel(tslabel);
5148 DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *, 5149 "comparing server's file label(1) with client label(2) (vp(3))",
5150 bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp); 5151
5152 if (flag == EQUALITY_CHECK) 5153 result = blequal(clabel, slabel); 5154 else 5155 result = bldominates(clabel, slabel); 5156 label_rele(tslabel); 5157 return (result); 5158 } 5159
5160 /* 5161 * Search the xattr directory to see if it has any generic user attributes 5162 */ 5163 int 5164 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr) 5165 {
5166 struct uio uio; 5167 struct iovec iov; 5168 char *dbuf; 5169 struct dirent64 *dp; 5170 size_t dlen = 8 * 1024; 5171 size_t dbuflen; 5172 int eof = 0; 5173 int error; 5174
5175 *valp = 0; 5176 dbuf = kmem_alloc(dlen, KM_SLEEP); 5177 uio.uio_iov = &iov; 5178 uio.uio_iovcnt = 1; 5179 uio.uio_segflg = UIO_SYSSPACE;
5180 uio.uio_fmode = 0; 5181 uio.uio_extflg = UIO_COPY_CACHED; 5182 uio.uio_loffset = 0; 5183 uio.uio_resid = dlen; 5184 iov.iov_base = dbuf; 5185 iov.iov_len = dlen;
5186 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 5187 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0); 5188 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 5189
5190 dbuflen = dlen - uio.uio_resid; 5191 5192 if (error || dbuflen == 0) { 5193 kmem_free(dbuf, dlen); 5194 return (error); 5195 } 5196
5197 dp = (dirent64_t *)dbuf; 5198 5199 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5200 if (strcmp(dp->d_name, ".") == 0 || 5201 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name, 5202 VIEW_READWRITE) == 0 || strcmp(dp->d_name, 5203 VIEW_READONLY) == 0) {
5204 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); 5205 continue; 5206 } 5207
5208 *valp = 1; 5209 break; 5210 } 5211 kmem_free(dbuf, dlen); 5212 return (0); 5213 } 5214