/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/flock.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>

#include <sys/tsol/label.h>

/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information that can be reused
 * about the file.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.
 * This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
static rhashq_t *rtable;

static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;

static int rtablesize;
static int rtablemask;

static int hashlen = 4;

static struct kmem_cache *rnode_cache;

/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;

/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE;	/* default: do not allow preepoch */

/*
 * Access cache
 */
static acache_hash_t *acache;
static long nacache;	/* used strictly to size the number of hash queues */

static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;

/*
 * Client side utilities
 */

/*
 * client side statistics
 */
static const struct clstat clstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },
	{ "cltoomany",	KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },
	{ "noresponse",	KSTAT_DATA_UINT64 },
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

/*
 * The following are statistics that describe the behavior of the system
 * as a whole and don't correspond to any one particular zone.
183 */ 184 #ifdef DEBUG 185 static struct clstat_debug { 186 kstat_named_t nrnode; /* number of allocated rnodes */ 187 kstat_named_t access; /* size of access cache */ 188 kstat_named_t dirent; /* size of readdir cache */ 189 kstat_named_t dirents; /* size of readdir buf cache */ 190 kstat_named_t reclaim; /* number of reclaims */ 191 kstat_named_t clreclaim; /* number of cl reclaims */ 192 kstat_named_t f_reclaim; /* number of free reclaims */ 193 kstat_named_t a_reclaim; /* number of active reclaims */ 194 kstat_named_t r_reclaim; /* number of rnode reclaims */ 195 kstat_named_t rpath; /* bytes used to store rpaths */ 196 } clstat_debug = { 197 { "nrnode", KSTAT_DATA_UINT64 }, 198 { "access", KSTAT_DATA_UINT64 }, 199 { "dirent", KSTAT_DATA_UINT64 }, 200 { "dirents", KSTAT_DATA_UINT64 }, 201 { "reclaim", KSTAT_DATA_UINT64 }, 202 { "clreclaim", KSTAT_DATA_UINT64 }, 203 { "f_reclaim", KSTAT_DATA_UINT64 }, 204 { "a_reclaim", KSTAT_DATA_UINT64 }, 205 { "r_reclaim", KSTAT_DATA_UINT64 }, 206 { "r_path", KSTAT_DATA_UINT64 }, 207 }; 208 #endif /* DEBUG */ 209 210 /* 211 * We keep a global list of per-zone client data, so we can clean up all zones 212 * if we get low on memory. 213 */ 214 static list_t nfs_clnt_list; 215 static kmutex_t nfs_clnt_list_lock; 216 static zone_key_t nfsclnt_zone_key; 217 218 static struct kmem_cache *chtab_cache; 219 220 /* 221 * Some servers do not properly update the attributes of the 222 * directory when changes are made. To allow interoperability 223 * with these broken servers, the nfs_disable_rddir_cache 224 * parameter must be set in /etc/system 225 */ 226 int nfs_disable_rddir_cache = 0; 227 228 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 229 struct chtab **); 230 void clfree(CLIENT *, struct chtab *); 231 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 232 struct chtab **, struct nfs_clnt *); 233 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 234 struct chtab **, struct nfs_clnt *); 235 static void clreclaim(void *); 236 static int nfs_feedback(int, int, mntinfo_t *); 237 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 238 caddr_t, cred_t *, int *, enum clnt_stat *, int, 239 failinfo_t *); 240 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 241 caddr_t, cred_t *, int *, int, failinfo_t *); 242 static void rinactive(rnode_t *, cred_t *); 243 static int rtablehash(nfs_fhandle *); 244 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 245 struct vnodeops *, 246 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 247 cred_t *), 248 int (*)(const void *, const void *), int *, cred_t *, 249 char *, char *); 250 static void rp_rmfree(rnode_t *); 251 static void rp_addhash(rnode_t *); 252 static void rp_rmhash_locked(rnode_t *); 253 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 254 static void destroy_rnode(rnode_t *); 255 static void rddir_cache_free(rddir_cache *); 256 static int nfs_free_data_reclaim(rnode_t *); 257 static int nfs_active_data_reclaim(rnode_t *); 258 static int nfs_free_reclaim(void); 259 static int nfs_active_reclaim(void); 260 static int nfs_rnode_reclaim(void); 261 static void nfs_reclaim(void *); 262 static int failover_safe(failinfo_t *); 263 static void failover_newserver(mntinfo_t *mi); 264 static void failover_thread(mntinfo_t *mi); 265 static int failover_wait(mntinfo_t *); 266 static int failover_remap(failinfo_t *); 267 static int failover_lookup(char *, vnode_t *, 268 int (*)(vnode_t *, 
char *, vnode_t **, 269 struct pathname *, int, vnode_t *, cred_t *, int), 270 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 271 vnode_t **); 272 static void nfs_free_r_path(rnode_t *); 273 static void nfs_set_vroot(vnode_t *); 274 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 275 276 /* 277 * from rpcsec module (common/rpcsec) 278 */ 279 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 280 extern void sec_clnt_freeh(AUTH *); 281 extern void sec_clnt_freeinfo(struct sec_data *); 282 283 /* 284 * used in mount policy 285 */ 286 extern ts_label_t *getflabel_cipso(vfs_t *); 287 288 /* 289 * EIO or EINTR are not recoverable errors. 290 */ 291 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 292 293 #ifdef DEBUG 294 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" 295 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" 296 #else 297 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" 298 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" 299 #endif 300 /* 301 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 302 */ 303 static int 304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 305 struct chtab **chp, struct nfs_clnt *nfscl) 306 { 307 struct chhead *ch, *newch; 308 struct chhead **plistp; 309 struct chtab *cp; 310 int error; 311 k_sigset_t smask; 312 313 if (newcl == NULL || chp == NULL || ci == NULL) 314 return (EINVAL); 315 316 *newcl = NULL; 317 *chp = NULL; 318 319 /* 320 * Find an unused handle or create one 321 */ 322 newch = NULL; 323 nfscl->nfscl_stat.clgets.value.ui64++; 324 top: 325 /* 326 * Find the correct entry in the cache to check for free 327 * client handles. The search is based on the RPC program 328 * number, program version number, dev_t for the transport 329 * device, and the protocol family. 330 */ 331 mutex_enter(&nfscl->nfscl_chtable_lock); 332 plistp = &nfscl->nfscl_chtable; 333 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 334 if (ch->ch_prog == ci->cl_prog && 335 ch->ch_vers == ci->cl_vers && 336 ch->ch_dev == svp->sv_knconf->knc_rdev && 337 (strcmp(ch->ch_protofmly, 338 svp->sv_knconf->knc_protofmly) == 0)) 339 break; 340 plistp = &ch->ch_next; 341 } 342 343 /* 344 * If we didn't find a cache entry for this quadruple, then 345 * create one. If we don't have one already preallocated, 346 * then drop the cache lock, create one, and then start over. 347 * If we did have a preallocated entry, then just add it to 348 * the front of the list. 349 */ 350 if (ch == NULL) { 351 if (newch == NULL) { 352 mutex_exit(&nfscl->nfscl_chtable_lock); 353 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 354 newch->ch_timesused = 0; 355 newch->ch_prog = ci->cl_prog; 356 newch->ch_vers = ci->cl_vers; 357 newch->ch_dev = svp->sv_knconf->knc_rdev; 358 newch->ch_protofmly = kmem_alloc( 359 strlen(svp->sv_knconf->knc_protofmly) + 1, 360 KM_SLEEP); 361 (void) strcpy(newch->ch_protofmly, 362 svp->sv_knconf->knc_protofmly); 363 newch->ch_list = NULL; 364 goto top; 365 } 366 ch = newch; 367 newch = NULL; 368 ch->ch_next = nfscl->nfscl_chtable; 369 nfscl->nfscl_chtable = ch; 370 /* 371 * We found a cache entry, but if it isn't on the front of the 372 * list, then move it to the front of the list to try to take 373 * advantage of locality of operations. 
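 *
 * That way a later clget for the same (program, version, transport
 * device, protocol family) quadruple finds its entry at the head of
 * nfscl_chtable without walking the whole list.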
374 */ 375 } else if (ch != nfscl->nfscl_chtable) { 376 *plistp = ch->ch_next; 377 ch->ch_next = nfscl->nfscl_chtable; 378 nfscl->nfscl_chtable = ch; 379 } 380 381 /* 382 * If there was a free client handle cached, then remove it 383 * from the list, init it, and use it. 384 */ 385 if (ch->ch_list != NULL) { 386 cp = ch->ch_list; 387 ch->ch_list = cp->ch_list; 388 mutex_exit(&nfscl->nfscl_chtable_lock); 389 if (newch != NULL) { 390 kmem_free(newch->ch_protofmly, 391 strlen(newch->ch_protofmly) + 1); 392 kmem_free(newch, sizeof (*newch)); 393 } 394 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 395 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 396 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 397 &cp->ch_client->cl_auth); 398 if (error || cp->ch_client->cl_auth == NULL) { 399 CLNT_DESTROY(cp->ch_client); 400 kmem_cache_free(chtab_cache, cp); 401 return ((error != 0) ? error : EINTR); 402 } 403 ch->ch_timesused++; 404 *newcl = cp->ch_client; 405 *chp = cp; 406 return (0); 407 } 408 409 /* 410 * There weren't any free client handles which fit, so allocate 411 * a new one and use that. 412 */ 413 #ifdef DEBUG 414 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); 415 #endif 416 mutex_exit(&nfscl->nfscl_chtable_lock); 417 418 nfscl->nfscl_stat.cltoomany.value.ui64++; 419 if (newch != NULL) { 420 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 421 kmem_free(newch, sizeof (*newch)); 422 } 423 424 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 425 cp->ch_head = ch; 426 427 sigintr(&smask, (int)ci->cl_flags & MI_INT); 428 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 429 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 430 sigunintr(&smask); 431 432 if (error != 0) { 433 kmem_cache_free(chtab_cache, cp); 434 #ifdef DEBUG 435 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 436 #endif 437 /* 438 * Warning is unnecessary if error is EINTR. 439 */ 440 if (error != EINTR) { 441 nfs_cmn_err(error, CE_WARN, 442 "clget: couldn't create handle: %m\n"); 443 } 444 return (error); 445 } 446 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 447 auth_destroy(cp->ch_client->cl_auth); 448 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 449 &cp->ch_client->cl_auth); 450 if (error || cp->ch_client->cl_auth == NULL) { 451 CLNT_DESTROY(cp->ch_client); 452 kmem_cache_free(chtab_cache, cp); 453 #ifdef DEBUG 454 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); 455 #endif 456 return ((error != 0) ? error : EINTR); 457 } 458 ch->ch_timesused++; 459 *newcl = cp->ch_client; 460 ASSERT(cp->ch_client->cl_nosignal == FALSE); 461 *chp = cp; 462 return (0); 463 } 464 465 int 466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 467 struct chtab **chp) 468 { 469 struct nfs_clnt *nfscl; 470 471 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 472 ASSERT(nfscl != NULL); 473 474 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 475 } 476 477 static int 478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 479 struct chtab **chp, struct nfs_clnt *nfscl) 480 { 481 clinfo_t ci; 482 int error; 483 484 /* 485 * Set read buffer size to rsize 486 * and add room for RPC headers. 487 */ 488 ci.cl_readsize = mi->mi_tsize; 489 if (ci.cl_readsize != 0) 490 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 491 492 /* 493 * If soft mount and server is down just try once. 494 * meaning: do not retransmit. 
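 *
 * Setting cl_retrans to 0 (instead of mi_retrans) is what limits the
 * request to a single transmission; clget_impl() passes the value
 * straight to the kernel RPC client it creates or reinitializes.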
495 */ 496 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 497 ci.cl_retrans = 0; 498 else 499 ci.cl_retrans = mi->mi_retrans; 500 501 ci.cl_prog = NFS_ACL_PROGRAM; 502 ci.cl_vers = mi->mi_vers; 503 ci.cl_flags = mi->mi_flags; 504 505 /* 506 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 507 * security flavor, the client tries to establish a security context 508 * by contacting the server. If the connection is timed out or reset, 509 * e.g. server reboot, we will try again. 510 */ 511 do { 512 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 513 514 if (error == 0) 515 break; 516 517 /* 518 * For forced unmount or zone shutdown, bail out, no retry. 519 */ 520 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 521 error = EIO; 522 break; 523 } 524 525 /* do not retry for softmount */ 526 if (!(mi->mi_flags & MI_HARD)) 527 break; 528 529 /* let the caller deal with the failover case */ 530 if (FAILOVER_MOUNT(mi)) 531 break; 532 533 } while (error == ETIMEDOUT || error == ECONNRESET); 534 535 return (error); 536 } 537 538 static int 539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 540 struct chtab **chp, struct nfs_clnt *nfscl) 541 { 542 clinfo_t ci; 543 int error; 544 545 /* 546 * Set read buffer size to rsize 547 * and add room for RPC headers. 548 */ 549 ci.cl_readsize = mi->mi_tsize; 550 if (ci.cl_readsize != 0) 551 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 552 553 /* 554 * If soft mount and server is down just try once. 555 * meaning: do not retransmit. 556 */ 557 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 558 ci.cl_retrans = 0; 559 else 560 ci.cl_retrans = mi->mi_retrans; 561 562 ci.cl_prog = mi->mi_prog; 563 ci.cl_vers = mi->mi_vers; 564 ci.cl_flags = mi->mi_flags; 565 566 /* 567 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 568 * security flavor, the client tries to establish a security context 569 * by contacting the server. If the connection is timed out or reset, 570 * e.g. server reboot, we will try again. 571 */ 572 do { 573 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 574 575 if (error == 0) 576 break; 577 578 /* 579 * For forced unmount or zone shutdown, bail out, no retry. 580 */ 581 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 582 error = EIO; 583 break; 584 } 585 586 /* do not retry for softmount */ 587 if (!(mi->mi_flags & MI_HARD)) 588 break; 589 590 /* let the caller deal with the failover case */ 591 if (FAILOVER_MOUNT(mi)) 592 break; 593 594 } while (error == ETIMEDOUT || error == ECONNRESET); 595 596 return (error); 597 } 598 599 static void 600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 601 { 602 if (cl->cl_auth != NULL) { 603 sec_clnt_freeh(cl->cl_auth); 604 cl->cl_auth = NULL; 605 } 606 607 /* 608 * Timestamp this cache entry so that we know when it was last 609 * used. 610 */ 611 cp->ch_freed = gethrestime_sec(); 612 613 /* 614 * Add the free client handle to the front of the list. 615 * This way, the list will be sorted in youngest to oldest 616 * order. 
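 *
 * clreclaim_zone() depends on this ordering: once it finds the first
 * entry on a list that is old enough to reclaim, everything after it
 * is old enough as well.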
 */
	mutex_enter(&nfscl->nfscl_chtable_lock);
	cp->ch_list = cp->ch_head->ch_list;
	cp->ch_head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable_lock);
}

void
clfree(CLIENT *cl, struct chtab *cp)
{
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	clfree_impl(cl, cp, nfscl);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp;	/* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			*cpp = NULL;
			if (cp != NULL) {
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		cpl = cp->ch_list;
		kmem_cache_free(chtab_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/* ARGSUSED */
static void
clreclaim(void *all)
{
	struct nfs_clnt *nfscl;

#ifdef DEBUG
	clstat_debug.clreclaim.value.ui64++;
#endif
	/*
	 * The system is low on memory; go through and try to reclaim some from
	 * every zone on the system.
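	 *
	 * Handles freed within the last CL_HOLDTIME (60) seconds are left
	 * alone so that recently used client handles survive the reclaim.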
738 */ 739 mutex_enter(&nfs_clnt_list_lock); 740 nfscl = list_head(&nfs_clnt_list); 741 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 742 clreclaim_zone(nfscl, CL_HOLDTIME); 743 mutex_exit(&nfs_clnt_list_lock); 744 } 745 746 /* 747 * Minimum time-out values indexed by call type 748 * These units are in "eights" of a second to avoid multiplies 749 */ 750 static unsigned int minimum_timeo[] = { 751 6, 7, 10 752 }; 753 754 /* 755 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 756 */ 757 #define MAXTIMO (20*hz) 758 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 759 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 760 761 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 762 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 763 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 764 765 /* 766 * Function called when rfscall notices that we have been 767 * re-transmitting, or when we get a response without retransmissions. 768 * Return 1 if the transfer size was adjusted down - 0 if no change. 769 */ 770 static int 771 nfs_feedback(int flag, int which, mntinfo_t *mi) 772 { 773 int kind; 774 int r = 0; 775 776 mutex_enter(&mi->mi_lock); 777 if (flag == FEEDBACK_REXMIT1) { 778 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 779 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 780 goto done; 781 if (mi->mi_curread > MIN_NFS_TSIZE) { 782 mi->mi_curread /= 2; 783 if (mi->mi_curread < MIN_NFS_TSIZE) 784 mi->mi_curread = MIN_NFS_TSIZE; 785 r = 1; 786 } 787 788 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 789 mi->mi_curwrite /= 2; 790 if (mi->mi_curwrite < MIN_NFS_TSIZE) 791 mi->mi_curwrite = MIN_NFS_TSIZE; 792 r = 1; 793 } 794 } else if (flag == FEEDBACK_OK) { 795 kind = mi->mi_timer_type[which]; 796 if (kind == 0 || 797 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 798 goto done; 799 if (kind == 1) { 800 if (mi->mi_curread >= mi->mi_tsize) 801 goto done; 802 mi->mi_curread += MIN_NFS_TSIZE; 803 if (mi->mi_curread > mi->mi_tsize/2) 804 mi->mi_curread = mi->mi_tsize; 805 } else if (kind == 2) { 806 if (mi->mi_curwrite >= mi->mi_stsize) 807 goto done; 808 mi->mi_curwrite += MIN_NFS_TSIZE; 809 if (mi->mi_curwrite > mi->mi_stsize/2) 810 mi->mi_curwrite = mi->mi_stsize; 811 } 812 } 813 done: 814 mutex_exit(&mi->mi_lock); 815 return (r); 816 } 817 818 #ifdef DEBUG 819 static int rfs2call_hits = 0; 820 static int rfs2call_misses = 0; 821 #endif 822 823 int 824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 825 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 826 enum nfsstat *statusp, int flags, failinfo_t *fi) 827 { 828 int rpcerror; 829 enum clnt_stat rpc_status; 830 831 ASSERT(statusp != NULL); 832 833 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 834 cr, douprintf, &rpc_status, flags, fi); 835 if (!rpcerror) { 836 /* 837 * See crnetadjust() for comments. 
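 *
 * If the first call came back with NFSERR_ACCES, the request is
 * reissued once with the adjusted credential, which is freed again
 * afterwards; *statusp then reflects the status of the second reply.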
838 */ 839 if (*statusp == NFSERR_ACCES && 840 (cr = crnetadjust(cr)) != NULL) { 841 #ifdef DEBUG 842 rfs2call_hits++; 843 #endif 844 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 845 resp, cr, douprintf, NULL, flags, fi); 846 crfree(cr); 847 #ifdef DEBUG 848 if (*statusp == NFSERR_ACCES) 849 rfs2call_misses++; 850 #endif 851 } 852 } else if (rpc_status == RPC_PROCUNAVAIL) { 853 *statusp = NFSERR_OPNOTSUPP; 854 rpcerror = 0; 855 } 856 857 return (rpcerror); 858 } 859 860 #define NFS3_JUKEBOX_DELAY 10 * hz 861 862 static clock_t nfs3_jukebox_delay = 0; 863 864 #ifdef DEBUG 865 static int rfs3call_hits = 0; 866 static int rfs3call_misses = 0; 867 #endif 868 869 int 870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 871 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 872 nfsstat3 *statusp, int flags, failinfo_t *fi) 873 { 874 int rpcerror; 875 int user_informed; 876 877 user_informed = 0; 878 do { 879 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 880 cr, douprintf, NULL, flags, fi); 881 if (!rpcerror) { 882 cred_t *crr; 883 if (*statusp == NFS3ERR_JUKEBOX) { 884 if (ttoproc(curthread) == &p0) { 885 rpcerror = EAGAIN; 886 break; 887 } 888 if (!user_informed) { 889 user_informed = 1; 890 uprintf( 891 "file temporarily unavailable on the server, retrying...\n"); 892 } 893 delay(nfs3_jukebox_delay); 894 } 895 /* 896 * See crnetadjust() for comments. 897 */ 898 else if (*statusp == NFS3ERR_ACCES && 899 (crr = crnetadjust(cr)) != NULL) { 900 #ifdef DEBUG 901 rfs3call_hits++; 902 #endif 903 rpcerror = rfscall(mi, which, xdrargs, argsp, 904 xdrres, resp, crr, douprintf, 905 NULL, flags, fi); 906 907 crfree(crr); 908 #ifdef DEBUG 909 if (*statusp == NFS3ERR_ACCES) 910 rfs3call_misses++; 911 #endif 912 } 913 } 914 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 915 916 return (rpcerror); 917 } 918 919 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 920 #define INC_READERS(mi) { \ 921 mi->mi_readers++; \ 922 } 923 #define DEC_READERS(mi) { \ 924 mi->mi_readers--; \ 925 if (mi->mi_readers == 0) \ 926 cv_broadcast(&mi->mi_failover_cv); \ 927 } 928 929 static int 930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 931 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 932 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 933 { 934 CLIENT *client; 935 struct chtab *ch; 936 cred_t *cr = icr; 937 enum clnt_stat status; 938 struct rpc_err rpcerr, rpcerr_tmp; 939 struct timeval wait; 940 int timeo; /* in units of hz */ 941 int my_rsize, my_wsize; 942 bool_t tryagain; 943 bool_t cred_cloned = FALSE; 944 k_sigset_t smask; 945 servinfo_t *svp; 946 struct nfs_clnt *nfscl; 947 zoneid_t zoneid = getzoneid(); 948 char *msg; 949 #ifdef DEBUG 950 char *bufp; 951 #endif 952 953 954 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 955 "rfscall_start:which %d mi %p", which, mi); 956 957 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 958 ASSERT(nfscl != NULL); 959 960 nfscl->nfscl_stat.calls.value.ui64++; 961 mi->mi_reqs[which].value.ui64++; 962 963 rpcerr.re_status = RPC_SUCCESS; 964 965 /* 966 * In case of forced unmount or zone shutdown, return EIO. 967 */ 968 969 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 970 rpcerr.re_status = RPC_FAILED; 971 rpcerr.re_errno = EIO; 972 return (rpcerr.re_errno); 973 } 974 975 /* 976 * Remember the transfer sizes in case 977 * nfs_feedback changes them underneath us. 
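 *
 * The saved values are compared against mi_curread/mi_curwrite later;
 * if the transfer size changed while we were retransmitting, rfscall
 * returns ENFS_TRYAGAIN so the vnode layer can redrive the I/O.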
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "rfscall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);
					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
					 */
					if ((mi->mi_flags & MI_HARD) &&
					    IS_RECOVERABLE_ERROR(remaperr)) {
						if (svp == mi->mi_curr_serv)
							failover_newserver(mi);
						rpcerr.re_status = RPC_SUCCESS;
						goto failoverretry;
					}
					rpcerr.re_errno = remaperr;
					return (remaperr);
				}
			}
			if (fi->fhp && fi->copyproc)
				(*fi->copyproc)(fi->fhp, fi->vp);
		}
	}

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);

	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		DEC_READERS(mi);
		mutex_exit(&mi->mi_lock);

		if ((rpcerr.re_errno == ETIMEDOUT ||
		    rpcerr.re_errno == ECONNRESET) &&
		    failover_safe(fi)) {
			if (svp == mi->mi_curr_serv)
				failover_newserver(mi);
			goto failoverretry;
		}
	}
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
	    svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
		timeo = (mi->mi_timeo * hz) / 10;
	} else {
		mutex_enter(&mi->mi_lock);
		timeo = CLNT_SETTIMERS(client,
		    &(mi->mi_timers[mi->mi_timer_type[which]]),
		    &(mi->mi_timers[NFS_CALLTYPES]),
		    (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
		    (void (*)())NULL, (caddr_t)mi, 0);
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * If hard mounted fs, retry call forever unless hard error occurs.
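	 *
	 * The loop below keeps going for as long as tryagain is set;
	 * success, an interrupt, an unrecoverable RPC error, zone
	 * shutdown, and the soft mount give-up cases all break out of it.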
1091 */ 1092 do { 1093 tryagain = FALSE; 1094 1095 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1096 status = RPC_FAILED; 1097 rpcerr.re_status = RPC_FAILED; 1098 rpcerr.re_errno = EIO; 1099 break; 1100 } 1101 1102 TICK_TO_TIMEVAL(timeo, &wait); 1103 1104 /* 1105 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1106 * and SIGTERM. (Preserving the existing masks). 1107 * Mask out SIGINT if mount option nointr is specified. 1108 */ 1109 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1110 if (!(mi->mi_flags & MI_INT)) 1111 client->cl_nosignal = TRUE; 1112 1113 /* 1114 * If there is a current signal, then don't bother 1115 * even trying to send out the request because we 1116 * won't be able to block waiting for the response. 1117 * Simply assume RPC_INTR and get on with it. 1118 */ 1119 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1120 status = RPC_INTR; 1121 else { 1122 status = CLNT_CALL(client, which, xdrargs, argsp, 1123 xdrres, resp, wait); 1124 } 1125 1126 if (!(mi->mi_flags & MI_INT)) 1127 client->cl_nosignal = FALSE; 1128 /* 1129 * restore original signal mask 1130 */ 1131 sigunintr(&smask); 1132 1133 switch (status) { 1134 case RPC_SUCCESS: 1135 if ((mi->mi_flags & MI_DYNAMIC) && 1136 mi->mi_timer_type[which] != 0 && 1137 (mi->mi_curread != my_rsize || 1138 mi->mi_curwrite != my_wsize)) 1139 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1140 break; 1141 1142 case RPC_INTR: 1143 /* 1144 * There is no way to recover from this error, 1145 * even if mount option nointr is specified. 1146 * SIGKILL, for example, cannot be blocked. 1147 */ 1148 rpcerr.re_status = RPC_INTR; 1149 rpcerr.re_errno = EINTR; 1150 break; 1151 1152 case RPC_UDERROR: 1153 /* 1154 * If the NFS server is local (vold) and 1155 * it goes away then we get RPC_UDERROR. 1156 * This is a retryable error, so we would 1157 * loop, so check to see if the specific 1158 * error was ECONNRESET, indicating that 1159 * target did not exist at all. If so, 1160 * return with RPC_PROGUNAVAIL and 1161 * ECONNRESET to indicate why. 1162 */ 1163 CLNT_GETERR(client, &rpcerr); 1164 if (rpcerr.re_errno == ECONNRESET) { 1165 rpcerr.re_status = RPC_PROGUNAVAIL; 1166 rpcerr.re_errno = ECONNRESET; 1167 break; 1168 } 1169 /*FALLTHROUGH*/ 1170 1171 default: /* probably RPC_TIMEDOUT */ 1172 if (IS_UNRECOVERABLE_RPC(status)) 1173 break; 1174 1175 /* 1176 * increment server not responding count 1177 */ 1178 mutex_enter(&mi->mi_lock); 1179 mi->mi_noresponse++; 1180 mutex_exit(&mi->mi_lock); 1181 #ifdef DEBUG 1182 nfscl->nfscl_stat.noresponse.value.ui64++; 1183 #endif 1184 1185 if (!(mi->mi_flags & MI_HARD)) { 1186 if (!(mi->mi_flags & MI_SEMISOFT) || 1187 (mi->mi_ss_call_type[which] == 0)) 1188 break; 1189 } 1190 1191 /* 1192 * The call is in progress (over COTS). 1193 * Try the CLNT_CALL again, but don't 1194 * print a noisy error message. 1195 */ 1196 if (status == RPC_INPROGRESS) { 1197 tryagain = TRUE; 1198 break; 1199 } 1200 1201 if (flags & RFSCALL_SOFT) 1202 break; 1203 1204 /* 1205 * On zone shutdown, just move on. 1206 */ 1207 if (zone_status_get(curproc->p_zone) >= 1208 ZONE_IS_SHUTTING_DOWN) { 1209 rpcerr.re_status = RPC_FAILED; 1210 rpcerr.re_errno = EIO; 1211 break; 1212 } 1213 1214 /* 1215 * NFS client failover support 1216 * 1217 * If the current server just failed us, we'll 1218 * start the process of finding a new server. 1219 * After that, we can just retry. 
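 *
 * failover_newserver() is only called if mi_curr_serv still points at
 * the server we just used; if another thread has already switched
 * servers, the retry simply picks up the new mi_curr_serv.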
1220 */ 1221 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1222 if (svp == mi->mi_curr_serv) 1223 failover_newserver(mi); 1224 clfree_impl(client, ch, nfscl); 1225 goto failoverretry; 1226 } 1227 1228 tryagain = TRUE; 1229 timeo = backoff(timeo); 1230 1231 CLNT_GETERR(client, &rpcerr_tmp); 1232 if ((status == RPC_CANTSEND) && 1233 (rpcerr_tmp.re_errno == ENOBUFS)) 1234 msg = SRV_QFULL_MSG; 1235 else 1236 msg = SRV_NOTRESP_MSG; 1237 1238 mutex_enter(&mi->mi_lock); 1239 if (!(mi->mi_flags & MI_PRINTED)) { 1240 mi->mi_flags |= MI_PRINTED; 1241 mutex_exit(&mi->mi_lock); 1242 #ifdef DEBUG 1243 zprintf(zoneid, msg, mi->mi_vers, 1244 svp->sv_hostname); 1245 #else 1246 zprintf(zoneid, msg, svp->sv_hostname); 1247 #endif 1248 } else 1249 mutex_exit(&mi->mi_lock); 1250 if (*douprintf && nfs_has_ctty()) { 1251 *douprintf = 0; 1252 if (!(mi->mi_flags & MI_NOPRINT)) 1253 #ifdef DEBUG 1254 uprintf(msg, mi->mi_vers, 1255 svp->sv_hostname); 1256 #else 1257 uprintf(msg, svp->sv_hostname); 1258 #endif 1259 } 1260 1261 /* 1262 * If doing dynamic adjustment of transfer 1263 * size and if it's a read or write call 1264 * and if the transfer size changed while 1265 * retransmitting or if the feedback routine 1266 * changed the transfer size, 1267 * then exit rfscall so that the transfer 1268 * size can be adjusted at the vnops level. 1269 */ 1270 if ((mi->mi_flags & MI_DYNAMIC) && 1271 mi->mi_timer_type[which] != 0 && 1272 (mi->mi_curread != my_rsize || 1273 mi->mi_curwrite != my_wsize || 1274 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1275 /* 1276 * On read or write calls, return 1277 * back to the vnode ops level if 1278 * the transfer size changed. 1279 */ 1280 clfree_impl(client, ch, nfscl); 1281 if (cred_cloned) 1282 crfree(cr); 1283 return (ENFS_TRYAGAIN); 1284 } 1285 } 1286 } while (tryagain); 1287 1288 if (status != RPC_SUCCESS) { 1289 /* 1290 * Let soft mounts use the timed out message. 1291 */ 1292 if (status == RPC_INPROGRESS) 1293 status = RPC_TIMEDOUT; 1294 nfscl->nfscl_stat.badcalls.value.ui64++; 1295 if (status != RPC_INTR) { 1296 mutex_enter(&mi->mi_lock); 1297 mi->mi_flags |= MI_DOWN; 1298 mutex_exit(&mi->mi_lock); 1299 CLNT_GETERR(client, &rpcerr); 1300 #ifdef DEBUG 1301 bufp = clnt_sperror(client, svp->sv_hostname); 1302 zprintf(zoneid, "NFS%d %s failed for %s\n", 1303 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1304 if (nfs_has_ctty()) { 1305 if (!(mi->mi_flags & MI_NOPRINT)) { 1306 uprintf("NFS%d %s failed for %s\n", 1307 mi->mi_vers, mi->mi_rfsnames[which], 1308 bufp); 1309 } 1310 } 1311 kmem_free(bufp, MAXPATHLEN); 1312 #else 1313 zprintf(zoneid, 1314 "NFS %s failed for server %s: error %d (%s)\n", 1315 mi->mi_rfsnames[which], svp->sv_hostname, 1316 status, clnt_sperrno(status)); 1317 if (nfs_has_ctty()) { 1318 if (!(mi->mi_flags & MI_NOPRINT)) { 1319 uprintf( 1320 "NFS %s failed for server %s: error %d (%s)\n", 1321 mi->mi_rfsnames[which], 1322 svp->sv_hostname, status, 1323 clnt_sperrno(status)); 1324 } 1325 } 1326 #endif 1327 /* 1328 * when CLNT_CALL() fails with RPC_AUTHERROR, 1329 * re_errno is set appropriately depending on 1330 * the authentication error 1331 */ 1332 if (status == RPC_VERSMISMATCH || 1333 status == RPC_PROGVERSMISMATCH) 1334 rpcerr.re_errno = EIO; 1335 } 1336 } else { 1337 /* 1338 * Test the value of mi_down and mi_printed without 1339 * holding the mi_lock mutex. If they are both zero, 1340 * then it is okay to skip the down and printed 1341 * processing. This saves on a mutex_enter and 1342 * mutex_exit pair for a normal, successful RPC. 
1343 * This was just complete overhead. 1344 */ 1345 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1346 mutex_enter(&mi->mi_lock); 1347 mi->mi_flags &= ~MI_DOWN; 1348 if (mi->mi_flags & MI_PRINTED) { 1349 mi->mi_flags &= ~MI_PRINTED; 1350 mutex_exit(&mi->mi_lock); 1351 #ifdef DEBUG 1352 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1353 zprintf(zoneid, "NFS%d server %s ok\n", 1354 mi->mi_vers, svp->sv_hostname); 1355 #else 1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1357 zprintf(zoneid, "NFS server %s ok\n", 1358 svp->sv_hostname); 1359 #endif 1360 } else 1361 mutex_exit(&mi->mi_lock); 1362 } 1363 1364 if (*douprintf == 0) { 1365 if (!(mi->mi_flags & MI_NOPRINT)) 1366 #ifdef DEBUG 1367 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1368 uprintf("NFS%d server %s ok\n", 1369 mi->mi_vers, svp->sv_hostname); 1370 #else 1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1372 uprintf("NFS server %s ok\n", svp->sv_hostname); 1373 #endif 1374 *douprintf = 1; 1375 } 1376 } 1377 1378 clfree_impl(client, ch, nfscl); 1379 if (cred_cloned) 1380 crfree(cr); 1381 1382 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1383 1384 if (rpc_status != NULL) 1385 *rpc_status = rpcerr.re_status; 1386 1387 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1388 rpcerr.re_errno); 1389 1390 return (rpcerr.re_errno); 1391 } 1392 1393 #ifdef DEBUG 1394 static int acl2call_hits = 0; 1395 static int acl2call_misses = 0; 1396 #endif 1397 1398 int 1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1400 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1401 enum nfsstat *statusp, int flags, failinfo_t *fi) 1402 { 1403 int rpcerror; 1404 1405 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1406 cr, douprintf, flags, fi); 1407 if (!rpcerror) { 1408 /* 1409 * See comments with crnetadjust(). 1410 */ 1411 if (*statusp == NFSERR_ACCES && 1412 (cr = crnetadjust(cr)) != NULL) { 1413 #ifdef DEBUG 1414 acl2call_hits++; 1415 #endif 1416 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1417 resp, cr, douprintf, flags, fi); 1418 crfree(cr); 1419 #ifdef DEBUG 1420 if (*statusp == NFSERR_ACCES) 1421 acl2call_misses++; 1422 #endif 1423 } 1424 } 1425 1426 return (rpcerror); 1427 } 1428 1429 #ifdef DEBUG 1430 static int acl3call_hits = 0; 1431 static int acl3call_misses = 0; 1432 #endif 1433 1434 int 1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1436 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1437 nfsstat3 *statusp, int flags, failinfo_t *fi) 1438 { 1439 int rpcerror; 1440 int user_informed; 1441 1442 user_informed = 0; 1443 1444 do { 1445 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1446 cr, douprintf, flags, fi); 1447 if (!rpcerror) { 1448 cred_t *crr; 1449 if (*statusp == NFS3ERR_JUKEBOX) { 1450 if (!user_informed) { 1451 user_informed = 1; 1452 uprintf( 1453 "file temporarily unavailable on the server, retrying...\n"); 1454 } 1455 delay(nfs3_jukebox_delay); 1456 } 1457 /* 1458 * See crnetadjust() for comments. 
			 */
			else if (*statusp == NFS3ERR_ACCES &&
			    (crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
				acl3call_hits++;
#endif
				rpcerror = aclcall(mi, which, xdrargs, argsp,
				    xdrres, resp, crr, douprintf, flags, fi);

				crfree(crr);
#ifdef DEBUG
				if (*statusp == NFS3ERR_ACCES)
					acl3call_misses++;
#endif
			}
		}
	} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);

	return (rpcerror);
}

static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
    int flags, failinfo_t *fi)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	bool_t cred_cloned = FALSE;
	enum clnt_stat status;
	struct rpc_err rpcerr;
	struct timeval wait;
	int timeo;		/* in units of hz */
#if 0 /* notyet */
	int my_rsize, my_wsize;
#endif
	bool_t tryagain;
	k_sigset_t smask;
	servinfo_t *svp;
	struct nfs_clnt *nfscl;
	zoneid_t zoneid = getzoneid();
#ifdef DEBUG
	char *bufp;
#endif

#if 0 /* notyet */
	TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
	    "rfscall_start:which %d mi %p", which, mi);
#endif

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_aclreqs[which].value.ui64++;

	rpcerr.re_status = RPC_SUCCESS;

	if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
		rpcerr.re_status = RPC_FAILED;
		rpcerr.re_errno = EIO;
		return (rpcerr.re_errno);
	}

#if 0 /* notyet */
	/*
	 * Remember the transfer sizes in case
	 * nfs_feedback changes them underneath us.
	 */
	my_rsize = mi->mi_curread;
	my_wsize = mi->mi_curwrite;
#endif

	/*
	 * NFS client failover support
	 *
	 * If this rnode is not in sync with the current server (VALID_FH),
	 * we'd like to do a remap to get in sync.  We can be interrupted
	 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
	 * use the best info we have to try the RPC.  Part of that is
	 * unconditionally updating the filehandle copy kept for V3.
	 *
	 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
	 * rw_enter(); we're trying to keep the current server from being
	 * changed on us until we're done with the remapping and have a
	 * matching client handle.  We don't want to send a filehandle
	 * to the wrong host.
	 */
failoverretry:
	if (FAILOVER_MOUNT(mi)) {
		mutex_enter(&mi->mi_lock);
		if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
			if (failover_wait(mi)) {
				mutex_exit(&mi->mi_lock);
				return (EINTR);
			}
		}
		INC_READERS(mi);
		mutex_exit(&mi->mi_lock);
		if (fi) {
			if (!VALID_FH(fi) &&
			    !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
				int remaperr;

				svp = mi->mi_curr_serv;
				remaperr = failover_remap(fi);
				if (remaperr != 0) {
#ifdef DEBUG
					if (remaperr != EINTR)
						nfs_cmn_err(remaperr, CE_WARN,
					    "aclcall couldn't failover: %m");
#endif
					mutex_enter(&mi->mi_lock);
					DEC_READERS(mi);
					mutex_exit(&mi->mi_lock);

					/*
					 * If failover_remap returns ETIMEDOUT
					 * and the filesystem is hard mounted
					 * we have to retry the call with a new
					 * server.
1581 */ 1582 if ((mi->mi_flags & MI_HARD) && 1583 IS_RECOVERABLE_ERROR(remaperr)) { 1584 if (svp == mi->mi_curr_serv) 1585 failover_newserver(mi); 1586 rpcerr.re_status = RPC_SUCCESS; 1587 goto failoverretry; 1588 } 1589 return (remaperr); 1590 } 1591 } 1592 if (fi->fhp && fi->copyproc) 1593 (*fi->copyproc)(fi->fhp, fi->vp); 1594 } 1595 } 1596 1597 /* For TSOL, use a new cred which has net_mac_aware flag */ 1598 if (!cred_cloned && is_system_labeled()) { 1599 cred_cloned = TRUE; 1600 cr = crdup(icr); 1601 (void) setpflags(NET_MAC_AWARE, 1, cr); 1602 } 1603 1604 /* 1605 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1606 * are guaranteed to reprocess the retry as a new request. 1607 */ 1608 svp = mi->mi_curr_serv; 1609 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1610 if (FAILOVER_MOUNT(mi)) { 1611 mutex_enter(&mi->mi_lock); 1612 DEC_READERS(mi); 1613 mutex_exit(&mi->mi_lock); 1614 1615 if ((rpcerr.re_errno == ETIMEDOUT || 1616 rpcerr.re_errno == ECONNRESET) && 1617 failover_safe(fi)) { 1618 if (svp == mi->mi_curr_serv) 1619 failover_newserver(mi); 1620 goto failoverretry; 1621 } 1622 } 1623 if (rpcerr.re_errno != 0) { 1624 if (cred_cloned) 1625 crfree(cr); 1626 return (rpcerr.re_errno); 1627 } 1628 1629 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1630 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1631 timeo = (mi->mi_timeo * hz) / 10; 1632 } else { 1633 mutex_enter(&mi->mi_lock); 1634 timeo = CLNT_SETTIMERS(client, 1635 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1636 &(mi->mi_timers[NFS_CALLTYPES]), 1637 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1638 (void (*)()) 0, (caddr_t)mi, 0); 1639 mutex_exit(&mi->mi_lock); 1640 } 1641 1642 /* 1643 * If hard mounted fs, retry call forever unless hard error occurs. 1644 */ 1645 do { 1646 tryagain = FALSE; 1647 1648 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1649 status = RPC_FAILED; 1650 rpcerr.re_status = RPC_FAILED; 1651 rpcerr.re_errno = EIO; 1652 break; 1653 } 1654 1655 TICK_TO_TIMEVAL(timeo, &wait); 1656 1657 /* 1658 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1659 * and SIGTERM. (Preserving the existing masks). 1660 * Mask out SIGINT if mount option nointr is specified. 1661 */ 1662 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1663 if (!(mi->mi_flags & MI_INT)) 1664 client->cl_nosignal = TRUE; 1665 1666 /* 1667 * If there is a current signal, then don't bother 1668 * even trying to send out the request because we 1669 * won't be able to block waiting for the response. 1670 * Simply assume RPC_INTR and get on with it. 1671 */ 1672 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1673 status = RPC_INTR; 1674 else { 1675 status = CLNT_CALL(client, which, xdrargs, argsp, 1676 xdrres, resp, wait); 1677 } 1678 1679 if (!(mi->mi_flags & MI_INT)) 1680 client->cl_nosignal = FALSE; 1681 /* 1682 * restore original signal mask 1683 */ 1684 sigunintr(&smask); 1685 1686 switch (status) { 1687 case RPC_SUCCESS: 1688 #if 0 /* notyet */ 1689 if ((mi->mi_flags & MI_DYNAMIC) && 1690 mi->mi_timer_type[which] != 0 && 1691 (mi->mi_curread != my_rsize || 1692 mi->mi_curwrite != my_wsize)) 1693 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1694 #endif 1695 break; 1696 1697 /* 1698 * Unfortunately, there are servers in the world which 1699 * are not coded correctly. They are not prepared to 1700 * handle RPC requests to the NFS port which are not 1701 * NFS requests. Thus, they may try to process the 1702 * NFS_ACL request as if it were an NFS request. This 1703 * does not work. 
Generally, an error will be generated 1704 * on the client because it will not be able to decode 1705 * the response from the server. However, it seems 1706 * possible that the server may not be able to decode 1707 * the arguments. Thus, the criteria for deciding 1708 * whether the server supports NFS_ACL or not is whether 1709 * the following RPC errors are returned from CLNT_CALL. 1710 */ 1711 case RPC_CANTDECODERES: 1712 case RPC_PROGUNAVAIL: 1713 case RPC_CANTDECODEARGS: 1714 case RPC_PROGVERSMISMATCH: 1715 mutex_enter(&mi->mi_lock); 1716 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1717 mutex_exit(&mi->mi_lock); 1718 break; 1719 1720 /* 1721 * If the server supports NFS_ACL but not the new ops 1722 * for extended attributes, make sure we don't retry. 1723 */ 1724 case RPC_PROCUNAVAIL: 1725 mutex_enter(&mi->mi_lock); 1726 mi->mi_flags &= ~MI_EXTATTR; 1727 mutex_exit(&mi->mi_lock); 1728 break; 1729 1730 case RPC_INTR: 1731 /* 1732 * There is no way to recover from this error, 1733 * even if mount option nointr is specified. 1734 * SIGKILL, for example, cannot be blocked. 1735 */ 1736 rpcerr.re_status = RPC_INTR; 1737 rpcerr.re_errno = EINTR; 1738 break; 1739 1740 case RPC_UDERROR: 1741 /* 1742 * If the NFS server is local (vold) and 1743 * it goes away then we get RPC_UDERROR. 1744 * This is a retryable error, so we would 1745 * loop, so check to see if the specific 1746 * error was ECONNRESET, indicating that 1747 * target did not exist at all. If so, 1748 * return with RPC_PROGUNAVAIL and 1749 * ECONNRESET to indicate why. 1750 */ 1751 CLNT_GETERR(client, &rpcerr); 1752 if (rpcerr.re_errno == ECONNRESET) { 1753 rpcerr.re_status = RPC_PROGUNAVAIL; 1754 rpcerr.re_errno = ECONNRESET; 1755 break; 1756 } 1757 /*FALLTHROUGH*/ 1758 1759 default: /* probably RPC_TIMEDOUT */ 1760 if (IS_UNRECOVERABLE_RPC(status)) 1761 break; 1762 1763 /* 1764 * increment server not responding count 1765 */ 1766 mutex_enter(&mi->mi_lock); 1767 mi->mi_noresponse++; 1768 mutex_exit(&mi->mi_lock); 1769 #ifdef DEBUG 1770 nfscl->nfscl_stat.noresponse.value.ui64++; 1771 #endif 1772 1773 if (!(mi->mi_flags & MI_HARD)) { 1774 if (!(mi->mi_flags & MI_SEMISOFT) || 1775 (mi->mi_acl_ss_call_type[which] == 0)) 1776 break; 1777 } 1778 1779 /* 1780 * The call is in progress (over COTS). 1781 * Try the CLNT_CALL again, but don't 1782 * print a noisy error message. 1783 */ 1784 if (status == RPC_INPROGRESS) { 1785 tryagain = TRUE; 1786 break; 1787 } 1788 1789 if (flags & RFSCALL_SOFT) 1790 break; 1791 1792 /* 1793 * On zone shutdown, just move on. 1794 */ 1795 if (zone_status_get(curproc->p_zone) >= 1796 ZONE_IS_SHUTTING_DOWN) { 1797 rpcerr.re_status = RPC_FAILED; 1798 rpcerr.re_errno = EIO; 1799 break; 1800 } 1801 1802 /* 1803 * NFS client failover support 1804 * 1805 * If the current server just failed us, we'll 1806 * start the process of finding a new server. 1807 * After that, we can just retry. 
1808 */ 1809 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1810 if (svp == mi->mi_curr_serv) 1811 failover_newserver(mi); 1812 clfree_impl(client, ch, nfscl); 1813 goto failoverretry; 1814 } 1815 1816 tryagain = TRUE; 1817 timeo = backoff(timeo); 1818 mutex_enter(&mi->mi_lock); 1819 if (!(mi->mi_flags & MI_PRINTED)) { 1820 mi->mi_flags |= MI_PRINTED; 1821 mutex_exit(&mi->mi_lock); 1822 #ifdef DEBUG 1823 zprintf(zoneid, 1824 "NFS_ACL%d server %s not responding still trying\n", 1825 mi->mi_vers, svp->sv_hostname); 1826 #else 1827 zprintf(zoneid, 1828 "NFS server %s not responding still trying\n", 1829 svp->sv_hostname); 1830 #endif 1831 } else 1832 mutex_exit(&mi->mi_lock); 1833 if (*douprintf && nfs_has_ctty()) { 1834 *douprintf = 0; 1835 if (!(mi->mi_flags & MI_NOPRINT)) 1836 #ifdef DEBUG 1837 uprintf( 1838 "NFS_ACL%d server %s not responding still trying\n", 1839 mi->mi_vers, svp->sv_hostname); 1840 #else 1841 uprintf( 1842 "NFS server %s not responding still trying\n", 1843 svp->sv_hostname); 1844 #endif 1845 } 1846 1847 #if 0 /* notyet */ 1848 /* 1849 * If doing dynamic adjustment of transfer 1850 * size and if it's a read or write call 1851 * and if the transfer size changed while 1852 * retransmitting or if the feedback routine 1853 * changed the transfer size, 1854 * then exit rfscall so that the transfer 1855 * size can be adjusted at the vnops level. 1856 */ 1857 if ((mi->mi_flags & MI_DYNAMIC) && 1858 mi->mi_acl_timer_type[which] != 0 && 1859 (mi->mi_curread != my_rsize || 1860 mi->mi_curwrite != my_wsize || 1861 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1862 /* 1863 * On read or write calls, return 1864 * back to the vnode ops level if 1865 * the transfer size changed. 1866 */ 1867 clfree_impl(client, ch, nfscl); 1868 if (cred_cloned) 1869 crfree(cr); 1870 return (ENFS_TRYAGAIN); 1871 } 1872 #endif 1873 } 1874 } while (tryagain); 1875 1876 if (status != RPC_SUCCESS) { 1877 /* 1878 * Let soft mounts use the timed out message. 
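		 *
		 * RPC_INPROGRESS is folded into RPC_TIMEDOUT so the failure
		 * below is reported with the ordinary timed-out status.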
1879 */ 1880 if (status == RPC_INPROGRESS) 1881 status = RPC_TIMEDOUT; 1882 nfscl->nfscl_stat.badcalls.value.ui64++; 1883 if (status == RPC_CANTDECODERES || 1884 status == RPC_PROGUNAVAIL || 1885 status == RPC_PROCUNAVAIL || 1886 status == RPC_CANTDECODEARGS || 1887 status == RPC_PROGVERSMISMATCH) 1888 CLNT_GETERR(client, &rpcerr); 1889 else if (status != RPC_INTR) { 1890 mutex_enter(&mi->mi_lock); 1891 mi->mi_flags |= MI_DOWN; 1892 mutex_exit(&mi->mi_lock); 1893 CLNT_GETERR(client, &rpcerr); 1894 #ifdef DEBUG 1895 bufp = clnt_sperror(client, svp->sv_hostname); 1896 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1897 mi->mi_vers, mi->mi_aclnames[which], bufp); 1898 if (nfs_has_ctty()) { 1899 if (!(mi->mi_flags & MI_NOPRINT)) { 1900 uprintf("NFS_ACL%d %s failed for %s\n", 1901 mi->mi_vers, mi->mi_aclnames[which], 1902 bufp); 1903 } 1904 } 1905 kmem_free(bufp, MAXPATHLEN); 1906 #else 1907 zprintf(zoneid, 1908 "NFS %s failed for server %s: error %d (%s)\n", 1909 mi->mi_aclnames[which], svp->sv_hostname, 1910 status, clnt_sperrno(status)); 1911 if (nfs_has_ctty()) { 1912 if (!(mi->mi_flags & MI_NOPRINT)) 1913 uprintf( 1914 "NFS %s failed for server %s: error %d (%s)\n", 1915 mi->mi_aclnames[which], 1916 svp->sv_hostname, status, 1917 clnt_sperrno(status)); 1918 } 1919 #endif 1920 /* 1921 * when CLNT_CALL() fails with RPC_AUTHERROR, 1922 * re_errno is set appropriately depending on 1923 * the authentication error 1924 */ 1925 if (status == RPC_VERSMISMATCH || 1926 status == RPC_PROGVERSMISMATCH) 1927 rpcerr.re_errno = EIO; 1928 } 1929 } else { 1930 /* 1931 * Test the value of mi_down and mi_printed without 1932 * holding the mi_lock mutex. If they are both zero, 1933 * then it is okay to skip the down and printed 1934 * processing. This saves on a mutex_enter and 1935 * mutex_exit pair for a normal, successful RPC. 1936 * This was just complete overhead. 
1937 */ 1938 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1939 mutex_enter(&mi->mi_lock); 1940 mi->mi_flags &= ~MI_DOWN; 1941 if (mi->mi_flags & MI_PRINTED) { 1942 mi->mi_flags &= ~MI_PRINTED; 1943 mutex_exit(&mi->mi_lock); 1944 #ifdef DEBUG 1945 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1946 mi->mi_vers, svp->sv_hostname); 1947 #else 1948 zprintf(zoneid, "NFS server %s ok\n", 1949 svp->sv_hostname); 1950 #endif 1951 } else 1952 mutex_exit(&mi->mi_lock); 1953 } 1954 1955 if (*douprintf == 0) { 1956 if (!(mi->mi_flags & MI_NOPRINT)) 1957 #ifdef DEBUG 1958 uprintf("NFS_ACL%d server %s ok\n", 1959 mi->mi_vers, svp->sv_hostname); 1960 #else 1961 uprintf("NFS server %s ok\n", svp->sv_hostname); 1962 #endif 1963 *douprintf = 1; 1964 } 1965 } 1966 1967 clfree_impl(client, ch, nfscl); 1968 if (cred_cloned) 1969 crfree(cr); 1970 1971 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1972 1973 #if 0 /* notyet */ 1974 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1975 rpcerr.re_errno); 1976 #endif 1977 1978 return (rpcerr.re_errno); 1979 } 1980 1981 int 1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1983 { 1984 uint_t mask = vap->va_mask; 1985 1986 if (!(mask & AT_MODE)) 1987 sa->sa_mode = (uint32_t)-1; 1988 else 1989 sa->sa_mode = vap->va_mode; 1990 if (!(mask & AT_UID)) 1991 sa->sa_uid = (uint32_t)-1; 1992 else 1993 sa->sa_uid = (uint32_t)vap->va_uid; 1994 if (!(mask & AT_GID)) 1995 sa->sa_gid = (uint32_t)-1; 1996 else 1997 sa->sa_gid = (uint32_t)vap->va_gid; 1998 if (!(mask & AT_SIZE)) 1999 sa->sa_size = (uint32_t)-1; 2000 else 2001 sa->sa_size = (uint32_t)vap->va_size; 2002 if (!(mask & AT_ATIME)) 2003 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2004 else { 2005 /* check time validity */ 2006 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2007 return (EOVERFLOW); 2008 } 2009 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2010 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2011 } 2012 if (!(mask & AT_MTIME)) 2013 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2014 else { 2015 /* check time validity */ 2016 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2017 return (EOVERFLOW); 2018 } 2019 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2020 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2021 } 2022 return (0); 2023 } 2024 2025 int 2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2027 { 2028 uint_t mask = vap->va_mask; 2029 2030 if (!(mask & AT_MODE)) 2031 sa->mode.set_it = FALSE; 2032 else { 2033 sa->mode.set_it = TRUE; 2034 sa->mode.mode = (mode3)vap->va_mode; 2035 } 2036 if (!(mask & AT_UID)) 2037 sa->uid.set_it = FALSE; 2038 else { 2039 sa->uid.set_it = TRUE; 2040 sa->uid.uid = (uid3)vap->va_uid; 2041 } 2042 if (!(mask & AT_GID)) 2043 sa->gid.set_it = FALSE; 2044 else { 2045 sa->gid.set_it = TRUE; 2046 sa->gid.gid = (gid3)vap->va_gid; 2047 } 2048 if (!(mask & AT_SIZE)) 2049 sa->size.set_it = FALSE; 2050 else { 2051 sa->size.set_it = TRUE; 2052 sa->size.size = (size3)vap->va_size; 2053 } 2054 if (!(mask & AT_ATIME)) 2055 sa->atime.set_it = DONT_CHANGE; 2056 else { 2057 /* check time validity */ 2058 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2059 return (EOVERFLOW); 2060 } 2061 sa->atime.set_it = SET_TO_CLIENT_TIME; 2062 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2063 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2064 } 2065 if (!(mask & AT_MTIME)) 2066 sa->mtime.set_it = DONT_CHANGE; 2067 else { 2068 /* check time validity */ 2069 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2070 return (EOVERFLOW); 2071 } 2072 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2073 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2074 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2075 } 2076 return (0); 2077 } 2078 2079 void 2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2081 { 2082 2083 da->da_fhandle = VTOFH(dvp); 2084 da->da_name = nm; 2085 da->da_flags = 0; 2086 } 2087 2088 void 2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2090 { 2091 2092 da->dirp = VTOFH3(dvp); 2093 da->name = nm; 2094 } 2095 2096 int 2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2098 { 2099 int error; 2100 rnode_t *rp; 2101 struct vattr va; 2102 2103 va.va_mask = AT_MODE | AT_GID; 2104 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2105 if (error) 2106 return (error); 2107 2108 /* 2109 * To determine the expected group-id of the created file: 2110 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2111 * GRPID option, and the directory's set-gid bit is clear, 2112 * then use the process's gid. 2113 * 2) Otherwise, set the group-id to the gid of the parent directory. 2114 */ 2115 rp = VTOR(dvp); 2116 mutex_enter(&rp->r_statelock); 2117 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2118 *gidp = crgetgid(cr); 2119 else 2120 *gidp = va.va_gid; 2121 mutex_exit(&rp->r_statelock); 2122 return (0); 2123 } 2124 2125 int 2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2127 { 2128 int error; 2129 struct vattr va; 2130 2131 va.va_mask = AT_MODE; 2132 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2133 if (error) 2134 return (error); 2135 2136 /* 2137 * Modify the expected mode (om) so that the set-gid bit matches 2138 * that of the parent directory (dvp). 2139 */ 2140 if (va.va_mode & VSGID) 2141 *omp |= VSGID; 2142 else 2143 *omp &= ~VSGID; 2144 return (0); 2145 } 2146 2147 void 2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2149 { 2150 2151 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2152 if (!(vp->v_flag & VSWAPLIKE)) { 2153 mutex_enter(&vp->v_lock); 2154 vp->v_flag |= VSWAPLIKE; 2155 mutex_exit(&vp->v_lock); 2156 } 2157 } else { 2158 if (vp->v_flag & VSWAPLIKE) { 2159 mutex_enter(&vp->v_lock); 2160 vp->v_flag &= ~VSWAPLIKE; 2161 mutex_exit(&vp->v_lock); 2162 } 2163 } 2164 } 2165 2166 /* 2167 * Free the resources associated with an rnode. 2168 */ 2169 static void 2170 rinactive(rnode_t *rp, cred_t *cr) 2171 { 2172 vnode_t *vp; 2173 cred_t *cred; 2174 char *contents; 2175 int size; 2176 vsecattr_t *vsp; 2177 int error; 2178 nfs3_pathconf_info *info; 2179 2180 /* 2181 * Before freeing anything, wait until all asynchronous 2182 * activity is done on this rnode. This will allow all 2183 * asynchronous read ahead and write behind i/o's to 2184 * finish. 2185 */ 2186 mutex_enter(&rp->r_statelock); 2187 while (rp->r_count > 0) 2188 cv_wait(&rp->r_cv, &rp->r_statelock); 2189 mutex_exit(&rp->r_statelock); 2190 2191 /* 2192 * Flush and invalidate all pages associated with the vnode. 
2193 */ 2194 vp = RTOV(rp); 2195 if (vn_has_cached_data(vp)) { 2196 ASSERT(vp->v_type != VCHR); 2197 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2198 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2199 if (error && (error == ENOSPC || error == EDQUOT)) { 2200 mutex_enter(&rp->r_statelock); 2201 if (!rp->r_error) 2202 rp->r_error = error; 2203 mutex_exit(&rp->r_statelock); 2204 } 2205 } 2206 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2207 } 2208 2209 /* 2210 * Free any held credentials and caches which may be associated 2211 * with this rnode. 2212 */ 2213 mutex_enter(&rp->r_statelock); 2214 cred = rp->r_cred; 2215 rp->r_cred = NULL; 2216 contents = rp->r_symlink.contents; 2217 size = rp->r_symlink.size; 2218 rp->r_symlink.contents = NULL; 2219 vsp = rp->r_secattr; 2220 rp->r_secattr = NULL; 2221 info = rp->r_pathconf; 2222 rp->r_pathconf = NULL; 2223 mutex_exit(&rp->r_statelock); 2224 2225 /* 2226 * Free the held credential. 2227 */ 2228 if (cred != NULL) 2229 crfree(cred); 2230 2231 /* 2232 * Free the access cache entries. 2233 */ 2234 (void) nfs_access_purge_rp(rp); 2235 2236 /* 2237 * Free the readdir cache entries. 2238 */ 2239 if (HAVE_RDDIR_CACHE(rp)) 2240 nfs_purge_rddir_cache(vp); 2241 2242 /* 2243 * Free the symbolic link cache. 2244 */ 2245 if (contents != NULL) { 2246 2247 kmem_free((void *)contents, size); 2248 } 2249 2250 /* 2251 * Free any cached ACL. 2252 */ 2253 if (vsp != NULL) 2254 nfs_acl_free(vsp); 2255 2256 /* 2257 * Free any cached pathconf information. 2258 */ 2259 if (info != NULL) 2260 kmem_free(info, sizeof (*info)); 2261 } 2262 2263 /* 2264 * Return a vnode for the given NFS Version 2 file handle. 2265 * If no rnode exists for this fhandle, create one and put it 2266 * into the hash queues. If the rnode for this fhandle 2267 * already exists, return it. 2268 * 2269 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2270 */ 2271 vnode_t * 2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2273 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2274 { 2275 int newnode; 2276 int index; 2277 vnode_t *vp; 2278 nfs_fhandle nfh; 2279 vattr_t va; 2280 2281 nfh.fh_len = NFS_FHSIZE; 2282 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2283 2284 index = rtablehash(&nfh); 2285 rw_enter(&rtable[index].r_lock, RW_READER); 2286 2287 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2288 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2289 2290 if (attr != NULL) { 2291 if (!newnode) { 2292 rw_exit(&rtable[index].r_lock); 2293 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2294 } else { 2295 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2296 vp->v_type = VBAD; 2297 else 2298 vp->v_type = n2v_type(attr); 2299 /* 2300 * A translation here seems to be necessary 2301 * because this function can be called 2302 * with `attr' that has come from the wire, 2303 * and been operated on by vattr_to_nattr(). 2304 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2305 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2306 * ->makenfsnode(). 2307 */ 2308 if ((attr->na_rdev & 0xffff0000) == 0) 2309 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2310 else 2311 vp->v_rdev = expldev(n2v_rdev(attr)); 2312 nfs_attrcache(vp, attr, t); 2313 rw_exit(&rtable[index].r_lock); 2314 } 2315 } else { 2316 if (newnode) { 2317 PURGE_ATTRCACHE(vp); 2318 } 2319 rw_exit(&rtable[index].r_lock); 2320 } 2321 2322 return (vp); 2323 } 2324 2325 /* 2326 * Return a vnode for the given NFS Version 3 file handle. 
2327 * If no rnode exists for this fhandle, create one and put it 2328 * into the hash queues. If the rnode for this fhandle 2329 * already exists, return it. 2330 * 2331 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 2332 */ 2333 vnode_t * 2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2335 cred_t *cr, char *dnm, char *nm) 2336 { 2337 int newnode; 2338 int index; 2339 vnode_t *vp; 2340 2341 index = rtablehash((nfs_fhandle *)fh); 2342 rw_enter(&rtable[index].r_lock, RW_READER); 2343 2344 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2345 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2346 dnm, nm); 2347 2348 if (vap == NULL) { 2349 if (newnode) { 2350 PURGE_ATTRCACHE(vp); 2351 } 2352 rw_exit(&rtable[index].r_lock); 2353 return (vp); 2354 } 2355 2356 if (!newnode) { 2357 rw_exit(&rtable[index].r_lock); 2358 nfs_attr_cache(vp, vap, t, cr); 2359 } else { 2360 rnode_t *rp = VTOR(vp); 2361 2362 vp->v_type = vap->va_type; 2363 vp->v_rdev = vap->va_rdev; 2364 2365 mutex_enter(&rp->r_statelock); 2366 if (rp->r_mtime <= t) 2367 nfs_attrcache_va(vp, vap); 2368 mutex_exit(&rp->r_statelock); 2369 rw_exit(&rtable[index].r_lock); 2370 } 2371 2372 return (vp); 2373 } 2374 2375 vnode_t * 2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2377 cred_t *cr, char *dnm, char *nm) 2378 { 2379 int newnode; 2380 int index; 2381 vnode_t *vp; 2382 vattr_t va; 2383 2384 index = rtablehash((nfs_fhandle *)fh); 2385 rw_enter(&rtable[index].r_lock, RW_READER); 2386 2387 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2388 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2389 dnm, nm); 2390 2391 if (attr == NULL) { 2392 if (newnode) { 2393 PURGE_ATTRCACHE(vp); 2394 } 2395 rw_exit(&rtable[index].r_lock); 2396 return (vp); 2397 } 2398 2399 if (!newnode) { 2400 rw_exit(&rtable[index].r_lock); 2401 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2402 } else { 2403 if (attr->type < NF3REG || attr->type > NF3FIFO) 2404 vp->v_type = VBAD; 2405 else 2406 vp->v_type = nf3_to_vt[attr->type]; 2407 vp->v_rdev = makedevice(attr->rdev.specdata1, 2408 attr->rdev.specdata2); 2409 nfs3_attrcache(vp, attr, t); 2410 rw_exit(&rtable[index].r_lock); 2411 } 2412 2413 return (vp); 2414 } 2415 2416 /* 2417 * Read this comment before making changes to rtablehash()! 2418 * This is a hash function in which seemingly obvious and harmless 2419 * changes can cause escalations costing million dollars! 2420 * Know what you are doing. 2421 * 2422 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2423 * algorithm is currently detailed here: 2424 * 2425 * http://burtleburtle.net/bob/hash/doobs.html 2426 * 2427 * Of course, the above link may not be valid by the time you are reading 2428 * this, but suffice it to say that the one-at-a-time algorithm works well in 2429 * almost all cases. If you are changing the algorithm be sure to verify that 2430 * the hash algorithm still provides even distribution in all cases and with 2431 * any server returning filehandles in whatever order (sequential or random). 
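 *
 * One property worth keeping in mind: the final "hash & rtablemask"
 * below is only a cheap substitute for a modulo because rtablesize is
 * a power of two (nfs_subrinit() sets rtablesize to
 * 1 << highbit(nrnode / hashlen) and rtablemask to rtablesize - 1).
 *
 * A hypothetical way to sanity-check a change (illustrative only, not
 * part of the build) is to hash a large set of captured filehandles
 * and compare bucket occupancy, e.g.:
 *
 *	for each captured filehandle fh:
 *		counts[rtablehash(&fh)]++;
 *	then verify max(counts) stays reasonably close to the average
 *	chain length (roughly hashlen).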
2432 */ 2433 static int 2434 rtablehash(nfs_fhandle *fh) 2435 { 2436 ulong_t hash, len, i; 2437 char *key; 2438 2439 key = fh->fh_buf; 2440 len = (ulong_t)fh->fh_len; 2441 for (hash = 0, i = 0; i < len; i++) { 2442 hash += key[i]; 2443 hash += (hash << 10); 2444 hash ^= (hash >> 6); 2445 } 2446 hash += (hash << 3); 2447 hash ^= (hash >> 11); 2448 hash += (hash << 15); 2449 return (hash & rtablemask); 2450 } 2451 2452 static vnode_t * 2453 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2454 struct vnodeops *vops, 2455 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2456 int (*compar)(const void *, const void *), 2457 int *newnode, cred_t *cr, char *dnm, char *nm) 2458 { 2459 rnode_t *rp; 2460 rnode_t *trp; 2461 vnode_t *vp; 2462 mntinfo_t *mi; 2463 2464 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2465 2466 mi = VFTOMI(vfsp); 2467 start: 2468 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2469 vp = RTOV(rp); 2470 nfs_set_vroot(vp); 2471 *newnode = 0; 2472 return (vp); 2473 } 2474 rw_exit(&rhtp->r_lock); 2475 2476 mutex_enter(&rpfreelist_lock); 2477 if (rpfreelist != NULL && rnew >= nrnode) { 2478 rp = rpfreelist; 2479 rp_rmfree(rp); 2480 mutex_exit(&rpfreelist_lock); 2481 2482 vp = RTOV(rp); 2483 2484 if (rp->r_flags & RHASHED) { 2485 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2486 mutex_enter(&vp->v_lock); 2487 if (vp->v_count > 1) { 2488 vp->v_count--; 2489 mutex_exit(&vp->v_lock); 2490 rw_exit(&rp->r_hashq->r_lock); 2491 rw_enter(&rhtp->r_lock, RW_READER); 2492 goto start; 2493 } 2494 mutex_exit(&vp->v_lock); 2495 rp_rmhash_locked(rp); 2496 rw_exit(&rp->r_hashq->r_lock); 2497 } 2498 2499 rinactive(rp, cr); 2500 2501 mutex_enter(&vp->v_lock); 2502 if (vp->v_count > 1) { 2503 vp->v_count--; 2504 mutex_exit(&vp->v_lock); 2505 rw_enter(&rhtp->r_lock, RW_READER); 2506 goto start; 2507 } 2508 mutex_exit(&vp->v_lock); 2509 vn_invalid(vp); 2510 /* 2511 * destroy old locks before bzero'ing and 2512 * recreating the locks below. 2513 */ 2514 nfs_rw_destroy(&rp->r_rwlock); 2515 nfs_rw_destroy(&rp->r_lkserlock); 2516 mutex_destroy(&rp->r_statelock); 2517 cv_destroy(&rp->r_cv); 2518 cv_destroy(&rp->r_commit.c_cv); 2519 nfs_free_r_path(rp); 2520 avl_destroy(&rp->r_dir); 2521 /* 2522 * Make sure that if rnode is recycled then 2523 * VFS count is decremented properly before 2524 * reuse. 
2525 */ 2526 VFS_RELE(vp->v_vfsp); 2527 vn_reinit(vp); 2528 } else { 2529 vnode_t *new_vp; 2530 2531 mutex_exit(&rpfreelist_lock); 2532 2533 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2534 new_vp = vn_alloc(KM_SLEEP); 2535 2536 atomic_add_long((ulong_t *)&rnew, 1); 2537 #ifdef DEBUG 2538 clstat_debug.nrnode.value.ui64++; 2539 #endif 2540 vp = new_vp; 2541 } 2542 2543 bzero(rp, sizeof (*rp)); 2544 rp->r_vnode = vp; 2545 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2546 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2547 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2548 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2549 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2550 rp->r_fh.fh_len = fh->fh_len; 2551 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2552 rp->r_server = mi->mi_curr_serv; 2553 if (FAILOVER_MOUNT(mi)) { 2554 /* 2555 * If replicated servers, stash pathnames 2556 */ 2557 if (dnm != NULL && nm != NULL) { 2558 char *s, *p; 2559 uint_t len; 2560 2561 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2562 rp->r_path = kmem_alloc(len, KM_SLEEP); 2563 #ifdef DEBUG 2564 clstat_debug.rpath.value.ui64 += len; 2565 #endif 2566 s = rp->r_path; 2567 for (p = dnm; *p; p++) 2568 *s++ = *p; 2569 *s++ = '/'; 2570 for (p = nm; *p; p++) 2571 *s++ = *p; 2572 *s = '\0'; 2573 } else { 2574 /* special case for root */ 2575 rp->r_path = kmem_alloc(2, KM_SLEEP); 2576 #ifdef DEBUG 2577 clstat_debug.rpath.value.ui64 += 2; 2578 #endif 2579 *rp->r_path = '.'; 2580 *(rp->r_path + 1) = '\0'; 2581 } 2582 } 2583 VFS_HOLD(vfsp); 2584 rp->r_putapage = putapage; 2585 rp->r_hashq = rhtp; 2586 rp->r_flags = RREADDIRPLUS; 2587 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2588 offsetof(rddir_cache, tree)); 2589 vn_setops(vp, vops); 2590 vp->v_data = (caddr_t)rp; 2591 vp->v_vfsp = vfsp; 2592 vp->v_type = VNON; 2593 nfs_set_vroot(vp); 2594 2595 /* 2596 * There is a race condition if someone else 2597 * alloc's the rnode while no locks are held, so we 2598 * check again and recover if found. 2599 */ 2600 rw_enter(&rhtp->r_lock, RW_WRITER); 2601 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2602 vp = RTOV(trp); 2603 nfs_set_vroot(vp); 2604 *newnode = 0; 2605 rw_exit(&rhtp->r_lock); 2606 rp_addfree(rp, cr); 2607 rw_enter(&rhtp->r_lock, RW_READER); 2608 return (vp); 2609 } 2610 rp_addhash(rp); 2611 *newnode = 1; 2612 return (vp); 2613 } 2614 2615 static void 2616 nfs_set_vroot(vnode_t *vp) 2617 { 2618 rnode_t *rp; 2619 nfs_fhandle *rootfh; 2620 2621 rp = VTOR(vp); 2622 rootfh = &rp->r_server->sv_fhandle; 2623 if (rootfh->fh_len == rp->r_fh.fh_len && 2624 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2625 if (!(vp->v_flag & VROOT)) { 2626 mutex_enter(&vp->v_lock); 2627 vp->v_flag |= VROOT; 2628 mutex_exit(&vp->v_lock); 2629 } 2630 } 2631 } 2632 2633 static void 2634 nfs_free_r_path(rnode_t *rp) 2635 { 2636 char *path; 2637 size_t len; 2638 2639 path = rp->r_path; 2640 if (path) { 2641 rp->r_path = NULL; 2642 len = strlen(path) + 1; 2643 kmem_free(path, len); 2644 #ifdef DEBUG 2645 clstat_debug.rpath.value.ui64 -= len; 2646 #endif 2647 } 2648 } 2649 2650 /* 2651 * Put an rnode on the free list. 2652 * 2653 * Rnodes which were allocated above and beyond the normal limit 2654 * are immediately freed. 
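 *
 * ("Above and beyond the normal limit" means rnew > nrnode, i.e. more
 * rnodes are currently allocated than the cap computed in
 * nfs_subrinit(); the same test appears at the top of the function
 * below.)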
 */
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
        vnode_t *vp;
        struct vfs *vfsp;

        vp = RTOV(rp);
        ASSERT(vp->v_count >= 1);
        ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);

        /*
         * If we have too many rnodes allocated and there are no
         * references to this rnode, or if the rnode is no longer
         * accessible because it does not reside in the hash queues,
         * or if an i/o error occurred while writing to the file,
         * then just free it instead of putting it on the rnode
         * freelist.
         */
        vfsp = vp->v_vfsp;
        if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
            (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
                if (rp->r_flags & RHASHED) {
                        rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
                        mutex_enter(&vp->v_lock);
                        if (vp->v_count > 1) {
                                vp->v_count--;
                                mutex_exit(&vp->v_lock);
                                rw_exit(&rp->r_hashq->r_lock);
                                return;
                        }
                        mutex_exit(&vp->v_lock);
                        rp_rmhash_locked(rp);
                        rw_exit(&rp->r_hashq->r_lock);
                }

                rinactive(rp, cr);

                /*
                 * Recheck the vnode reference count.  We need to
                 * make sure that another reference has not been
                 * acquired while we were not holding v_lock.  The
                 * rnode is not in the rnode hash queues, so the
                 * only way for a reference to have been acquired
                 * is for a VOP_PUTPAGE because the rnode was marked
                 * with RDIRTY or for a modified page.  This
                 * reference may have been acquired before our call
                 * to rinactive.  The i/o may have been completed,
                 * thus allowing rinactive to complete, but the
                 * reference to the vnode may not have been released
                 * yet.  In any case, the rnode can not be destroyed
                 * until the other references to this vnode have been
                 * released.  The other references will take care of
                 * either destroying the rnode or placing it on the
                 * rnode freelist.  If there are no other references,
                 * then the rnode may be safely destroyed.
                 */
                mutex_enter(&vp->v_lock);
                if (vp->v_count > 1) {
                        vp->v_count--;
                        mutex_exit(&vp->v_lock);
                        return;
                }
                mutex_exit(&vp->v_lock);

                destroy_rnode(rp);
                return;
        }

        /*
         * Lock the hash queue and then recheck the reference count
         * to ensure that no other threads have acquired a reference,
         * which would indicate that the rnode should not be placed on
         * the freelist.  If another reference has been acquired, then
         * just release this one and let the other thread complete
         * the processing of adding this rnode to the freelist.
         */
        rw_enter(&rp->r_hashq->r_lock, RW_WRITER);

        mutex_enter(&vp->v_lock);
        if (vp->v_count > 1) {
                vp->v_count--;
                mutex_exit(&vp->v_lock);
                rw_exit(&rp->r_hashq->r_lock);
                return;
        }
        mutex_exit(&vp->v_lock);

        /*
         * If there is no cached data or metadata for this file, then
         * put the rnode on the front of the freelist so that it will
         * be reused before other rnodes which may have cached data or
         * metadata associated with them.
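         *
         * (The freelist is a circular, doubly linked list: rpfreelist is
         * the head and rpfreelist->r_freeb is therefore the tail.  "Front
         * of the freelist" is achieved below by linking the new rnode
         * just behind the current head and then pointing rpfreelist at
         * it; rnodes that still carry cached data are left at the back.)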
2748 */ 2749 mutex_enter(&rpfreelist_lock); 2750 if (rpfreelist == NULL) { 2751 rp->r_freef = rp; 2752 rp->r_freeb = rp; 2753 rpfreelist = rp; 2754 } else { 2755 rp->r_freef = rpfreelist; 2756 rp->r_freeb = rpfreelist->r_freeb; 2757 rpfreelist->r_freeb->r_freef = rp; 2758 rpfreelist->r_freeb = rp; 2759 if (!vn_has_cached_data(vp) && 2760 !HAVE_RDDIR_CACHE(rp) && 2761 rp->r_symlink.contents == NULL && 2762 rp->r_secattr == NULL && 2763 rp->r_pathconf == NULL) 2764 rpfreelist = rp; 2765 } 2766 mutex_exit(&rpfreelist_lock); 2767 2768 rw_exit(&rp->r_hashq->r_lock); 2769 } 2770 2771 /* 2772 * Remove an rnode from the free list. 2773 * 2774 * The caller must be holding rpfreelist_lock and the rnode 2775 * must be on the freelist. 2776 */ 2777 static void 2778 rp_rmfree(rnode_t *rp) 2779 { 2780 2781 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2782 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2783 2784 if (rp == rpfreelist) { 2785 rpfreelist = rp->r_freef; 2786 if (rp == rpfreelist) 2787 rpfreelist = NULL; 2788 } 2789 2790 rp->r_freeb->r_freef = rp->r_freef; 2791 rp->r_freef->r_freeb = rp->r_freeb; 2792 2793 rp->r_freef = rp->r_freeb = NULL; 2794 } 2795 2796 /* 2797 * Put a rnode in the hash table. 2798 * 2799 * The caller must be holding the exclusive hash queue lock. 2800 */ 2801 static void 2802 rp_addhash(rnode_t *rp) 2803 { 2804 2805 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2806 ASSERT(!(rp->r_flags & RHASHED)); 2807 2808 rp->r_hashf = rp->r_hashq->r_hashf; 2809 rp->r_hashq->r_hashf = rp; 2810 rp->r_hashb = (rnode_t *)rp->r_hashq; 2811 rp->r_hashf->r_hashb = rp; 2812 2813 mutex_enter(&rp->r_statelock); 2814 rp->r_flags |= RHASHED; 2815 mutex_exit(&rp->r_statelock); 2816 } 2817 2818 /* 2819 * Remove a rnode from the hash table. 2820 * 2821 * The caller must be holding the hash queue lock. 2822 */ 2823 static void 2824 rp_rmhash_locked(rnode_t *rp) 2825 { 2826 2827 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2828 ASSERT(rp->r_flags & RHASHED); 2829 2830 rp->r_hashb->r_hashf = rp->r_hashf; 2831 rp->r_hashf->r_hashb = rp->r_hashb; 2832 2833 mutex_enter(&rp->r_statelock); 2834 rp->r_flags &= ~RHASHED; 2835 mutex_exit(&rp->r_statelock); 2836 } 2837 2838 /* 2839 * Remove a rnode from the hash table. 2840 * 2841 * The caller must not be holding the hash queue lock. 2842 */ 2843 void 2844 rp_rmhash(rnode_t *rp) 2845 { 2846 2847 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2848 rp_rmhash_locked(rp); 2849 rw_exit(&rp->r_hashq->r_lock); 2850 } 2851 2852 /* 2853 * Lookup a rnode by fhandle. 2854 * 2855 * The caller must be holding the hash queue lock, either shared or exclusive. 2856 */ 2857 static rnode_t * 2858 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2859 { 2860 rnode_t *rp; 2861 vnode_t *vp; 2862 2863 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2864 2865 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2866 vp = RTOV(rp); 2867 if (vp->v_vfsp == vfsp && 2868 rp->r_fh.fh_len == fh->fh_len && 2869 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2870 /* 2871 * remove rnode from free list, if necessary. 2872 */ 2873 if (rp->r_freef != NULL) { 2874 mutex_enter(&rpfreelist_lock); 2875 /* 2876 * If the rnode is on the freelist, 2877 * then remove it and use that reference 2878 * as the new reference. Otherwise, 2879 * need to increment the reference count. 
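				 *
				 * In outline (illustrative only):
				 *
				 *	if (rp->r_freef != NULL)  unlocked hint
				 *		take rpfreelist_lock, re-test:
				 *		  still free -> rp_rmfree(rp),
				 *		    reuse the freelist reference
				 *		  already gone -> VN_HOLD(vp)
				 *
				 * Only the test made under rpfreelist_lock
				 * is authoritative; the first test merely
				 * avoids taking the mutex for rnodes that
				 * are clearly not free.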
2880 */ 2881 if (rp->r_freef != NULL) { 2882 rp_rmfree(rp); 2883 mutex_exit(&rpfreelist_lock); 2884 } else { 2885 mutex_exit(&rpfreelist_lock); 2886 VN_HOLD(vp); 2887 } 2888 } else 2889 VN_HOLD(vp); 2890 return (rp); 2891 } 2892 } 2893 return (NULL); 2894 } 2895 2896 /* 2897 * Return 1 if there is a active vnode belonging to this vfs in the 2898 * rtable cache. 2899 * 2900 * Several of these checks are done without holding the usual 2901 * locks. This is safe because destroy_rtable(), rp_addfree(), 2902 * etc. will redo the necessary checks before actually destroying 2903 * any rnodes. 2904 */ 2905 int 2906 check_rtable(struct vfs *vfsp) 2907 { 2908 int index; 2909 rnode_t *rp; 2910 vnode_t *vp; 2911 2912 for (index = 0; index < rtablesize; index++) { 2913 rw_enter(&rtable[index].r_lock, RW_READER); 2914 for (rp = rtable[index].r_hashf; 2915 rp != (rnode_t *)(&rtable[index]); 2916 rp = rp->r_hashf) { 2917 vp = RTOV(rp); 2918 if (vp->v_vfsp == vfsp) { 2919 if (rp->r_freef == NULL || 2920 (vn_has_cached_data(vp) && 2921 (rp->r_flags & RDIRTY)) || 2922 rp->r_count > 0) { 2923 rw_exit(&rtable[index].r_lock); 2924 return (1); 2925 } 2926 } 2927 } 2928 rw_exit(&rtable[index].r_lock); 2929 } 2930 return (0); 2931 } 2932 2933 /* 2934 * Destroy inactive vnodes from the hash queues which belong to this 2935 * vfs. It is essential that we destroy all inactive vnodes during a 2936 * forced unmount as well as during a normal unmount. 2937 */ 2938 void 2939 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2940 { 2941 int index; 2942 rnode_t *rp; 2943 rnode_t *rlist; 2944 rnode_t *r_hashf; 2945 vnode_t *vp; 2946 2947 rlist = NULL; 2948 2949 for (index = 0; index < rtablesize; index++) { 2950 rw_enter(&rtable[index].r_lock, RW_WRITER); 2951 for (rp = rtable[index].r_hashf; 2952 rp != (rnode_t *)(&rtable[index]); 2953 rp = r_hashf) { 2954 /* save the hash pointer before destroying */ 2955 r_hashf = rp->r_hashf; 2956 vp = RTOV(rp); 2957 if (vp->v_vfsp == vfsp) { 2958 mutex_enter(&rpfreelist_lock); 2959 if (rp->r_freef != NULL) { 2960 rp_rmfree(rp); 2961 mutex_exit(&rpfreelist_lock); 2962 rp_rmhash_locked(rp); 2963 rp->r_hashf = rlist; 2964 rlist = rp; 2965 } else 2966 mutex_exit(&rpfreelist_lock); 2967 } 2968 } 2969 rw_exit(&rtable[index].r_lock); 2970 } 2971 2972 for (rp = rlist; rp != NULL; rp = rlist) { 2973 rlist = rp->r_hashf; 2974 /* 2975 * This call to rp_addfree will end up destroying the 2976 * rnode, but in a safe way with the appropriate set 2977 * of checks done. 2978 */ 2979 rp_addfree(rp, cr); 2980 } 2981 2982 } 2983 2984 /* 2985 * This routine destroys all the resources associated with the rnode 2986 * and then the rnode itself. 
2987 */ 2988 static void 2989 destroy_rnode(rnode_t *rp) 2990 { 2991 vnode_t *vp; 2992 vfs_t *vfsp; 2993 2994 vp = RTOV(rp); 2995 vfsp = vp->v_vfsp; 2996 2997 ASSERT(vp->v_count == 1); 2998 ASSERT(rp->r_count == 0); 2999 ASSERT(rp->r_lmpl == NULL); 3000 ASSERT(rp->r_mapcnt == 0); 3001 ASSERT(!(rp->r_flags & RHASHED)); 3002 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 3003 atomic_add_long((ulong_t *)&rnew, -1); 3004 #ifdef DEBUG 3005 clstat_debug.nrnode.value.ui64--; 3006 #endif 3007 nfs_rw_destroy(&rp->r_rwlock); 3008 nfs_rw_destroy(&rp->r_lkserlock); 3009 mutex_destroy(&rp->r_statelock); 3010 cv_destroy(&rp->r_cv); 3011 cv_destroy(&rp->r_commit.c_cv); 3012 if (rp->r_flags & RDELMAPLIST) 3013 list_destroy(&rp->r_indelmap); 3014 nfs_free_r_path(rp); 3015 avl_destroy(&rp->r_dir); 3016 vn_invalid(vp); 3017 vn_free(vp); 3018 kmem_cache_free(rnode_cache, rp); 3019 VFS_RELE(vfsp); 3020 } 3021 3022 /* 3023 * Flush all vnodes in this (or every) vfs. 3024 * Used by nfs_sync and by nfs_unmount. 3025 */ 3026 void 3027 rflush(struct vfs *vfsp, cred_t *cr) 3028 { 3029 int index; 3030 rnode_t *rp; 3031 vnode_t *vp, **vplist; 3032 long num, cnt; 3033 3034 /* 3035 * Check to see whether there is anything to do. 3036 */ 3037 num = rnew; 3038 if (num == 0) 3039 return; 3040 3041 /* 3042 * Allocate a slot for all currently active rnodes on the 3043 * supposition that they all may need flushing. 3044 */ 3045 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3046 cnt = 0; 3047 3048 /* 3049 * Walk the hash queues looking for rnodes with page 3050 * lists associated with them. Make a list of these 3051 * files. 3052 */ 3053 for (index = 0; index < rtablesize; index++) { 3054 rw_enter(&rtable[index].r_lock, RW_READER); 3055 for (rp = rtable[index].r_hashf; 3056 rp != (rnode_t *)(&rtable[index]); 3057 rp = rp->r_hashf) { 3058 vp = RTOV(rp); 3059 /* 3060 * Don't bother sync'ing a vp if it 3061 * is part of virtual swap device or 3062 * if VFS is read-only 3063 */ 3064 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3065 continue; 3066 /* 3067 * If flushing all mounted file systems or 3068 * the vnode belongs to this vfs, has pages 3069 * and is marked as either dirty or mmap'd, 3070 * hold and add this vnode to the list of 3071 * vnodes to flush. 3072 */ 3073 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3074 vn_has_cached_data(vp) && 3075 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3076 VN_HOLD(vp); 3077 vplist[cnt++] = vp; 3078 if (cnt == num) { 3079 rw_exit(&rtable[index].r_lock); 3080 goto toomany; 3081 } 3082 } 3083 } 3084 rw_exit(&rtable[index].r_lock); 3085 } 3086 toomany: 3087 3088 /* 3089 * Flush and release all of the files on the list. 3090 */ 3091 while (cnt-- > 0) { 3092 vp = vplist[cnt]; 3093 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3094 VN_RELE(vp); 3095 } 3096 3097 /* 3098 * Free the space allocated to hold the list. 3099 */ 3100 kmem_free(vplist, num * sizeof (*vplist)); 3101 } 3102 3103 /* 3104 * This probably needs to be larger than or equal to 3105 * log2(sizeof (struct rnode)) due to the way that rnodes are 3106 * allocated. 
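 *
 * The reasoning, roughly (object sizes here are illustrative only):
 * rnodes come from a kmem cache, so consecutive rnode addresses differ
 * by at least the rounded-up object size.  If sizeof (struct rnode) is
 * a few hundred bytes, the low ~9 bits of any rnode pointer are nearly
 * constant and contribute nothing to the hash, which is why
 * acachehash() below computes, in effect:
 *
 *	(((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & acachemask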
3107 */ 3108 #define ACACHE_SHIFT_BITS 9 3109 3110 static int 3111 acachehash(rnode_t *rp, cred_t *cr) 3112 { 3113 3114 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3115 acachemask); 3116 } 3117 3118 #ifdef DEBUG 3119 static long nfs_access_cache_hits = 0; 3120 static long nfs_access_cache_misses = 0; 3121 #endif 3122 3123 nfs_access_type_t 3124 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3125 { 3126 vnode_t *vp; 3127 acache_t *ap; 3128 acache_hash_t *hp; 3129 nfs_access_type_t all; 3130 3131 vp = RTOV(rp); 3132 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3133 return (NFS_ACCESS_UNKNOWN); 3134 3135 if (rp->r_acache != NULL) { 3136 hp = &acache[acachehash(rp, cr)]; 3137 rw_enter(&hp->lock, RW_READER); 3138 ap = hp->next; 3139 while (ap != (acache_t *)hp) { 3140 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3141 if ((ap->known & acc) == acc) { 3142 #ifdef DEBUG 3143 nfs_access_cache_hits++; 3144 #endif 3145 if ((ap->allowed & acc) == acc) 3146 all = NFS_ACCESS_ALLOWED; 3147 else 3148 all = NFS_ACCESS_DENIED; 3149 } else { 3150 #ifdef DEBUG 3151 nfs_access_cache_misses++; 3152 #endif 3153 all = NFS_ACCESS_UNKNOWN; 3154 } 3155 rw_exit(&hp->lock); 3156 return (all); 3157 } 3158 ap = ap->next; 3159 } 3160 rw_exit(&hp->lock); 3161 } 3162 3163 #ifdef DEBUG 3164 nfs_access_cache_misses++; 3165 #endif 3166 return (NFS_ACCESS_UNKNOWN); 3167 } 3168 3169 void 3170 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3171 { 3172 acache_t *ap; 3173 acache_t *nap; 3174 acache_hash_t *hp; 3175 3176 hp = &acache[acachehash(rp, cr)]; 3177 3178 /* 3179 * Allocate now assuming that mostly an allocation will be 3180 * required. This allows the allocation to happen without 3181 * holding the hash bucket locked. 3182 */ 3183 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3184 if (nap != NULL) { 3185 nap->known = acc; 3186 nap->allowed = resacc; 3187 nap->rnode = rp; 3188 crhold(cr); 3189 nap->cred = cr; 3190 nap->hashq = hp; 3191 } 3192 3193 rw_enter(&hp->lock, RW_WRITER); 3194 3195 if (rp->r_acache != NULL) { 3196 ap = hp->next; 3197 while (ap != (acache_t *)hp) { 3198 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3199 ap->known |= acc; 3200 ap->allowed &= ~acc; 3201 ap->allowed |= resacc; 3202 rw_exit(&hp->lock); 3203 if (nap != NULL) { 3204 crfree(nap->cred); 3205 kmem_cache_free(acache_cache, nap); 3206 } 3207 return; 3208 } 3209 ap = ap->next; 3210 } 3211 } 3212 3213 if (nap != NULL) { 3214 #ifdef DEBUG 3215 clstat_debug.access.value.ui64++; 3216 #endif 3217 nap->next = hp->next; 3218 hp->next = nap; 3219 nap->next->prev = nap; 3220 nap->prev = (acache_t *)hp; 3221 3222 mutex_enter(&rp->r_statelock); 3223 nap->list = rp->r_acache; 3224 rp->r_acache = nap; 3225 mutex_exit(&rp->r_statelock); 3226 } 3227 3228 rw_exit(&hp->lock); 3229 } 3230 3231 int 3232 nfs_access_purge_rp(rnode_t *rp) 3233 { 3234 acache_t *ap; 3235 acache_t *tmpap; 3236 acache_t *rplist; 3237 3238 /* 3239 * If there aren't any cached entries, then there is nothing 3240 * to free. 3241 */ 3242 if (rp->r_acache == NULL) 3243 return (0); 3244 3245 mutex_enter(&rp->r_statelock); 3246 rplist = rp->r_acache; 3247 rp->r_acache = NULL; 3248 mutex_exit(&rp->r_statelock); 3249 3250 /* 3251 * Loop through each entry in the list pointed to in the 3252 * rnode. Remove each of these entries from the hash 3253 * queue that it is on and remove it from the list in 3254 * the rnode. 
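	 *
	 * Each entry is linked twice: into its hash bucket through
	 * ap->next/ap->prev (protected by ap->hashq->lock) and into the
	 * per-rnode chain through ap->list (protected by r_statelock).
	 * The per-rnode chain was already detached in one step above by
	 * setting rp->r_acache to NULL, so the loop below only needs the
	 * bucket locks.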
3255 */ 3256 for (ap = rplist; ap != NULL; ap = tmpap) { 3257 rw_enter(&ap->hashq->lock, RW_WRITER); 3258 ap->prev->next = ap->next; 3259 ap->next->prev = ap->prev; 3260 rw_exit(&ap->hashq->lock); 3261 3262 tmpap = ap->list; 3263 crfree(ap->cred); 3264 kmem_cache_free(acache_cache, ap); 3265 #ifdef DEBUG 3266 clstat_debug.access.value.ui64--; 3267 #endif 3268 } 3269 3270 return (1); 3271 } 3272 3273 static const char prefix[] = ".nfs"; 3274 3275 static kmutex_t newnum_lock; 3276 3277 int 3278 newnum(void) 3279 { 3280 static uint_t newnum = 0; 3281 uint_t id; 3282 3283 mutex_enter(&newnum_lock); 3284 if (newnum == 0) 3285 newnum = gethrestime_sec() & 0xffff; 3286 id = newnum++; 3287 mutex_exit(&newnum_lock); 3288 return (id); 3289 } 3290 3291 char * 3292 newname(void) 3293 { 3294 char *news; 3295 char *s; 3296 const char *p; 3297 uint_t id; 3298 3299 id = newnum(); 3300 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3301 s = news; 3302 p = prefix; 3303 while (*p != '\0') 3304 *s++ = *p++; 3305 while (id != 0) { 3306 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3307 id >>= 4; 3308 } 3309 *s = '\0'; 3310 return (news); 3311 } 3312 3313 /* 3314 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3315 * framework. 3316 */ 3317 static int 3318 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3319 { 3320 ksp->ks_snaptime = gethrtime(); 3321 if (rw == KSTAT_WRITE) { 3322 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3323 #ifdef DEBUG 3324 /* 3325 * Currently only the global zone can write to kstats, but we 3326 * add the check just for paranoia. 3327 */ 3328 if (INGLOBALZONE(curproc)) 3329 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3330 sizeof (clstat_debug)); 3331 #endif 3332 } else { 3333 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3334 #ifdef DEBUG 3335 /* 3336 * If we're displaying the "global" debug kstat values, we 3337 * display them as-is to all zones since in fact they apply to 3338 * the system as a whole. 
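		 *
		 * The snapshot buffer is laid out as two consecutive
		 * regions (sketch):
		 *
		 *	buf + 0:                    per-zone counters
		 *	                            (clstat_tmpl layout)
		 *	buf + sizeof (clstat_tmpl): global DEBUG counters
		 *	                            (clstat_debug)
		 *
		 * which is why the bcopy() below writes the DEBUG counters
		 * at offset sizeof (clstat_tmpl).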
3339 */ 3340 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3341 sizeof (clstat_debug)); 3342 #endif 3343 } 3344 return (0); 3345 } 3346 3347 static void * 3348 clinit_zone(zoneid_t zoneid) 3349 { 3350 kstat_t *nfs_client_kstat; 3351 struct nfs_clnt *nfscl; 3352 uint_t ndata; 3353 3354 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3355 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3356 nfscl->nfscl_chtable = NULL; 3357 nfscl->nfscl_zoneid = zoneid; 3358 3359 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3360 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3361 #ifdef DEBUG 3362 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3363 #endif 3364 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3365 "misc", KSTAT_TYPE_NAMED, ndata, 3366 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3367 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3368 nfs_client_kstat->ks_snapshot = cl_snapshot; 3369 kstat_install(nfs_client_kstat); 3370 } 3371 mutex_enter(&nfs_clnt_list_lock); 3372 list_insert_head(&nfs_clnt_list, nfscl); 3373 mutex_exit(&nfs_clnt_list_lock); 3374 return (nfscl); 3375 } 3376 3377 /*ARGSUSED*/ 3378 static void 3379 clfini_zone(zoneid_t zoneid, void *arg) 3380 { 3381 struct nfs_clnt *nfscl = arg; 3382 chhead_t *chp, *next; 3383 3384 if (nfscl == NULL) 3385 return; 3386 mutex_enter(&nfs_clnt_list_lock); 3387 list_remove(&nfs_clnt_list, nfscl); 3388 mutex_exit(&nfs_clnt_list_lock); 3389 clreclaim_zone(nfscl, 0); 3390 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3391 ASSERT(chp->ch_list == NULL); 3392 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3393 next = chp->ch_next; 3394 kmem_free(chp, sizeof (*chp)); 3395 } 3396 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3397 mutex_destroy(&nfscl->nfscl_chtable_lock); 3398 kmem_free(nfscl, sizeof (*nfscl)); 3399 } 3400 3401 /* 3402 * Called by endpnt_destructor to make sure the client handles are 3403 * cleaned up before the RPC endpoints. This becomes a no-op if 3404 * clfini_zone (above) is called first. This function is needed 3405 * (rather than relying on clfini_zone to clean up) because the ZSD 3406 * callbacks have no ordering mechanism, so we have no way to ensure 3407 * that clfini_zone is called before endpnt_destructor. 
3408 */ 3409 void 3410 clcleanup_zone(zoneid_t zoneid) 3411 { 3412 struct nfs_clnt *nfscl; 3413 3414 mutex_enter(&nfs_clnt_list_lock); 3415 nfscl = list_head(&nfs_clnt_list); 3416 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3417 if (nfscl->nfscl_zoneid == zoneid) { 3418 clreclaim_zone(nfscl, 0); 3419 break; 3420 } 3421 } 3422 mutex_exit(&nfs_clnt_list_lock); 3423 } 3424 3425 int 3426 nfs_subrinit(void) 3427 { 3428 int i; 3429 ulong_t nrnode_max; 3430 3431 /* 3432 * Allocate and initialize the rnode hash queues 3433 */ 3434 if (nrnode <= 0) 3435 nrnode = ncsize; 3436 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3437 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3438 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3439 "setting nrnode to max value of %ld", nrnode_max); 3440 nrnode = nrnode_max; 3441 } 3442 3443 rtablesize = 1 << highbit(nrnode / hashlen); 3444 rtablemask = rtablesize - 1; 3445 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3446 for (i = 0; i < rtablesize; i++) { 3447 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3448 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3449 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3450 } 3451 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3452 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3453 3454 /* 3455 * Allocate and initialize the access cache 3456 */ 3457 3458 /* 3459 * Initial guess is one access cache entry per rnode unless 3460 * nacache is set to a non-zero value and then it is used to 3461 * indicate a guess at the number of access cache entries. 3462 */ 3463 if (nacache > 0) 3464 acachesize = 1 << highbit(nacache / hashlen); 3465 else 3466 acachesize = rtablesize; 3467 acachemask = acachesize - 1; 3468 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3469 for (i = 0; i < acachesize; i++) { 3470 acache[i].next = (acache_t *)&acache[i]; 3471 acache[i].prev = (acache_t *)&acache[i]; 3472 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3473 } 3474 acache_cache = kmem_cache_create("nfs_access_cache", 3475 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3476 /* 3477 * Allocate and initialize the client handle cache 3478 */ 3479 chtab_cache = kmem_cache_create("client_handle_cache", 3480 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3481 /* 3482 * Initialize the list of per-zone client handles (and associated data). 3483 * This needs to be done before we call zone_key_create(). 3484 */ 3485 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3486 offsetof(struct nfs_clnt, nfscl_node)); 3487 /* 3488 * Initialize the zone_key for per-zone client handle lists. 
3489 */ 3490 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3491 /* 3492 * Initialize the various mutexes and reader/writer locks 3493 */ 3494 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3495 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3496 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3497 3498 /* 3499 * Assign unique major number for all nfs mounts 3500 */ 3501 if ((nfs_major = getudev()) == -1) { 3502 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3503 "nfs: init: can't get unique device number"); 3504 nfs_major = 0; 3505 } 3506 nfs_minor = 0; 3507 3508 if (nfs3_jukebox_delay == 0) 3509 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3510 3511 return (0); 3512 } 3513 3514 void 3515 nfs_subrfini(void) 3516 { 3517 int i; 3518 3519 /* 3520 * Deallocate the rnode hash queues 3521 */ 3522 kmem_cache_destroy(rnode_cache); 3523 3524 for (i = 0; i < rtablesize; i++) 3525 rw_destroy(&rtable[i].r_lock); 3526 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3527 3528 /* 3529 * Deallocated the access cache 3530 */ 3531 kmem_cache_destroy(acache_cache); 3532 3533 for (i = 0; i < acachesize; i++) 3534 rw_destroy(&acache[i].lock); 3535 kmem_free(acache, acachesize * sizeof (*acache)); 3536 3537 /* 3538 * Deallocate the client handle cache 3539 */ 3540 kmem_cache_destroy(chtab_cache); 3541 3542 /* 3543 * Destroy the various mutexes and reader/writer locks 3544 */ 3545 mutex_destroy(&rpfreelist_lock); 3546 mutex_destroy(&newnum_lock); 3547 mutex_destroy(&nfs_minor_lock); 3548 (void) zone_key_delete(nfsclnt_zone_key); 3549 } 3550 3551 enum nfsstat 3552 puterrno(int error) 3553 { 3554 3555 switch (error) { 3556 case EOPNOTSUPP: 3557 return (NFSERR_OPNOTSUPP); 3558 case ENAMETOOLONG: 3559 return (NFSERR_NAMETOOLONG); 3560 case ENOTEMPTY: 3561 return (NFSERR_NOTEMPTY); 3562 case EDQUOT: 3563 return (NFSERR_DQUOT); 3564 case ESTALE: 3565 return (NFSERR_STALE); 3566 case EREMOTE: 3567 return (NFSERR_REMOTE); 3568 case ENOSYS: 3569 return (NFSERR_OPNOTSUPP); 3570 case EOVERFLOW: 3571 return (NFSERR_INVAL); 3572 default: 3573 return ((enum nfsstat)error); 3574 } 3575 /* NOTREACHED */ 3576 } 3577 3578 int 3579 geterrno(enum nfsstat status) 3580 { 3581 3582 switch (status) { 3583 case NFSERR_OPNOTSUPP: 3584 return (EOPNOTSUPP); 3585 case NFSERR_NAMETOOLONG: 3586 return (ENAMETOOLONG); 3587 case NFSERR_NOTEMPTY: 3588 return (ENOTEMPTY); 3589 case NFSERR_DQUOT: 3590 return (EDQUOT); 3591 case NFSERR_STALE: 3592 return (ESTALE); 3593 case NFSERR_REMOTE: 3594 return (EREMOTE); 3595 case NFSERR_WFLUSH: 3596 return (EIO); 3597 default: 3598 return ((int)status); 3599 } 3600 /* NOTREACHED */ 3601 } 3602 3603 enum nfsstat3 3604 puterrno3(int error) 3605 { 3606 3607 #ifdef DEBUG 3608 switch (error) { 3609 case 0: 3610 return (NFS3_OK); 3611 case EPERM: 3612 return (NFS3ERR_PERM); 3613 case ENOENT: 3614 return (NFS3ERR_NOENT); 3615 case EIO: 3616 return (NFS3ERR_IO); 3617 case ENXIO: 3618 return (NFS3ERR_NXIO); 3619 case EACCES: 3620 return (NFS3ERR_ACCES); 3621 case EEXIST: 3622 return (NFS3ERR_EXIST); 3623 case EXDEV: 3624 return (NFS3ERR_XDEV); 3625 case ENODEV: 3626 return (NFS3ERR_NODEV); 3627 case ENOTDIR: 3628 return (NFS3ERR_NOTDIR); 3629 case EISDIR: 3630 return (NFS3ERR_ISDIR); 3631 case EINVAL: 3632 return (NFS3ERR_INVAL); 3633 case EFBIG: 3634 return (NFS3ERR_FBIG); 3635 case ENOSPC: 3636 return (NFS3ERR_NOSPC); 3637 case EROFS: 3638 return (NFS3ERR_ROFS); 3639 case EMLINK: 3640 return (NFS3ERR_MLINK); 3641 case ENAMETOOLONG: 3642 return (NFS3ERR_NAMETOOLONG); 3643 case 
ENOTEMPTY: 3644 return (NFS3ERR_NOTEMPTY); 3645 case EDQUOT: 3646 return (NFS3ERR_DQUOT); 3647 case ESTALE: 3648 return (NFS3ERR_STALE); 3649 case EREMOTE: 3650 return (NFS3ERR_REMOTE); 3651 case ENOSYS: 3652 case EOPNOTSUPP: 3653 return (NFS3ERR_NOTSUPP); 3654 case EOVERFLOW: 3655 return (NFS3ERR_INVAL); 3656 default: 3657 zcmn_err(getzoneid(), CE_WARN, 3658 "puterrno3: got error %d", error); 3659 return ((enum nfsstat3)error); 3660 } 3661 #else 3662 switch (error) { 3663 case ENAMETOOLONG: 3664 return (NFS3ERR_NAMETOOLONG); 3665 case ENOTEMPTY: 3666 return (NFS3ERR_NOTEMPTY); 3667 case EDQUOT: 3668 return (NFS3ERR_DQUOT); 3669 case ESTALE: 3670 return (NFS3ERR_STALE); 3671 case ENOSYS: 3672 case EOPNOTSUPP: 3673 return (NFS3ERR_NOTSUPP); 3674 case EREMOTE: 3675 return (NFS3ERR_REMOTE); 3676 case EOVERFLOW: 3677 return (NFS3ERR_INVAL); 3678 default: 3679 return ((enum nfsstat3)error); 3680 } 3681 #endif 3682 } 3683 3684 int 3685 geterrno3(enum nfsstat3 status) 3686 { 3687 3688 #ifdef DEBUG 3689 switch (status) { 3690 case NFS3_OK: 3691 return (0); 3692 case NFS3ERR_PERM: 3693 return (EPERM); 3694 case NFS3ERR_NOENT: 3695 return (ENOENT); 3696 case NFS3ERR_IO: 3697 return (EIO); 3698 case NFS3ERR_NXIO: 3699 return (ENXIO); 3700 case NFS3ERR_ACCES: 3701 return (EACCES); 3702 case NFS3ERR_EXIST: 3703 return (EEXIST); 3704 case NFS3ERR_XDEV: 3705 return (EXDEV); 3706 case NFS3ERR_NODEV: 3707 return (ENODEV); 3708 case NFS3ERR_NOTDIR: 3709 return (ENOTDIR); 3710 case NFS3ERR_ISDIR: 3711 return (EISDIR); 3712 case NFS3ERR_INVAL: 3713 return (EINVAL); 3714 case NFS3ERR_FBIG: 3715 return (EFBIG); 3716 case NFS3ERR_NOSPC: 3717 return (ENOSPC); 3718 case NFS3ERR_ROFS: 3719 return (EROFS); 3720 case NFS3ERR_MLINK: 3721 return (EMLINK); 3722 case NFS3ERR_NAMETOOLONG: 3723 return (ENAMETOOLONG); 3724 case NFS3ERR_NOTEMPTY: 3725 return (ENOTEMPTY); 3726 case NFS3ERR_DQUOT: 3727 return (EDQUOT); 3728 case NFS3ERR_STALE: 3729 return (ESTALE); 3730 case NFS3ERR_REMOTE: 3731 return (EREMOTE); 3732 case NFS3ERR_BADHANDLE: 3733 return (ESTALE); 3734 case NFS3ERR_NOT_SYNC: 3735 return (EINVAL); 3736 case NFS3ERR_BAD_COOKIE: 3737 return (ENOENT); 3738 case NFS3ERR_NOTSUPP: 3739 return (EOPNOTSUPP); 3740 case NFS3ERR_TOOSMALL: 3741 return (EINVAL); 3742 case NFS3ERR_SERVERFAULT: 3743 return (EIO); 3744 case NFS3ERR_BADTYPE: 3745 return (EINVAL); 3746 case NFS3ERR_JUKEBOX: 3747 return (ENXIO); 3748 default: 3749 zcmn_err(getzoneid(), CE_WARN, 3750 "geterrno3: got status %d", status); 3751 return ((int)status); 3752 } 3753 #else 3754 switch (status) { 3755 case NFS3ERR_NAMETOOLONG: 3756 return (ENAMETOOLONG); 3757 case NFS3ERR_NOTEMPTY: 3758 return (ENOTEMPTY); 3759 case NFS3ERR_DQUOT: 3760 return (EDQUOT); 3761 case NFS3ERR_STALE: 3762 case NFS3ERR_BADHANDLE: 3763 return (ESTALE); 3764 case NFS3ERR_NOTSUPP: 3765 return (EOPNOTSUPP); 3766 case NFS3ERR_REMOTE: 3767 return (EREMOTE); 3768 case NFS3ERR_NOT_SYNC: 3769 case NFS3ERR_TOOSMALL: 3770 case NFS3ERR_BADTYPE: 3771 return (EINVAL); 3772 case NFS3ERR_BAD_COOKIE: 3773 return (ENOENT); 3774 case NFS3ERR_SERVERFAULT: 3775 return (EIO); 3776 case NFS3ERR_JUKEBOX: 3777 return (ENXIO); 3778 default: 3779 return ((int)status); 3780 } 3781 #endif 3782 } 3783 3784 rddir_cache * 3785 rddir_cache_alloc(int flags) 3786 { 3787 rddir_cache *rc; 3788 3789 rc = kmem_alloc(sizeof (*rc), flags); 3790 if (rc != NULL) { 3791 rc->entries = NULL; 3792 rc->flags = RDDIR; 3793 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL); 3794 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL); 3795 
rc->count = 1; 3796 #ifdef DEBUG 3797 atomic_add_64(&clstat_debug.dirent.value.ui64, 1); 3798 #endif 3799 } 3800 return (rc); 3801 } 3802 3803 static void 3804 rddir_cache_free(rddir_cache *rc) 3805 { 3806 3807 #ifdef DEBUG 3808 atomic_add_64(&clstat_debug.dirent.value.ui64, -1); 3809 #endif 3810 if (rc->entries != NULL) { 3811 #ifdef DEBUG 3812 rddir_cache_buf_free(rc->entries, rc->buflen); 3813 #else 3814 kmem_free(rc->entries, rc->buflen); 3815 #endif 3816 } 3817 cv_destroy(&rc->cv); 3818 mutex_destroy(&rc->lock); 3819 kmem_free(rc, sizeof (*rc)); 3820 } 3821 3822 void 3823 rddir_cache_hold(rddir_cache *rc) 3824 { 3825 3826 mutex_enter(&rc->lock); 3827 rc->count++; 3828 mutex_exit(&rc->lock); 3829 } 3830 3831 void 3832 rddir_cache_rele(rddir_cache *rc) 3833 { 3834 3835 mutex_enter(&rc->lock); 3836 ASSERT(rc->count > 0); 3837 if (--rc->count == 0) { 3838 mutex_exit(&rc->lock); 3839 rddir_cache_free(rc); 3840 } else 3841 mutex_exit(&rc->lock); 3842 } 3843 3844 #ifdef DEBUG 3845 char * 3846 rddir_cache_buf_alloc(size_t size, int flags) 3847 { 3848 char *rc; 3849 3850 rc = kmem_alloc(size, flags); 3851 if (rc != NULL) 3852 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3853 return (rc); 3854 } 3855 3856 void 3857 rddir_cache_buf_free(void *addr, size_t size) 3858 { 3859 3860 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3861 kmem_free(addr, size); 3862 } 3863 #endif 3864 3865 static int 3866 nfs_free_data_reclaim(rnode_t *rp) 3867 { 3868 char *contents; 3869 int size; 3870 vsecattr_t *vsp; 3871 nfs3_pathconf_info *info; 3872 int freed; 3873 cred_t *cred; 3874 3875 /* 3876 * Free any held credentials and caches which 3877 * may be associated with this rnode. 3878 */ 3879 mutex_enter(&rp->r_statelock); 3880 cred = rp->r_cred; 3881 rp->r_cred = NULL; 3882 contents = rp->r_symlink.contents; 3883 size = rp->r_symlink.size; 3884 rp->r_symlink.contents = NULL; 3885 vsp = rp->r_secattr; 3886 rp->r_secattr = NULL; 3887 info = rp->r_pathconf; 3888 rp->r_pathconf = NULL; 3889 mutex_exit(&rp->r_statelock); 3890 3891 if (cred != NULL) 3892 crfree(cred); 3893 3894 /* 3895 * Free the access cache entries. 3896 */ 3897 freed = nfs_access_purge_rp(rp); 3898 3899 if (!HAVE_RDDIR_CACHE(rp) && 3900 contents == NULL && 3901 vsp == NULL && 3902 info == NULL) 3903 return (freed); 3904 3905 /* 3906 * Free the readdir cache entries 3907 */ 3908 if (HAVE_RDDIR_CACHE(rp)) 3909 nfs_purge_rddir_cache(RTOV(rp)); 3910 3911 /* 3912 * Free the symbolic link cache. 3913 */ 3914 if (contents != NULL) { 3915 3916 kmem_free((void *)contents, size); 3917 } 3918 3919 /* 3920 * Free any cached ACL. 3921 */ 3922 if (vsp != NULL) 3923 nfs_acl_free(vsp); 3924 3925 /* 3926 * Free any cached pathconf information. 3927 */ 3928 if (info != NULL) 3929 kmem_free(info, sizeof (*info)); 3930 3931 return (1); 3932 } 3933 3934 static int 3935 nfs_active_data_reclaim(rnode_t *rp) 3936 { 3937 char *contents; 3938 int size; 3939 vsecattr_t *vsp; 3940 nfs3_pathconf_info *info; 3941 int freed; 3942 3943 /* 3944 * Free any held credentials and caches which 3945 * may be associated with this rnode. 3946 */ 3947 if (!mutex_tryenter(&rp->r_statelock)) 3948 return (0); 3949 contents = rp->r_symlink.contents; 3950 size = rp->r_symlink.size; 3951 rp->r_symlink.contents = NULL; 3952 vsp = rp->r_secattr; 3953 rp->r_secattr = NULL; 3954 info = rp->r_pathconf; 3955 rp->r_pathconf = NULL; 3956 mutex_exit(&rp->r_statelock); 3957 3958 /* 3959 * Free the access cache entries. 
3960 */ 3961 freed = nfs_access_purge_rp(rp); 3962 3963 if (!HAVE_RDDIR_CACHE(rp) && 3964 contents == NULL && 3965 vsp == NULL && 3966 info == NULL) 3967 return (freed); 3968 3969 /* 3970 * Free the readdir cache entries 3971 */ 3972 if (HAVE_RDDIR_CACHE(rp)) 3973 nfs_purge_rddir_cache(RTOV(rp)); 3974 3975 /* 3976 * Free the symbolic link cache. 3977 */ 3978 if (contents != NULL) { 3979 3980 kmem_free((void *)contents, size); 3981 } 3982 3983 /* 3984 * Free any cached ACL. 3985 */ 3986 if (vsp != NULL) 3987 nfs_acl_free(vsp); 3988 3989 /* 3990 * Free any cached pathconf information. 3991 */ 3992 if (info != NULL) 3993 kmem_free(info, sizeof (*info)); 3994 3995 return (1); 3996 } 3997 3998 static int 3999 nfs_free_reclaim(void) 4000 { 4001 int freed; 4002 rnode_t *rp; 4003 4004 #ifdef DEBUG 4005 clstat_debug.f_reclaim.value.ui64++; 4006 #endif 4007 freed = 0; 4008 mutex_enter(&rpfreelist_lock); 4009 rp = rpfreelist; 4010 if (rp != NULL) { 4011 do { 4012 if (nfs_free_data_reclaim(rp)) 4013 freed = 1; 4014 } while ((rp = rp->r_freef) != rpfreelist); 4015 } 4016 mutex_exit(&rpfreelist_lock); 4017 return (freed); 4018 } 4019 4020 static int 4021 nfs_active_reclaim(void) 4022 { 4023 int freed; 4024 int index; 4025 rnode_t *rp; 4026 4027 #ifdef DEBUG 4028 clstat_debug.a_reclaim.value.ui64++; 4029 #endif 4030 freed = 0; 4031 for (index = 0; index < rtablesize; index++) { 4032 rw_enter(&rtable[index].r_lock, RW_READER); 4033 for (rp = rtable[index].r_hashf; 4034 rp != (rnode_t *)(&rtable[index]); 4035 rp = rp->r_hashf) { 4036 if (nfs_active_data_reclaim(rp)) 4037 freed = 1; 4038 } 4039 rw_exit(&rtable[index].r_lock); 4040 } 4041 return (freed); 4042 } 4043 4044 static int 4045 nfs_rnode_reclaim(void) 4046 { 4047 int freed; 4048 rnode_t *rp; 4049 vnode_t *vp; 4050 4051 #ifdef DEBUG 4052 clstat_debug.r_reclaim.value.ui64++; 4053 #endif 4054 freed = 0; 4055 mutex_enter(&rpfreelist_lock); 4056 while ((rp = rpfreelist) != NULL) { 4057 rp_rmfree(rp); 4058 mutex_exit(&rpfreelist_lock); 4059 if (rp->r_flags & RHASHED) { 4060 vp = RTOV(rp); 4061 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4062 mutex_enter(&vp->v_lock); 4063 if (vp->v_count > 1) { 4064 vp->v_count--; 4065 mutex_exit(&vp->v_lock); 4066 rw_exit(&rp->r_hashq->r_lock); 4067 mutex_enter(&rpfreelist_lock); 4068 continue; 4069 } 4070 mutex_exit(&vp->v_lock); 4071 rp_rmhash_locked(rp); 4072 rw_exit(&rp->r_hashq->r_lock); 4073 } 4074 /* 4075 * This call to rp_addfree will end up destroying the 4076 * rnode, but in a safe way with the appropriate set 4077 * of checks done. 
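		 *
		 * (It takes the destroy path because the rnode was unhashed
		 * just above, so the !(rp->r_flags & RHASHED) test in
		 * rp_addfree() applies; if r_count is still non-zero,
		 * rp_addfree() puts the rnode back on the freelist instead.)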
4078 */ 4079 rp_addfree(rp, CRED()); 4080 mutex_enter(&rpfreelist_lock); 4081 } 4082 mutex_exit(&rpfreelist_lock); 4083 return (freed); 4084 } 4085 4086 /*ARGSUSED*/ 4087 static void 4088 nfs_reclaim(void *cdrarg) 4089 { 4090 4091 #ifdef DEBUG 4092 clstat_debug.reclaim.value.ui64++; 4093 #endif 4094 if (nfs_free_reclaim()) 4095 return; 4096 4097 if (nfs_active_reclaim()) 4098 return; 4099 4100 (void) nfs_rnode_reclaim(); 4101 } 4102 4103 /* 4104 * NFS client failover support 4105 * 4106 * Routines to copy filehandles 4107 */ 4108 void 4109 nfscopyfh(caddr_t fhp, vnode_t *vp) 4110 { 4111 fhandle_t *dest = (fhandle_t *)fhp; 4112 4113 if (dest != NULL) 4114 *dest = *VTOFH(vp); 4115 } 4116 4117 void 4118 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4119 { 4120 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4121 4122 if (dest != NULL) 4123 *dest = *VTOFH3(vp); 4124 } 4125 4126 /* 4127 * NFS client failover support 4128 * 4129 * failover_safe() will test various conditions to ensure that 4130 * failover is permitted for this vnode. It will be denied 4131 * if: 4132 * 1) the operation in progress does not support failover (NULL fi) 4133 * 2) there are no available replicas (NULL mi_servers->sv_next) 4134 * 3) any locks are outstanding on this file 4135 */ 4136 static int 4137 failover_safe(failinfo_t *fi) 4138 { 4139 4140 /* 4141 * Does this op permit failover? 4142 */ 4143 if (fi == NULL || fi->vp == NULL) 4144 return (0); 4145 4146 /* 4147 * Are there any alternates to failover to? 4148 */ 4149 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL) 4150 return (0); 4151 4152 /* 4153 * Disable check; we've forced local locking 4154 * 4155 * if (flk_has_remote_locks(fi->vp)) 4156 * return (0); 4157 */ 4158 4159 /* 4160 * If we have no partial path, we can't do anything 4161 */ 4162 if (VTOR(fi->vp)->r_path == NULL) 4163 return (0); 4164 4165 return (1); 4166 } 4167 4168 #include <sys/thread.h> 4169 4170 /* 4171 * NFS client failover support 4172 * 4173 * failover_newserver() will start a search for a new server, 4174 * preferably by starting an async thread to do the work. If 4175 * someone is already doing this (recognizable by MI_BINDINPROG 4176 * being set), it will simply return and the calling thread 4177 * will queue on the mi_failover_cv condition variable. 4178 */ 4179 static void 4180 failover_newserver(mntinfo_t *mi) 4181 { 4182 /* 4183 * Check if someone else is doing this already 4184 */ 4185 mutex_enter(&mi->mi_lock); 4186 if (mi->mi_flags & MI_BINDINPROG) { 4187 mutex_exit(&mi->mi_lock); 4188 return; 4189 } 4190 mi->mi_flags |= MI_BINDINPROG; 4191 4192 /* 4193 * Need to hold the vfs struct so that it can't be released 4194 * while the failover thread is selecting a new server. 4195 */ 4196 VFS_HOLD(mi->mi_vfsp); 4197 4198 /* 4199 * Start a thread to do the real searching. 4200 */ 4201 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri); 4202 4203 mutex_exit(&mi->mi_lock); 4204 } 4205 4206 /* 4207 * NFS client failover support 4208 * 4209 * failover_thread() will find a new server to replace the one 4210 * currently in use, wake up other threads waiting on this mount 4211 * point, and die. It will start at the head of the server list 4212 * and poll servers until it finds one with an NFS server which is 4213 * registered and responds to a NULL procedure ping. 4214 * 4215 * XXX failover_thread is unsafe within the scope of the 4216 * present model defined for cpr to suspend the system. 4217 * Specifically, over-the-wire calls made by the thread 4218 * are unsafe. 
The thread needs to be reevaluated in case of 4219 * future updates to the cpr suspend model. 4220 */ 4221 static void 4222 failover_thread(mntinfo_t *mi) 4223 { 4224 servinfo_t *svp = NULL; 4225 CLIENT *cl; 4226 enum clnt_stat status; 4227 struct timeval tv; 4228 int error; 4229 int oncethru = 0; 4230 callb_cpr_t cprinfo; 4231 rnode_t *rp; 4232 int index; 4233 char *srvnames; 4234 size_t srvnames_len; 4235 struct nfs_clnt *nfscl = NULL; 4236 zoneid_t zoneid = getzoneid(); 4237 4238 #ifdef DEBUG 4239 /* 4240 * This is currently only needed to access counters which exist on 4241 * DEBUG kernels, hence we don't want to pay the penalty of the lookup 4242 * on non-DEBUG kernels. 4243 */ 4244 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 4245 ASSERT(nfscl != NULL); 4246 #endif 4247 4248 /* 4249 * Its safe to piggyback on the mi_lock since failover_newserver() 4250 * code guarantees that there will be only one failover thread 4251 * per mountinfo at any instance. 4252 */ 4253 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr, 4254 "failover_thread"); 4255 4256 mutex_enter(&mi->mi_lock); 4257 while (mi->mi_readers) { 4258 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4259 cv_wait(&mi->mi_failover_cv, &mi->mi_lock); 4260 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4261 } 4262 mutex_exit(&mi->mi_lock); 4263 4264 tv.tv_sec = 2; 4265 tv.tv_usec = 0; 4266 4267 /* 4268 * Ping the null NFS procedure of every server in 4269 * the list until one responds. We always start 4270 * at the head of the list and always skip the one 4271 * that is current, since it's caused us a problem. 4272 */ 4273 while (svp == NULL) { 4274 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 4275 if (!oncethru && svp == mi->mi_curr_serv) 4276 continue; 4277 4278 /* 4279 * If the file system was forcibly umounted 4280 * while trying to do a failover, then just 4281 * give up on the failover. It won't matter 4282 * what the server is. 
4283 */ 4284 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 4285 svp = NULL; 4286 goto done; 4287 } 4288 4289 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 4290 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl); 4291 if (error) 4292 continue; 4293 4294 if (!(mi->mi_flags & MI_INT)) 4295 cl->cl_nosignal = TRUE; 4296 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 4297 xdr_void, NULL, tv); 4298 if (!(mi->mi_flags & MI_INT)) 4299 cl->cl_nosignal = FALSE; 4300 AUTH_DESTROY(cl->cl_auth); 4301 CLNT_DESTROY(cl); 4302 if (status == RPC_SUCCESS) { 4303 if (svp == mi->mi_curr_serv) { 4304 #ifdef DEBUG 4305 zcmn_err(zoneid, CE_NOTE, 4306 "NFS%d: failing over: selecting original server %s", 4307 mi->mi_vers, svp->sv_hostname); 4308 #else 4309 zcmn_err(zoneid, CE_NOTE, 4310 "NFS: failing over: selecting original server %s", 4311 svp->sv_hostname); 4312 #endif 4313 } else { 4314 #ifdef DEBUG 4315 zcmn_err(zoneid, CE_NOTE, 4316 "NFS%d: failing over from %s to %s", 4317 mi->mi_vers, 4318 mi->mi_curr_serv->sv_hostname, 4319 svp->sv_hostname); 4320 #else 4321 zcmn_err(zoneid, CE_NOTE, 4322 "NFS: failing over from %s to %s", 4323 mi->mi_curr_serv->sv_hostname, 4324 svp->sv_hostname); 4325 #endif 4326 } 4327 break; 4328 } 4329 } 4330 4331 if (svp == NULL) { 4332 if (!oncethru) { 4333 srvnames = nfs_getsrvnames(mi, &srvnames_len); 4334 #ifdef DEBUG 4335 zprintf(zoneid, 4336 "NFS%d servers %s not responding " 4337 "still trying\n", mi->mi_vers, srvnames); 4338 #else 4339 zprintf(zoneid, "NFS servers %s not responding " 4340 "still trying\n", srvnames); 4341 #endif 4342 oncethru = 1; 4343 } 4344 mutex_enter(&mi->mi_lock); 4345 CALLB_CPR_SAFE_BEGIN(&cprinfo); 4346 mutex_exit(&mi->mi_lock); 4347 delay(hz); 4348 mutex_enter(&mi->mi_lock); 4349 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock); 4350 mutex_exit(&mi->mi_lock); 4351 } 4352 } 4353 4354 if (oncethru) { 4355 #ifdef DEBUG 4356 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames); 4357 #else 4358 zprintf(zoneid, "NFS servers %s ok\n", srvnames); 4359 #endif 4360 } 4361 4362 if (svp != mi->mi_curr_serv) { 4363 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 4364 index = rtablehash(&mi->mi_curr_serv->sv_fhandle); 4365 rw_enter(&rtable[index].r_lock, RW_WRITER); 4366 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle, 4367 mi->mi_vfsp); 4368 if (rp != NULL) { 4369 if (rp->r_flags & RHASHED) 4370 rp_rmhash_locked(rp); 4371 rw_exit(&rtable[index].r_lock); 4372 rp->r_server = svp; 4373 rp->r_fh = svp->sv_fhandle; 4374 (void) nfs_free_data_reclaim(rp); 4375 index = rtablehash(&rp->r_fh); 4376 rp->r_hashq = &rtable[index]; 4377 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4378 vn_exists(RTOV(rp)); 4379 rp_addhash(rp); 4380 rw_exit(&rp->r_hashq->r_lock); 4381 VN_RELE(RTOV(rp)); 4382 } else 4383 rw_exit(&rtable[index].r_lock); 4384 } 4385 4386 done: 4387 if (oncethru) 4388 kmem_free(srvnames, srvnames_len); 4389 mutex_enter(&mi->mi_lock); 4390 mi->mi_flags &= ~MI_BINDINPROG; 4391 if (svp != NULL) { 4392 mi->mi_curr_serv = svp; 4393 mi->mi_failover++; 4394 #ifdef DEBUG 4395 nfscl->nfscl_stat.failover.value.ui64++; 4396 #endif 4397 } 4398 cv_broadcast(&mi->mi_failover_cv); 4399 CALLB_CPR_EXIT(&cprinfo); 4400 VFS_RELE(mi->mi_vfsp); 4401 zthread_exit(); 4402 /* NOTREACHED */ 4403 } 4404 4405 /* 4406 * NFS client failover support 4407 * 4408 * failover_wait() will put the thread to sleep until MI_BINDINPROG 4409 * is cleared, meaning that failover is complete. Called with 4410 * mi_lock mutex held. 
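 *
 * A hypothetical caller (illustrative only) looks like:
 *
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);	(returns with mi_lock still held)
 *	mutex_exit(&mi->mi_lock);
 *	if (error == EINTR)
 *		return (error);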
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t smask;

	/*
	 * If someone else is hunting for a living server,
	 * sleep until it's done.  After our sleep, we may
	 * be bound to the right server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM (preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
			/*
			 * restore original signal mask
			 */
			sigunintr(&smask);
			return (EINTR);
		}
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);
	}
	return (0);
}

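/*
 * Illustrative sketch only (nothing in this file calls it): how a request
 * path might honor failover_wait()'s locking contract.  The helper name is
 * hypothetical; the mi_lock/MI_BINDINPROG usage simply mirrors the comment
 * above.
 */
static int
example_wait_for_failover(mntinfo_t *mi)
{
	int error;

	mutex_enter(&mi->mi_lock);
	/* Sleeps until MI_BINDINPROG clears; may return EINTR on intr mounts */
	error = failover_wait(mi);
	mutex_exit(&mi->mi_lock);

	return (error);		/* 0 on success, EINTR if interrupted */
}
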
/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *   pointed to by the fi->fhp pointer if it is non-NULL.
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * We failed over, but the new server may be
			 * physically the same machine or may share the same
			 * disk subsystem.  In that case the file handle for
			 * a given path does not change, so the filehandle
			 * lookup locates the same rnode as the existing one.
			 * All we might need to do is update r_server with
			 * the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in and already done
		 * the remap for this particular rnode.  If rp->r_server
		 * already matches mi->mi_curr_serv, there is nothing left
		 * to do, so just return.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * what we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * Snarf the filehandle from the new rnode, then release
		 * it, updating the hash queues for the rnode as we go.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}

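/*
 * Illustrative sketch only (nothing in this file calls it): the minimum a
 * caller has to fill in before handing a failinfo_t to failover_remap().
 * The helper name is hypothetical; leaving fhp/copyproc NULL simply means
 * there is no copied filehandle for failover_remap() to update.
 */
static int
example_remap(vnode_t *vp,
    int (*lookup)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdir)(vnode_t *, vnode_t **, bool_t, cred_t *, int))
{
	failinfo_t fi;

	bzero(&fi, sizeof (fi));
	fi.vp = vp;			/* vnode needing a new filehandle */
	fi.fhp = NULL;			/* no copied filehandle to refresh */
	fi.copyproc = NULL;
	fi.lookupproc = lookup;		/* version-specific lookup routine */
	fi.xattrdirproc = xattrdir;	/* version-specific xattr dir lookup */

	return (failover_remap(&fi));
}
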
/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, or ENOTDIR
 * is a hard error, because it means something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
 */
static int
failover_lookup(char *path, vnode_t *root,
    int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
    vnode_t *, cred_t *, int),
    int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
    vnode_t **new)
{
	vnode_t *dvp, *nvp;
	int error = EINVAL;
	char *s, *p, *tmppath;
	size_t len;
	mntinfo_t *mi;
	bool_t xattr;

	/* Make local copy of path */
	len = strlen(path) + 1;
	tmppath = kmem_alloc(len, KM_SLEEP);
	(void) strcpy(tmppath, path);
	s = tmppath;

	dvp = root;
	VN_HOLD(dvp);
	mi = VTOMI(root);
	xattr = mi->mi_flags & MI_EXTATTR;

	do {
		p = strchr(s, '/');
		if (p != NULL)
			*p = '\0';
		if (xattr && strcmp(s, XATTR_RPATH) == 0) {
			error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
			    RFSCALL_SOFT);
		} else {
			error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
			    CRED(), RFSCALL_SOFT);
		}
		if (p != NULL)
			*p++ = '/';
		if (error) {
			VN_RELE(dvp);
			kmem_free(tmppath, len);
			return (error);
		}
		s = p;
		VN_RELE(dvp);
		dvp = nvp;
	} while (p != NULL);

	if (nvp != NULL && new != NULL)
		*new = nvp;
	kmem_free(tmppath, len);
	return (0);
}

/*
 * NFS client failover support
 *
 * sv_free() frees the malloc'd portion of a "servinfo_t".
 */
void
sv_free(servinfo_t *svp)
{
	servinfo_t *next;
	struct knetconfig *knconf;

	while (svp != NULL) {
		next = svp->sv_next;
		if (svp->sv_secdata)
			sec_clnt_freeinfo(svp->sv_secdata);
		if (svp->sv_hostname && svp->sv_hostnamelen > 0)
			kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
		knconf = svp->sv_knconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		knconf = svp->sv_origknconf;
		if (knconf != NULL) {
			if (knconf->knc_protofmly != NULL)
				kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
			if (knconf->knc_proto != NULL)
				kmem_free(knconf->knc_proto, KNC_STRSIZE);
			kmem_free(knconf, sizeof (*knconf));
		}
		if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
			kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
		mutex_destroy(&svp->sv_lock);
		kmem_free(svp, sizeof (*svp));
		svp = next;
	}
}

/*
 * Can only return non-zero if intr != 0.
 */
int
nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
{

	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as there were enters.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * While there is a writer active or writers are waiting,
		 * wait for them to finish up and move on.  Then, increment
		 * the count to indicate that a reader is active.
		 */
		while (l->count < 0 || l->waiters > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&l->cv, &l->lock)) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
		}
		ASSERT(l->count < INT_MAX);
#ifdef DEBUG
		if ((l->count % 10000) == 9999)
			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
			    "rwlock @ %p\n", l->count, (void *)l);
#endif
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * While there are readers active or a writer is active,
		 * wait for all of the readers to finish or for the
		 * writer to finish.  Then, set the owner field to
		 * curthread and decrement count to indicate that a
		 * writer is active.
		 */
		while (l->count > 0 || l->owner != NULL) {
			l->waiters++;
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&l->cv, &l->lock)) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					l->waiters--;
					cv_broadcast(&l->cv);
					mutex_exit(&l->lock);
					return (EINTR);
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&l->cv, &l->lock);
			l->waiters--;
		}
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (0);
}

/*
 * If the lock is available, obtain it and return non-zero.  If there is
 * already a conflicting lock, return 0 immediately.
 */

int
nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
{
	mutex_enter(&l->lock);

	/*
	 * If this is a nested enter, then allow it.  There
	 * must be as many exits as there were enters.
	 */
	if (l->owner == curthread) {
		/* lock is held for writing by current thread */
		ASSERT(rw == RW_READER || rw == RW_WRITER);
		l->count--;
	} else if (rw == RW_READER) {
		/*
		 * If there is a writer active or writers waiting, deny the
		 * lock.  Otherwise, bump the count of readers.
		 */
		if (l->count < 0 || l->waiters > 0) {
			mutex_exit(&l->lock);
			return (0);
		}
		l->count++;
	} else {
		ASSERT(rw == RW_WRITER);
		/*
		 * If there are readers active or a writer active, deny the
		 * lock.  Otherwise, set the owner field to curthread and
		 * decrement count to indicate that a writer is active.
		 */
		if (l->count > 0 || l->owner != NULL) {
			mutex_exit(&l->lock);
			return (0);
		}
		l->owner = curthread;
		l->count--;
	}

	mutex_exit(&l->lock);

	return (1);
}

void
nfs_rw_exit(nfs_rwlock_t *l)
{

	mutex_enter(&l->lock);
	/*
	 * If this is releasing a writer lock, then increment count to
	 * indicate that there is one less writer active.  If this was
	 * the last of possibly nested writer locks, then clear the owner
	 * field as well to indicate that there is no writer active
	 * and wake up any waiting writers or readers.
	 *
	 * If releasing a reader lock, then just decrement count to
	 * indicate that there is one less reader active.  If this was
	 * the last active reader and there are writer(s) waiting,
	 * then wake up the first.
	 */
	if (l->owner != NULL) {
		ASSERT(l->owner == curthread);
		l->count++;
		if (l->count == 0) {
			l->owner = NULL;
			cv_broadcast(&l->cv);
		}
	} else {
		ASSERT(l->count > 0);
		l->count--;
		if (l->count == 0 && l->waiters > 0)
			cv_broadcast(&l->cv);
	}
	mutex_exit(&l->lock);
}

int
nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
{

	if (rw == RW_READER)
		return (l->count > 0);
	ASSERT(rw == RW_WRITER);
	return (l->count < 0);
}

/* ARGSUSED */
void
nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
{

	l->count = 0;
	l->waiters = 0;
	l->owner = NULL;
	mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
}

void
nfs_rw_destroy(nfs_rwlock_t *l)
{

	mutex_destroy(&l->lock);
	cv_destroy(&l->cv);
}

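/*
 * Illustrative sketch only (nothing in this file calls it): the typical
 * life cycle of an nfs_rwlock_t using the routines above.  The helper name
 * is hypothetical; RW_DEFAULT is just a placeholder since nfs_rw_init()
 * ignores its type argument (note the ARGSUSED).
 */
static int
example_nfs_rwlock_usage(int intr)
{
	nfs_rwlock_t lk;
	int error;

	nfs_rw_init(&lk, NULL, RW_DEFAULT, NULL);

	/* Blocking writer enter; can fail with EINTR only when intr != 0. */
	error = nfs_rw_enter_sig(&lk, RW_WRITER, intr);
	if (error == 0) {
		ASSERT(nfs_rw_lock_held(&lk, RW_WRITER));
		nfs_rw_exit(&lk);
	}

	/* Non-blocking reader enter; returns non-zero only on success. */
	if (nfs_rw_tryenter(&lk, RW_READER))
		nfs_rw_exit(&lk);

	nfs_rw_destroy(&lk);
	return (error);
}
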
int
nfs3_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs3_cookie == b->nfs3_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs3_cookie < b->nfs3_cookie)
		return (-1);

	return (1);
}

int
nfs_rddir_compar(const void *x, const void *y)
{
	rddir_cache *a = (rddir_cache *)x;
	rddir_cache *b = (rddir_cache *)y;

	if (a->nfs_cookie == b->nfs_cookie) {
		if (a->buflen == b->buflen)
			return (0);
		if (a->buflen < b->buflen)
			return (-1);
		return (1);
	}

	if (a->nfs_cookie < b->nfs_cookie)
		return (-1);

	return (1);
}

static char *
nfs_getsrvnames(mntinfo_t *mi, size_t *len)
{
	servinfo_t *s;
	char *srvnames;
	char *namep;
	size_t length;

	/*
	 * Calculate the length of the string required to hold all
	 * of the server names plus either a comma or a null
	 * character following each individual one.
	 */
	length = 0;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next)
		length += s->sv_hostnamelen;

	srvnames = kmem_alloc(length, KM_SLEEP);

	namep = srvnames;
	for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
		(void) strcpy(namep, s->sv_hostname);
		namep += s->sv_hostnamelen - 1;
		*namep++ = ',';
	}
	*--namep = '\0';

	*len = length;

	return (srvnames);
}

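/*
 * Illustrative sketch only (nothing in this file calls it): how a caller
 * consumes nfs_getsrvnames().  Because sv_hostnamelen counts the trailing
 * NUL of each hostname, the summed length is exactly enough for the
 * comma-separated list, e.g. "alpha" (6) + "beta" (5) = 11 bytes for
 * "alpha,beta\0".  The helper name is hypothetical.
 */
static void
example_log_srvnames(mntinfo_t *mi)
{
	char *names;
	size_t len;

	names = nfs_getsrvnames(mi, &len);
	zprintf(getzoneid(), "NFS servers: %s\n", names);
	kmem_free(names, len);		/* must free with the returned length */
}
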
/*
 * These two functions are temporary and designed for the upgrade-workaround
 * only.  They cannot be used for general zone-crossing NFS client support, and
 * will be removed shortly.
 *
 * When the workaround is enabled, all NFS traffic is forced into the global
 * zone.  These functions are called when the code needs to refer to the state
 * of the underlying network connection.  They're not called when the function
 * needs to refer to the state of the process that invoked the system call.
 * (E.g., when checking whether the zone is shutting down during the mount()
 * call.)
 */

struct zone *
nfs_zone(void)
{
	return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
}

zoneid_t
nfs_zoneid(void)
{
	return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
}

/*
 * nfs_mount_label_policy:
 *	Determine whether the mount is allowed according to MAC check,
 *	by comparing (where appropriate) label of the remote server
 *	against the label of the zone being mounted into.
 *
 *	Returns:
 *		 0 :	access allowed
 *		-1 :	read-only access allowed (i.e., read-down)
 *		>0 :	error code, such as EACCES
 */
int
nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr)
{
	int		addr_type;
	void		*ipaddr;
	bslabel_t	*server_sl, *mntlabel;
	zone_t		*mntzone = NULL;
	ts_label_t	*zlabel;
	tsol_tpc_t	*tp;
	ts_label_t	*tsl = NULL;
	int		retv;

	/*
	 * Get the zone's label.  Each zone on a labeled system has a label.
	 */
	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
	zlabel = mntzone->zone_slabel;
	ASSERT(zlabel != NULL);
	label_hold(zlabel);

	if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
		addr_type = IPV4_VERSION;
		ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
	} else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
		addr_type = IPV6_VERSION;
		ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
	} else {
		retv = 0;
		goto out;
	}

	retv = EACCES;				/* assume the worst */

	/*
	 * Next, get the assigned label of the remote server.
	 */
	tp = find_tpc(ipaddr, addr_type, B_FALSE);
	if (tp == NULL)
		goto out;			/* error getting host entry */

	if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
		goto rel_tpc;			/* invalid domain */
	if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
	    (tp->tpc_tp.host_type != UNLABELED))
		goto rel_tpc;			/* invalid hosttype */

	if (tp->tpc_tp.host_type == SUN_CIPSO) {
		tsl = getflabel_cipso(vfsp);
		if (tsl == NULL)
			goto rel_tpc;		/* error getting server lbl */

		server_sl = label2bslabel(tsl);
	} else {	/* UNLABELED */
		server_sl = &tp->tpc_tp.tp_def_label;
	}

	mntlabel = label2bslabel(zlabel);

	/*
	 * Now compare labels to complete the MAC check.  If the labels
	 * are equal or if the requestor is in the global zone and has
	 * NET_MAC_AWARE, then allow read-write access.  (Except for
	 * mounts into the global zone itself; restrict these to
	 * read-only.)
	 *
	 * If the requestor is in some other zone, but its label
	 * dominates the server's, then allow read-down.
	 *
	 * Otherwise, access is denied.
	 */
	if (blequal(mntlabel, server_sl) ||
	    (crgetzoneid(cr) == GLOBAL_ZONEID &&
	    getpflags(NET_MAC_AWARE, cr) != 0)) {
		if ((mntzone == global_zone) ||
		    !blequal(mntlabel, server_sl))
			retv = -1;		/* read-only */
		else
			retv = 0;		/* access OK */
	} else if (bldominates(mntlabel, server_sl)) {
		retv = -1;			/* read-only */
	} else {
		retv = EACCES;
	}

	if (tsl != NULL)
		label_rele(tsl);

rel_tpc:
	TPC_RELE(tp);
out:
	if (mntzone)
		zone_rele(mntzone);
	label_rele(zlabel);
	return (retv);
}

boolean_t
nfs_has_ctty(void)
{
	boolean_t rv;
	mutex_enter(&curproc->p_splock);
	rv = (curproc->p_sessp->s_vp != NULL);
	mutex_exit(&curproc->p_splock);
	return (rv);
}

/*
 * Look in the xattr directory to see if it has any generic user attributes.
 */
int
do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	struct uio uio;
	struct iovec iov;
	char *dbuf;
	struct dirent64 *dp;
	size_t dlen = 8 * 1024;
	size_t dbuflen;
	int eof = 0;
	int error;

	*valp = 0;
	dbuf = kmem_alloc(dlen, KM_SLEEP);
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_fmode = 0;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = 0;
	uio.uio_resid = dlen;
	iov.iov_base = dbuf;
	iov.iov_len = dlen;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

	dbuflen = dlen - uio.uio_resid;

	if (error || dbuflen == 0) {
		kmem_free(dbuf, dlen);
		return (error);
	}

	dp = (dirent64_t *)dbuf;

	while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
		if (strcmp(dp->d_name, ".") == 0 ||
		    strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
		    VIEW_READWRITE) == 0 || strcmp(dp->d_name,
		    VIEW_READONLY) == 0) {
			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
			continue;
		}

		*valp = 1;
		break;
	}
	kmem_free(dbuf, dlen);
	return (0);
}
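
/*
 * Illustrative sketch only (nothing in this file calls it): how a mount
 * path might act on nfs_mount_label_policy()'s three-way return value.
 * The helper name and the force_ro output flag are hypothetical; the
 * return-value convention is the one documented above.
 */
static int
example_apply_label_policy(vfs_t *vfsp, struct netbuf *addr,
    struct knetconfig *knconf, cred_t *cr, boolean_t *force_ro)
{
	int rv;

	*force_ro = B_FALSE;
	rv = nfs_mount_label_policy(vfsp, addr, knconf, cr);
	if (rv > 0)
		return (rv);		/* MAC check denied the mount */
	if (rv == -1)
		*force_ro = B_TRUE;	/* read-down: allow read-only only */
	return (0);
}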