1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 28 * Copyright (c) 2016 by Delphix. All rights reserved. 
29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/cred.h> 35 #include <sys/proc.h> 36 #include <sys/user.h> 37 #include <sys/time.h> 38 #include <sys/buf.h> 39 #include <sys/vfs.h> 40 #include <sys/vnode.h> 41 #include <sys/socket.h> 42 #include <sys/uio.h> 43 #include <sys/tiuser.h> 44 #include <sys/swap.h> 45 #include <sys/errno.h> 46 #include <sys/debug.h> 47 #include <sys/kmem.h> 48 #include <sys/kstat.h> 49 #include <sys/cmn_err.h> 50 #include <sys/vtrace.h> 51 #include <sys/session.h> 52 #include <sys/dnlc.h> 53 #include <sys/bitmap.h> 54 #include <sys/acl.h> 55 #include <sys/ddi.h> 56 #include <sys/pathname.h> 57 #include <sys/flock.h> 58 #include <sys/dirent.h> 59 #include <sys/flock.h> 60 #include <sys/callb.h> 61 #include <sys/atomic.h> 62 #include <sys/list.h> 63 #include <sys/tsol/tnet.h> 64 #include <sys/priv.h> 65 #include <sys/sdt.h> 66 #include <sys/attr.h> 67 68 #include <inet/ip6.h> 69 70 #include <rpc/types.h> 71 #include <rpc/xdr.h> 72 #include <rpc/auth.h> 73 #include <rpc/clnt.h> 74 75 #include <nfs/nfs.h> 76 #include <nfs/nfs4.h> 77 #include <nfs/nfs_clnt.h> 78 #include <nfs/rnode.h> 79 #include <nfs/nfs_acl.h> 80 81 #include <sys/tsol/label.h> 82 83 /* 84 * The hash queues for the access to active and cached rnodes 85 * are organized as doubly linked lists. A reader/writer lock 86 * for each hash bucket is used to control access and to synchronize 87 * lookups, additions, and deletions from the hash queue. 88 * 89 * The rnode freelist is organized as a doubly linked list with 90 * a head pointer. Additions and deletions are synchronized via 91 * a single mutex. 92 * 93 * In order to add an rnode to the free list, it must be hashed into 94 * a hash queue and the exclusive lock to the hash queue be held. 95 * If an rnode is not hashed into a hash queue, then it is destroyed 96 * because it represents no valuable information that can be reused 97 * about the file. 
The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it and assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It cannot be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
123 * 124 * The lock ordering is: 125 * 126 * hash bucket lock -> vnode lock 127 * hash bucket lock -> freelist lock 128 */ 129 static rhashq_t *rtable; 130 131 static kmutex_t rpfreelist_lock; 132 static rnode_t *rpfreelist = NULL; 133 static long rnew = 0; 134 long nrnode = 0; 135 136 static int rtablesize; 137 static int rtablemask; 138 139 static int hashlen = 4; 140 141 static struct kmem_cache *rnode_cache; 142 143 /* 144 * Mutex to protect the following variables: 145 * nfs_major 146 * nfs_minor 147 */ 148 kmutex_t nfs_minor_lock; 149 int nfs_major; 150 int nfs_minor; 151 152 /* Do we allow preepoch (negative) time values otw? */ 153 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */ 154 155 /* 156 * Access cache 157 */ 158 static acache_hash_t *acache; 159 static long nacache; /* used strictly to size the number of hash queues */ 160 161 static int acachesize; 162 static int acachemask; 163 static struct kmem_cache *acache_cache; 164 165 /* 166 * Client side utilities 167 */ 168 169 /* 170 * client side statistics 171 */ 172 static const struct clstat clstat_tmpl = { 173 { "calls", KSTAT_DATA_UINT64 }, 174 { "badcalls", KSTAT_DATA_UINT64 }, 175 { "clgets", KSTAT_DATA_UINT64 }, 176 { "cltoomany", KSTAT_DATA_UINT64 }, 177 #ifdef DEBUG 178 { "clalloc", KSTAT_DATA_UINT64 }, 179 { "noresponse", KSTAT_DATA_UINT64 }, 180 { "failover", KSTAT_DATA_UINT64 }, 181 { "remap", KSTAT_DATA_UINT64 }, 182 #endif 183 }; 184 185 /* 186 * The following are statistics that describe behavior of the system as a whole 187 * and doesn't correspond to any one particular zone. 
188 */ 189 #ifdef DEBUG 190 static struct clstat_debug { 191 kstat_named_t nrnode; /* number of allocated rnodes */ 192 kstat_named_t access; /* size of access cache */ 193 kstat_named_t dirent; /* size of readdir cache */ 194 kstat_named_t dirents; /* size of readdir buf cache */ 195 kstat_named_t reclaim; /* number of reclaims */ 196 kstat_named_t clreclaim; /* number of cl reclaims */ 197 kstat_named_t f_reclaim; /* number of free reclaims */ 198 kstat_named_t a_reclaim; /* number of active reclaims */ 199 kstat_named_t r_reclaim; /* number of rnode reclaims */ 200 kstat_named_t rpath; /* bytes used to store rpaths */ 201 } clstat_debug = { 202 { "nrnode", KSTAT_DATA_UINT64 }, 203 { "access", KSTAT_DATA_UINT64 }, 204 { "dirent", KSTAT_DATA_UINT64 }, 205 { "dirents", KSTAT_DATA_UINT64 }, 206 { "reclaim", KSTAT_DATA_UINT64 }, 207 { "clreclaim", KSTAT_DATA_UINT64 }, 208 { "f_reclaim", KSTAT_DATA_UINT64 }, 209 { "a_reclaim", KSTAT_DATA_UINT64 }, 210 { "r_reclaim", KSTAT_DATA_UINT64 }, 211 { "r_path", KSTAT_DATA_UINT64 }, 212 }; 213 #endif /* DEBUG */ 214 215 /* 216 * We keep a global list of per-zone client data, so we can clean up all zones 217 * if we get low on memory. 218 */ 219 static list_t nfs_clnt_list; 220 static kmutex_t nfs_clnt_list_lock; 221 static zone_key_t nfsclnt_zone_key; 222 223 static struct kmem_cache *chtab_cache; 224 225 /* 226 * Some servers do not properly update the attributes of the 227 * directory when changes are made. 
To allow interoperability 228 * with these broken servers, the nfs_disable_rddir_cache 229 * parameter must be set in /etc/system 230 */ 231 int nfs_disable_rddir_cache = 0; 232 233 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, 234 struct chtab **); 235 void clfree(CLIENT *, struct chtab *); 236 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 237 struct chtab **, struct nfs_clnt *); 238 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, 239 struct chtab **, struct nfs_clnt *); 240 static void clreclaim(void *); 241 static int nfs_feedback(int, int, mntinfo_t *); 242 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 243 caddr_t, cred_t *, int *, enum clnt_stat *, int, 244 failinfo_t *); 245 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, 246 caddr_t, cred_t *, int *, int, failinfo_t *); 247 static void rinactive(rnode_t *, cred_t *); 248 static int rtablehash(nfs_fhandle *); 249 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, 250 struct vnodeops *, 251 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, 252 cred_t *), 253 int (*)(const void *, const void *), int *, cred_t *, 254 char *, char *); 255 static void rp_rmfree(rnode_t *); 256 static void rp_addhash(rnode_t *); 257 static void rp_rmhash_locked(rnode_t *); 258 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); 259 static void destroy_rnode(rnode_t *); 260 static void rddir_cache_free(rddir_cache *); 261 static int nfs_free_data_reclaim(rnode_t *); 262 static int nfs_active_data_reclaim(rnode_t *); 263 static int nfs_free_reclaim(void); 264 static int nfs_active_reclaim(void); 265 static int nfs_rnode_reclaim(void); 266 static void nfs_reclaim(void *); 267 static int failover_safe(failinfo_t *); 268 static void failover_newserver(mntinfo_t *mi); 269 static void failover_thread(mntinfo_t *mi); 270 static int failover_wait(mntinfo_t *); 271 static int 
failover_remap(failinfo_t *); 272 static int failover_lookup(char *, vnode_t *, 273 int (*)(vnode_t *, char *, vnode_t **, 274 struct pathname *, int, vnode_t *, cred_t *, int), 275 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 276 vnode_t **); 277 static void nfs_free_r_path(rnode_t *); 278 static void nfs_set_vroot(vnode_t *); 279 static char *nfs_getsrvnames(mntinfo_t *, size_t *); 280 281 /* 282 * from rpcsec module (common/rpcsec) 283 */ 284 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); 285 extern void sec_clnt_freeh(AUTH *); 286 extern void sec_clnt_freeinfo(struct sec_data *); 287 288 /* 289 * used in mount policy 290 */ 291 extern ts_label_t *getflabel_cipso(vfs_t *); 292 293 /* 294 * EIO or EINTR are not recoverable errors. 295 */ 296 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO)) 297 298 #ifdef DEBUG 299 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n" 300 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n" 301 #else 302 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n" 303 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n" 304 #endif 305 /* 306 * Common handle get program for NFS, NFS ACL, and NFS AUTH client. 307 */ 308 static int 309 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 310 struct chtab **chp, struct nfs_clnt *nfscl) 311 { 312 struct chhead *ch, *newch; 313 struct chhead **plistp; 314 struct chtab *cp; 315 int error; 316 k_sigset_t smask; 317 318 if (newcl == NULL || chp == NULL || ci == NULL) 319 return (EINVAL); 320 321 *newcl = NULL; 322 *chp = NULL; 323 324 /* 325 * Find an unused handle or create one 326 */ 327 newch = NULL; 328 nfscl->nfscl_stat.clgets.value.ui64++; 329 top: 330 /* 331 * Find the correct entry in the cache to check for free 332 * client handles. 
The search is based on the RPC program 333 * number, program version number, dev_t for the transport 334 * device, and the protocol family. 335 */ 336 mutex_enter(&nfscl->nfscl_chtable_lock); 337 plistp = &nfscl->nfscl_chtable; 338 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 339 if (ch->ch_prog == ci->cl_prog && 340 ch->ch_vers == ci->cl_vers && 341 ch->ch_dev == svp->sv_knconf->knc_rdev && 342 (strcmp(ch->ch_protofmly, 343 svp->sv_knconf->knc_protofmly) == 0)) 344 break; 345 plistp = &ch->ch_next; 346 } 347 348 /* 349 * If we didn't find a cache entry for this quadruple, then 350 * create one. If we don't have one already preallocated, 351 * then drop the cache lock, create one, and then start over. 352 * If we did have a preallocated entry, then just add it to 353 * the front of the list. 354 */ 355 if (ch == NULL) { 356 if (newch == NULL) { 357 mutex_exit(&nfscl->nfscl_chtable_lock); 358 newch = kmem_alloc(sizeof (*newch), KM_SLEEP); 359 newch->ch_timesused = 0; 360 newch->ch_prog = ci->cl_prog; 361 newch->ch_vers = ci->cl_vers; 362 newch->ch_dev = svp->sv_knconf->knc_rdev; 363 newch->ch_protofmly = kmem_alloc( 364 strlen(svp->sv_knconf->knc_protofmly) + 1, 365 KM_SLEEP); 366 (void) strcpy(newch->ch_protofmly, 367 svp->sv_knconf->knc_protofmly); 368 newch->ch_list = NULL; 369 goto top; 370 } 371 ch = newch; 372 newch = NULL; 373 ch->ch_next = nfscl->nfscl_chtable; 374 nfscl->nfscl_chtable = ch; 375 /* 376 * We found a cache entry, but if it isn't on the front of the 377 * list, then move it to the front of the list to try to take 378 * advantage of locality of operations. 379 */ 380 } else if (ch != nfscl->nfscl_chtable) { 381 *plistp = ch->ch_next; 382 ch->ch_next = nfscl->nfscl_chtable; 383 nfscl->nfscl_chtable = ch; 384 } 385 386 /* 387 * If there was a free client handle cached, then remove it 388 * from the list, init it, and use it. 
389 */ 390 if (ch->ch_list != NULL) { 391 cp = ch->ch_list; 392 ch->ch_list = cp->ch_list; 393 mutex_exit(&nfscl->nfscl_chtable_lock); 394 if (newch != NULL) { 395 kmem_free(newch->ch_protofmly, 396 strlen(newch->ch_protofmly) + 1); 397 kmem_free(newch, sizeof (*newch)); 398 } 399 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, 400 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); 401 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 402 &cp->ch_client->cl_auth); 403 if (error || cp->ch_client->cl_auth == NULL) { 404 CLNT_DESTROY(cp->ch_client); 405 kmem_cache_free(chtab_cache, cp); 406 return ((error != 0) ? error : EINTR); 407 } 408 ch->ch_timesused++; 409 *newcl = cp->ch_client; 410 *chp = cp; 411 return (0); 412 } 413 414 /* 415 * There weren't any free client handles which fit, so allocate 416 * a new one and use that. 417 */ 418 #ifdef DEBUG 419 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64); 420 #endif 421 mutex_exit(&nfscl->nfscl_chtable_lock); 422 423 nfscl->nfscl_stat.cltoomany.value.ui64++; 424 if (newch != NULL) { 425 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); 426 kmem_free(newch, sizeof (*newch)); 427 } 428 429 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); 430 cp->ch_head = ch; 431 432 sigintr(&smask, (int)ci->cl_flags & MI_INT); 433 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, 434 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); 435 sigunintr(&smask); 436 437 if (error != 0) { 438 kmem_cache_free(chtab_cache, cp); 439 #ifdef DEBUG 440 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 441 #endif 442 /* 443 * Warning is unnecessary if error is EINTR. 
444 */ 445 if (error != EINTR) { 446 nfs_cmn_err(error, CE_WARN, 447 "clget: couldn't create handle: %m\n"); 448 } 449 return (error); 450 } 451 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); 452 auth_destroy(cp->ch_client->cl_auth); 453 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, 454 &cp->ch_client->cl_auth); 455 if (error || cp->ch_client->cl_auth == NULL) { 456 CLNT_DESTROY(cp->ch_client); 457 kmem_cache_free(chtab_cache, cp); 458 #ifdef DEBUG 459 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64); 460 #endif 461 return ((error != 0) ? error : EINTR); 462 } 463 ch->ch_timesused++; 464 *newcl = cp->ch_client; 465 ASSERT(cp->ch_client->cl_nosignal == FALSE); 466 *chp = cp; 467 return (0); 468 } 469 470 int 471 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 472 struct chtab **chp) 473 { 474 struct nfs_clnt *nfscl; 475 476 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 477 ASSERT(nfscl != NULL); 478 479 return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); 480 } 481 482 static int 483 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 484 struct chtab **chp, struct nfs_clnt *nfscl) 485 { 486 clinfo_t ci; 487 int error; 488 489 /* 490 * Set read buffer size to rsize 491 * and add room for RPC headers. 492 */ 493 ci.cl_readsize = mi->mi_tsize; 494 if (ci.cl_readsize != 0) 495 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 496 497 /* 498 * If soft mount and server is down just try once. 499 * meaning: do not retransmit. 500 */ 501 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 502 ci.cl_retrans = 0; 503 else 504 ci.cl_retrans = mi->mi_retrans; 505 506 ci.cl_prog = NFS_ACL_PROGRAM; 507 ci.cl_vers = mi->mi_vers; 508 ci.cl_flags = mi->mi_flags; 509 510 /* 511 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 512 * security flavor, the client tries to establish a security context 513 * by contacting the server. If the connection is timed out or reset, 514 * e.g. 
server reboot, we will try again. 515 */ 516 do { 517 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 518 519 if (error == 0) 520 break; 521 522 /* 523 * For forced unmount or zone shutdown, bail out, no retry. 524 */ 525 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 526 error = EIO; 527 break; 528 } 529 530 /* do not retry for softmount */ 531 if (!(mi->mi_flags & MI_HARD)) 532 break; 533 534 /* let the caller deal with the failover case */ 535 if (FAILOVER_MOUNT(mi)) 536 break; 537 538 } while (error == ETIMEDOUT || error == ECONNRESET); 539 540 return (error); 541 } 542 543 static int 544 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, 545 struct chtab **chp, struct nfs_clnt *nfscl) 546 { 547 clinfo_t ci; 548 int error; 549 550 /* 551 * Set read buffer size to rsize 552 * and add room for RPC headers. 553 */ 554 ci.cl_readsize = mi->mi_tsize; 555 if (ci.cl_readsize != 0) 556 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); 557 558 /* 559 * If soft mount and server is down just try once. 560 * meaning: do not retransmit. 561 */ 562 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) 563 ci.cl_retrans = 0; 564 else 565 ci.cl_retrans = mi->mi_retrans; 566 567 ci.cl_prog = mi->mi_prog; 568 ci.cl_vers = mi->mi_vers; 569 ci.cl_flags = mi->mi_flags; 570 571 /* 572 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS 573 * security flavor, the client tries to establish a security context 574 * by contacting the server. If the connection is timed out or reset, 575 * e.g. server reboot, we will try again. 576 */ 577 do { 578 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); 579 580 if (error == 0) 581 break; 582 583 /* 584 * For forced unmount or zone shutdown, bail out, no retry. 
585 */ 586 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 587 error = EIO; 588 break; 589 } 590 591 /* do not retry for softmount */ 592 if (!(mi->mi_flags & MI_HARD)) 593 break; 594 595 /* let the caller deal with the failover case */ 596 if (FAILOVER_MOUNT(mi)) 597 break; 598 599 } while (error == ETIMEDOUT || error == ECONNRESET); 600 601 return (error); 602 } 603 604 static void 605 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) 606 { 607 if (cl->cl_auth != NULL) { 608 sec_clnt_freeh(cl->cl_auth); 609 cl->cl_auth = NULL; 610 } 611 612 /* 613 * Timestamp this cache entry so that we know when it was last 614 * used. 615 */ 616 cp->ch_freed = gethrestime_sec(); 617 618 /* 619 * Add the free client handle to the front of the list. 620 * This way, the list will be sorted in youngest to oldest 621 * order. 622 */ 623 mutex_enter(&nfscl->nfscl_chtable_lock); 624 cp->ch_list = cp->ch_head->ch_list; 625 cp->ch_head->ch_list = cp; 626 mutex_exit(&nfscl->nfscl_chtable_lock); 627 } 628 629 void 630 clfree(CLIENT *cl, struct chtab *cp) 631 { 632 struct nfs_clnt *nfscl; 633 634 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 635 ASSERT(nfscl != NULL); 636 637 clfree_impl(cl, cp, nfscl); 638 } 639 640 #define CL_HOLDTIME 60 /* time to hold client handles */ 641 642 static void 643 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) 644 { 645 struct chhead *ch; 646 struct chtab *cp; /* list of objects that can be reclaimed */ 647 struct chtab *cpe; 648 struct chtab *cpl; 649 struct chtab **cpp; 650 #ifdef DEBUG 651 int n = 0; 652 #endif 653 654 /* 655 * Need to reclaim some memory, so step through the cache 656 * looking through the lists for entries which can be freed. 657 */ 658 cp = NULL; 659 660 mutex_enter(&nfscl->nfscl_chtable_lock); 661 662 /* 663 * Here we step through each non-NULL quadruple and start to 664 * construct the reclaim list pointed to by cp. Note that 665 * cp will contain all eligible chtab entries. 
When this traversal 666 * completes, chtab entries from the last quadruple will be at the 667 * front of cp and entries from previously inspected quadruples have 668 * been appended to the rear of cp. 669 */ 670 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { 671 if (ch->ch_list == NULL) 672 continue; 673 /* 674 * Search each list for entries older then 675 * cl_holdtime seconds. The lists are maintained 676 * in youngest to oldest order so that when the 677 * first entry is found which is old enough, then 678 * all of the rest of the entries on the list will 679 * be old enough as well. 680 */ 681 cpl = ch->ch_list; 682 cpp = &ch->ch_list; 683 while (cpl != NULL && 684 cpl->ch_freed + cl_holdtime > gethrestime_sec()) { 685 cpp = &cpl->ch_list; 686 cpl = cpl->ch_list; 687 } 688 if (cpl != NULL) { 689 *cpp = NULL; 690 if (cp != NULL) { 691 cpe = cpl; 692 while (cpe->ch_list != NULL) 693 cpe = cpe->ch_list; 694 cpe->ch_list = cp; 695 } 696 cp = cpl; 697 } 698 } 699 700 mutex_exit(&nfscl->nfscl_chtable_lock); 701 702 /* 703 * If cp is empty, then there is nothing to reclaim here. 704 */ 705 if (cp == NULL) 706 return; 707 708 /* 709 * Step through the list of entries to free, destroying each client 710 * handle and kmem_free'ing the memory for each entry. 711 */ 712 while (cp != NULL) { 713 #ifdef DEBUG 714 n++; 715 #endif 716 CLNT_DESTROY(cp->ch_client); 717 cpl = cp->ch_list; 718 kmem_cache_free(chtab_cache, cp); 719 cp = cpl; 720 } 721 722 #ifdef DEBUG 723 /* 724 * Update clalloc so that nfsstat shows the current number 725 * of allocated client handles. 726 */ 727 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); 728 #endif 729 } 730 731 /* ARGSUSED */ 732 static void 733 clreclaim(void *all) 734 { 735 struct nfs_clnt *nfscl; 736 737 #ifdef DEBUG 738 clstat_debug.clreclaim.value.ui64++; 739 #endif 740 /* 741 * The system is low on memory; go through and try to reclaim some from 742 * every zone on the system. 
743 */ 744 mutex_enter(&nfs_clnt_list_lock); 745 nfscl = list_head(&nfs_clnt_list); 746 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) 747 clreclaim_zone(nfscl, CL_HOLDTIME); 748 mutex_exit(&nfs_clnt_list_lock); 749 } 750 751 /* 752 * Minimum time-out values indexed by call type 753 * These units are in "eights" of a second to avoid multiplies 754 */ 755 static unsigned int minimum_timeo[] = { 756 6, 7, 10 757 }; 758 759 /* 760 * Back off for retransmission timeout, MAXTIMO is in hz of a sec 761 */ 762 #define MAXTIMO (20*hz) 763 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) 764 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) 765 766 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ 767 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ 768 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ 769 770 /* 771 * Function called when rfscall notices that we have been 772 * re-transmitting, or when we get a response without retransmissions. 773 * Return 1 if the transfer size was adjusted down - 0 if no change. 
774 */ 775 static int 776 nfs_feedback(int flag, int which, mntinfo_t *mi) 777 { 778 int kind; 779 int r = 0; 780 781 mutex_enter(&mi->mi_lock); 782 if (flag == FEEDBACK_REXMIT1) { 783 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && 784 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) 785 goto done; 786 if (mi->mi_curread > MIN_NFS_TSIZE) { 787 mi->mi_curread /= 2; 788 if (mi->mi_curread < MIN_NFS_TSIZE) 789 mi->mi_curread = MIN_NFS_TSIZE; 790 r = 1; 791 } 792 793 if (mi->mi_curwrite > MIN_NFS_TSIZE) { 794 mi->mi_curwrite /= 2; 795 if (mi->mi_curwrite < MIN_NFS_TSIZE) 796 mi->mi_curwrite = MIN_NFS_TSIZE; 797 r = 1; 798 } 799 } else if (flag == FEEDBACK_OK) { 800 kind = mi->mi_timer_type[which]; 801 if (kind == 0 || 802 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) 803 goto done; 804 if (kind == 1) { 805 if (mi->mi_curread >= mi->mi_tsize) 806 goto done; 807 mi->mi_curread += MIN_NFS_TSIZE; 808 if (mi->mi_curread > mi->mi_tsize/2) 809 mi->mi_curread = mi->mi_tsize; 810 } else if (kind == 2) { 811 if (mi->mi_curwrite >= mi->mi_stsize) 812 goto done; 813 mi->mi_curwrite += MIN_NFS_TSIZE; 814 if (mi->mi_curwrite > mi->mi_stsize/2) 815 mi->mi_curwrite = mi->mi_stsize; 816 } 817 } 818 done: 819 mutex_exit(&mi->mi_lock); 820 return (r); 821 } 822 823 #ifdef DEBUG 824 static int rfs2call_hits = 0; 825 static int rfs2call_misses = 0; 826 #endif 827 828 int 829 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 830 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 831 enum nfsstat *statusp, int flags, failinfo_t *fi) 832 { 833 int rpcerror; 834 enum clnt_stat rpc_status; 835 836 ASSERT(statusp != NULL); 837 838 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 839 cr, douprintf, &rpc_status, flags, fi); 840 if (!rpcerror) { 841 /* 842 * See crnetadjust() for comments. 
843 */ 844 if (*statusp == NFSERR_ACCES && 845 (cr = crnetadjust(cr)) != NULL) { 846 #ifdef DEBUG 847 rfs2call_hits++; 848 #endif 849 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, 850 resp, cr, douprintf, NULL, flags, fi); 851 crfree(cr); 852 #ifdef DEBUG 853 if (*statusp == NFSERR_ACCES) 854 rfs2call_misses++; 855 #endif 856 } 857 } else if (rpc_status == RPC_PROCUNAVAIL) { 858 *statusp = NFSERR_OPNOTSUPP; 859 rpcerror = 0; 860 } 861 862 return (rpcerror); 863 } 864 865 #define NFS3_JUKEBOX_DELAY 10 * hz 866 867 static clock_t nfs3_jukebox_delay = 0; 868 869 #ifdef DEBUG 870 static int rfs3call_hits = 0; 871 static int rfs3call_misses = 0; 872 #endif 873 874 int 875 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 876 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 877 nfsstat3 *statusp, int flags, failinfo_t *fi) 878 { 879 int rpcerror; 880 int user_informed; 881 882 user_informed = 0; 883 do { 884 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, 885 cr, douprintf, NULL, flags, fi); 886 if (!rpcerror) { 887 cred_t *crr; 888 if (*statusp == NFS3ERR_JUKEBOX) { 889 if (ttoproc(curthread) == &p0) { 890 rpcerror = EAGAIN; 891 break; 892 } 893 if (!user_informed) { 894 user_informed = 1; 895 uprintf( 896 "file temporarily unavailable on the server, retrying...\n"); 897 } 898 delay(nfs3_jukebox_delay); 899 } 900 /* 901 * See crnetadjust() for comments. 
902 */ 903 else if (*statusp == NFS3ERR_ACCES && 904 (crr = crnetadjust(cr)) != NULL) { 905 #ifdef DEBUG 906 rfs3call_hits++; 907 #endif 908 rpcerror = rfscall(mi, which, xdrargs, argsp, 909 xdrres, resp, crr, douprintf, 910 NULL, flags, fi); 911 912 crfree(crr); 913 #ifdef DEBUG 914 if (*statusp == NFS3ERR_ACCES) 915 rfs3call_misses++; 916 #endif 917 } 918 } 919 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 920 921 return (rpcerror); 922 } 923 924 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) 925 #define INC_READERS(mi) { \ 926 mi->mi_readers++; \ 927 } 928 #define DEC_READERS(mi) { \ 929 mi->mi_readers--; \ 930 if (mi->mi_readers == 0) \ 931 cv_broadcast(&mi->mi_failover_cv); \ 932 } 933 934 static int 935 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 936 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 937 enum clnt_stat *rpc_status, int flags, failinfo_t *fi) 938 { 939 CLIENT *client; 940 struct chtab *ch; 941 cred_t *cr = icr; 942 enum clnt_stat status; 943 struct rpc_err rpcerr, rpcerr_tmp; 944 struct timeval wait; 945 int timeo; /* in units of hz */ 946 int my_rsize, my_wsize; 947 bool_t tryagain; 948 bool_t cred_cloned = FALSE; 949 k_sigset_t smask; 950 servinfo_t *svp; 951 struct nfs_clnt *nfscl; 952 zoneid_t zoneid = getzoneid(); 953 char *msg; 954 #ifdef DEBUG 955 char *bufp; 956 #endif 957 958 959 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 960 "rfscall_start:which %d mi %p", which, mi); 961 962 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 963 ASSERT(nfscl != NULL); 964 965 nfscl->nfscl_stat.calls.value.ui64++; 966 mi->mi_reqs[which].value.ui64++; 967 968 rpcerr.re_status = RPC_SUCCESS; 969 970 /* 971 * In case of forced unmount or zone shutdown, return EIO. 
972 */ 973 974 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 975 rpcerr.re_status = RPC_FAILED; 976 rpcerr.re_errno = EIO; 977 return (rpcerr.re_errno); 978 } 979 980 /* 981 * Remember the transfer sizes in case 982 * nfs_feedback changes them underneath us. 983 */ 984 my_rsize = mi->mi_curread; 985 my_wsize = mi->mi_curwrite; 986 987 /* 988 * NFS client failover support 989 * 990 * If this rnode is not in sync with the current server (VALID_FH), 991 * we'd like to do a remap to get in sync. We can be interrupted 992 * in failover_remap(), and if so we'll bail. Otherwise, we'll 993 * use the best info we have to try the RPC. Part of that is 994 * unconditionally updating the filehandle copy kept for V3. 995 * 996 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 997 * rw_enter(); we're trying to keep the current server from being 998 * changed on us until we're done with the remapping and have a 999 * matching client handle. We don't want to sending a filehandle 1000 * to the wrong host. 1001 */ 1002 failoverretry: 1003 if (FAILOVER_MOUNT(mi)) { 1004 mutex_enter(&mi->mi_lock); 1005 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1006 if (failover_wait(mi)) { 1007 mutex_exit(&mi->mi_lock); 1008 return (EINTR); 1009 } 1010 } 1011 INC_READERS(mi); 1012 mutex_exit(&mi->mi_lock); 1013 if (fi) { 1014 if (!VALID_FH(fi) && 1015 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1016 int remaperr; 1017 1018 svp = mi->mi_curr_serv; 1019 remaperr = failover_remap(fi); 1020 if (remaperr != 0) { 1021 #ifdef DEBUG 1022 if (remaperr != EINTR) 1023 nfs_cmn_err(remaperr, CE_WARN, 1024 "rfscall couldn't failover: %m"); 1025 #endif 1026 mutex_enter(&mi->mi_lock); 1027 DEC_READERS(mi); 1028 mutex_exit(&mi->mi_lock); 1029 /* 1030 * If failover_remap returns ETIMEDOUT 1031 * and the filesystem is hard mounted 1032 * we have to retry the call with a new 1033 * server. 
1034 */ 1035 if ((mi->mi_flags & MI_HARD) && 1036 IS_RECOVERABLE_ERROR(remaperr)) { 1037 if (svp == mi->mi_curr_serv) 1038 failover_newserver(mi); 1039 rpcerr.re_status = RPC_SUCCESS; 1040 goto failoverretry; 1041 } 1042 rpcerr.re_errno = remaperr; 1043 return (remaperr); 1044 } 1045 } 1046 if (fi->fhp && fi->copyproc) 1047 (*fi->copyproc)(fi->fhp, fi->vp); 1048 } 1049 } 1050 1051 /* For TSOL, use a new cred which has net_mac_aware flag */ 1052 if (!cred_cloned && is_system_labeled()) { 1053 cred_cloned = TRUE; 1054 cr = crdup(icr); 1055 (void) setpflags(NET_MAC_AWARE, 1, cr); 1056 } 1057 1058 /* 1059 * clget() calls clnt_tli_kinit() which clears the xid, so we 1060 * are guaranteed to reprocess the retry as a new request. 1061 */ 1062 svp = mi->mi_curr_serv; 1063 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); 1064 1065 if (FAILOVER_MOUNT(mi)) { 1066 mutex_enter(&mi->mi_lock); 1067 DEC_READERS(mi); 1068 mutex_exit(&mi->mi_lock); 1069 1070 if ((rpcerr.re_errno == ETIMEDOUT || 1071 rpcerr.re_errno == ECONNRESET) && 1072 failover_safe(fi)) { 1073 if (svp == mi->mi_curr_serv) 1074 failover_newserver(mi); 1075 goto failoverretry; 1076 } 1077 } 1078 if (rpcerr.re_errno != 0) 1079 return (rpcerr.re_errno); 1080 1081 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1082 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1083 timeo = (mi->mi_timeo * hz) / 10; 1084 } else { 1085 mutex_enter(&mi->mi_lock); 1086 timeo = CLNT_SETTIMERS(client, 1087 &(mi->mi_timers[mi->mi_timer_type[which]]), 1088 &(mi->mi_timers[NFS_CALLTYPES]), 1089 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, 1090 (void (*)())NULL, (caddr_t)mi, 0); 1091 mutex_exit(&mi->mi_lock); 1092 } 1093 1094 /* 1095 * If hard mounted fs, retry call forever unless hard error occurs. 
1096 */ 1097 do { 1098 tryagain = FALSE; 1099 1100 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1101 status = RPC_FAILED; 1102 rpcerr.re_status = RPC_FAILED; 1103 rpcerr.re_errno = EIO; 1104 break; 1105 } 1106 1107 TICK_TO_TIMEVAL(timeo, &wait); 1108 1109 /* 1110 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1111 * and SIGTERM. (Preserving the existing masks). 1112 * Mask out SIGINT if mount option nointr is specified. 1113 */ 1114 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1115 if (!(mi->mi_flags & MI_INT)) 1116 client->cl_nosignal = TRUE; 1117 1118 /* 1119 * If there is a current signal, then don't bother 1120 * even trying to send out the request because we 1121 * won't be able to block waiting for the response. 1122 * Simply assume RPC_INTR and get on with it. 1123 */ 1124 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1125 status = RPC_INTR; 1126 else { 1127 status = CLNT_CALL(client, which, xdrargs, argsp, 1128 xdrres, resp, wait); 1129 } 1130 1131 if (!(mi->mi_flags & MI_INT)) 1132 client->cl_nosignal = FALSE; 1133 /* 1134 * restore original signal mask 1135 */ 1136 sigunintr(&smask); 1137 1138 switch (status) { 1139 case RPC_SUCCESS: 1140 if ((mi->mi_flags & MI_DYNAMIC) && 1141 mi->mi_timer_type[which] != 0 && 1142 (mi->mi_curread != my_rsize || 1143 mi->mi_curwrite != my_wsize)) 1144 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1145 break; 1146 1147 case RPC_INTR: 1148 /* 1149 * There is no way to recover from this error, 1150 * even if mount option nointr is specified. 1151 * SIGKILL, for example, cannot be blocked. 1152 */ 1153 rpcerr.re_status = RPC_INTR; 1154 rpcerr.re_errno = EINTR; 1155 break; 1156 1157 case RPC_UDERROR: 1158 /* 1159 * If the NFS server is local (vold) and 1160 * it goes away then we get RPC_UDERROR. 1161 * This is a retryable error, so we would 1162 * loop, so check to see if the specific 1163 * error was ECONNRESET, indicating that 1164 * target did not exist at all. 
If so, 1165 * return with RPC_PROGUNAVAIL and 1166 * ECONNRESET to indicate why. 1167 */ 1168 CLNT_GETERR(client, &rpcerr); 1169 if (rpcerr.re_errno == ECONNRESET) { 1170 rpcerr.re_status = RPC_PROGUNAVAIL; 1171 rpcerr.re_errno = ECONNRESET; 1172 break; 1173 } 1174 /*FALLTHROUGH*/ 1175 1176 default: /* probably RPC_TIMEDOUT */ 1177 if (IS_UNRECOVERABLE_RPC(status)) 1178 break; 1179 1180 /* 1181 * increment server not responding count 1182 */ 1183 mutex_enter(&mi->mi_lock); 1184 mi->mi_noresponse++; 1185 mutex_exit(&mi->mi_lock); 1186 #ifdef DEBUG 1187 nfscl->nfscl_stat.noresponse.value.ui64++; 1188 #endif 1189 1190 if (!(mi->mi_flags & MI_HARD)) { 1191 if (!(mi->mi_flags & MI_SEMISOFT) || 1192 (mi->mi_ss_call_type[which] == 0)) 1193 break; 1194 } 1195 1196 /* 1197 * The call is in progress (over COTS). 1198 * Try the CLNT_CALL again, but don't 1199 * print a noisy error message. 1200 */ 1201 if (status == RPC_INPROGRESS) { 1202 tryagain = TRUE; 1203 break; 1204 } 1205 1206 if (flags & RFSCALL_SOFT) 1207 break; 1208 1209 /* 1210 * On zone shutdown, just move on. 1211 */ 1212 if (zone_status_get(curproc->p_zone) >= 1213 ZONE_IS_SHUTTING_DOWN) { 1214 rpcerr.re_status = RPC_FAILED; 1215 rpcerr.re_errno = EIO; 1216 break; 1217 } 1218 1219 /* 1220 * NFS client failover support 1221 * 1222 * If the current server just failed us, we'll 1223 * start the process of finding a new server. 1224 * After that, we can just retry. 
1225 */ 1226 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1227 if (svp == mi->mi_curr_serv) 1228 failover_newserver(mi); 1229 clfree_impl(client, ch, nfscl); 1230 goto failoverretry; 1231 } 1232 1233 tryagain = TRUE; 1234 timeo = backoff(timeo); 1235 1236 CLNT_GETERR(client, &rpcerr_tmp); 1237 if ((status == RPC_CANTSEND) && 1238 (rpcerr_tmp.re_errno == ENOBUFS)) 1239 msg = SRV_QFULL_MSG; 1240 else 1241 msg = SRV_NOTRESP_MSG; 1242 1243 mutex_enter(&mi->mi_lock); 1244 if (!(mi->mi_flags & MI_PRINTED)) { 1245 mi->mi_flags |= MI_PRINTED; 1246 mutex_exit(&mi->mi_lock); 1247 #ifdef DEBUG 1248 zprintf(zoneid, msg, mi->mi_vers, 1249 svp->sv_hostname); 1250 #else 1251 zprintf(zoneid, msg, svp->sv_hostname); 1252 #endif 1253 } else 1254 mutex_exit(&mi->mi_lock); 1255 if (*douprintf && nfs_has_ctty()) { 1256 *douprintf = 0; 1257 if (!(mi->mi_flags & MI_NOPRINT)) 1258 #ifdef DEBUG 1259 uprintf(msg, mi->mi_vers, 1260 svp->sv_hostname); 1261 #else 1262 uprintf(msg, svp->sv_hostname); 1263 #endif 1264 } 1265 1266 /* 1267 * If doing dynamic adjustment of transfer 1268 * size and if it's a read or write call 1269 * and if the transfer size changed while 1270 * retransmitting or if the feedback routine 1271 * changed the transfer size, 1272 * then exit rfscall so that the transfer 1273 * size can be adjusted at the vnops level. 1274 */ 1275 if ((mi->mi_flags & MI_DYNAMIC) && 1276 mi->mi_timer_type[which] != 0 && 1277 (mi->mi_curread != my_rsize || 1278 mi->mi_curwrite != my_wsize || 1279 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1280 /* 1281 * On read or write calls, return 1282 * back to the vnode ops level if 1283 * the transfer size changed. 1284 */ 1285 clfree_impl(client, ch, nfscl); 1286 if (cred_cloned) 1287 crfree(cr); 1288 return (ENFS_TRYAGAIN); 1289 } 1290 } 1291 } while (tryagain); 1292 1293 if (status != RPC_SUCCESS) { 1294 /* 1295 * Let soft mounts use the timed out message. 
1296 */ 1297 if (status == RPC_INPROGRESS) 1298 status = RPC_TIMEDOUT; 1299 nfscl->nfscl_stat.badcalls.value.ui64++; 1300 if (status != RPC_INTR) { 1301 mutex_enter(&mi->mi_lock); 1302 mi->mi_flags |= MI_DOWN; 1303 mutex_exit(&mi->mi_lock); 1304 CLNT_GETERR(client, &rpcerr); 1305 #ifdef DEBUG 1306 bufp = clnt_sperror(client, svp->sv_hostname); 1307 zprintf(zoneid, "NFS%d %s failed for %s\n", 1308 mi->mi_vers, mi->mi_rfsnames[which], bufp); 1309 if (nfs_has_ctty()) { 1310 if (!(mi->mi_flags & MI_NOPRINT)) { 1311 uprintf("NFS%d %s failed for %s\n", 1312 mi->mi_vers, mi->mi_rfsnames[which], 1313 bufp); 1314 } 1315 } 1316 kmem_free(bufp, MAXPATHLEN); 1317 #else 1318 zprintf(zoneid, 1319 "NFS %s failed for server %s: error %d (%s)\n", 1320 mi->mi_rfsnames[which], svp->sv_hostname, 1321 status, clnt_sperrno(status)); 1322 if (nfs_has_ctty()) { 1323 if (!(mi->mi_flags & MI_NOPRINT)) { 1324 uprintf( 1325 "NFS %s failed for server %s: error %d (%s)\n", 1326 mi->mi_rfsnames[which], 1327 svp->sv_hostname, status, 1328 clnt_sperrno(status)); 1329 } 1330 } 1331 #endif 1332 /* 1333 * when CLNT_CALL() fails with RPC_AUTHERROR, 1334 * re_errno is set appropriately depending on 1335 * the authentication error 1336 */ 1337 if (status == RPC_VERSMISMATCH || 1338 status == RPC_PROGVERSMISMATCH) 1339 rpcerr.re_errno = EIO; 1340 } 1341 } else { 1342 /* 1343 * Test the value of mi_down and mi_printed without 1344 * holding the mi_lock mutex. If they are both zero, 1345 * then it is okay to skip the down and printed 1346 * processing. This saves on a mutex_enter and 1347 * mutex_exit pair for a normal, successful RPC. 1348 * This was just complete overhead. 
1349 */ 1350 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1351 mutex_enter(&mi->mi_lock); 1352 mi->mi_flags &= ~MI_DOWN; 1353 if (mi->mi_flags & MI_PRINTED) { 1354 mi->mi_flags &= ~MI_PRINTED; 1355 mutex_exit(&mi->mi_lock); 1356 #ifdef DEBUG 1357 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1358 zprintf(zoneid, "NFS%d server %s ok\n", 1359 mi->mi_vers, svp->sv_hostname); 1360 #else 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1362 zprintf(zoneid, "NFS server %s ok\n", 1363 svp->sv_hostname); 1364 #endif 1365 } else 1366 mutex_exit(&mi->mi_lock); 1367 } 1368 1369 if (*douprintf == 0) { 1370 if (!(mi->mi_flags & MI_NOPRINT)) 1371 #ifdef DEBUG 1372 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1373 uprintf("NFS%d server %s ok\n", 1374 mi->mi_vers, svp->sv_hostname); 1375 #else 1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) 1377 uprintf("NFS server %s ok\n", svp->sv_hostname); 1378 #endif 1379 *douprintf = 1; 1380 } 1381 } 1382 1383 clfree_impl(client, ch, nfscl); 1384 if (cred_cloned) 1385 crfree(cr); 1386 1387 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1388 1389 if (rpc_status != NULL) 1390 *rpc_status = rpcerr.re_status; 1391 1392 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1393 rpcerr.re_errno); 1394 1395 return (rpcerr.re_errno); 1396 } 1397 1398 #ifdef DEBUG 1399 static int acl2call_hits = 0; 1400 static int acl2call_misses = 0; 1401 #endif 1402 1403 int 1404 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1405 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1406 enum nfsstat *statusp, int flags, failinfo_t *fi) 1407 { 1408 int rpcerror; 1409 1410 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1411 cr, douprintf, flags, fi); 1412 if (!rpcerror) { 1413 /* 1414 * See comments with crnetadjust(). 
1415 */ 1416 if (*statusp == NFSERR_ACCES && 1417 (cr = crnetadjust(cr)) != NULL) { 1418 #ifdef DEBUG 1419 acl2call_hits++; 1420 #endif 1421 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, 1422 resp, cr, douprintf, flags, fi); 1423 crfree(cr); 1424 #ifdef DEBUG 1425 if (*statusp == NFSERR_ACCES) 1426 acl2call_misses++; 1427 #endif 1428 } 1429 } 1430 1431 return (rpcerror); 1432 } 1433 1434 #ifdef DEBUG 1435 static int acl3call_hits = 0; 1436 static int acl3call_misses = 0; 1437 #endif 1438 1439 int 1440 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1441 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, 1442 nfsstat3 *statusp, int flags, failinfo_t *fi) 1443 { 1444 int rpcerror; 1445 int user_informed; 1446 1447 user_informed = 0; 1448 1449 do { 1450 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, 1451 cr, douprintf, flags, fi); 1452 if (!rpcerror) { 1453 cred_t *crr; 1454 if (*statusp == NFS3ERR_JUKEBOX) { 1455 if (!user_informed) { 1456 user_informed = 1; 1457 uprintf( 1458 "file temporarily unavailable on the server, retrying...\n"); 1459 } 1460 delay(nfs3_jukebox_delay); 1461 } 1462 /* 1463 * See crnetadjust() for comments. 
1464 */ 1465 else if (*statusp == NFS3ERR_ACCES && 1466 (crr = crnetadjust(cr)) != NULL) { 1467 #ifdef DEBUG 1468 acl3call_hits++; 1469 #endif 1470 rpcerror = aclcall(mi, which, xdrargs, argsp, 1471 xdrres, resp, crr, douprintf, flags, fi); 1472 1473 crfree(crr); 1474 #ifdef DEBUG 1475 if (*statusp == NFS3ERR_ACCES) 1476 acl3call_misses++; 1477 #endif 1478 } 1479 } 1480 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); 1481 1482 return (rpcerror); 1483 } 1484 1485 static int 1486 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, 1487 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, 1488 int flags, failinfo_t *fi) 1489 { 1490 CLIENT *client; 1491 struct chtab *ch; 1492 cred_t *cr = icr; 1493 bool_t cred_cloned = FALSE; 1494 enum clnt_stat status; 1495 struct rpc_err rpcerr; 1496 struct timeval wait; 1497 int timeo; /* in units of hz */ 1498 #if 0 /* notyet */ 1499 int my_rsize, my_wsize; 1500 #endif 1501 bool_t tryagain; 1502 k_sigset_t smask; 1503 servinfo_t *svp; 1504 struct nfs_clnt *nfscl; 1505 zoneid_t zoneid = getzoneid(); 1506 #ifdef DEBUG 1507 char *bufp; 1508 #endif 1509 1510 #if 0 /* notyet */ 1511 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, 1512 "rfscall_start:which %d mi %p", which, mi); 1513 #endif 1514 1515 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); 1516 ASSERT(nfscl != NULL); 1517 1518 nfscl->nfscl_stat.calls.value.ui64++; 1519 mi->mi_aclreqs[which].value.ui64++; 1520 1521 rpcerr.re_status = RPC_SUCCESS; 1522 1523 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1524 rpcerr.re_status = RPC_FAILED; 1525 rpcerr.re_errno = EIO; 1526 return (rpcerr.re_errno); 1527 } 1528 1529 #if 0 /* notyet */ 1530 /* 1531 * Remember the transfer sizes in case 1532 * nfs_feedback changes them underneath us. 
1533 */ 1534 my_rsize = mi->mi_curread; 1535 my_wsize = mi->mi_curwrite; 1536 #endif 1537 1538 /* 1539 * NFS client failover support 1540 * 1541 * If this rnode is not in sync with the current server (VALID_FH), 1542 * we'd like to do a remap to get in sync. We can be interrupted 1543 * in failover_remap(), and if so we'll bail. Otherwise, we'll 1544 * use the best info we have to try the RPC. Part of that is 1545 * unconditionally updating the filehandle copy kept for V3. 1546 * 1547 * Locking: INC_READERS/DEC_READERS is a poor man's interrruptible 1548 * rw_enter(); we're trying to keep the current server from being 1549 * changed on us until we're done with the remapping and have a 1550 * matching client handle. We don't want to sending a filehandle 1551 * to the wrong host. 1552 */ 1553 failoverretry: 1554 if (FAILOVER_MOUNT(mi)) { 1555 mutex_enter(&mi->mi_lock); 1556 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1557 if (failover_wait(mi)) { 1558 mutex_exit(&mi->mi_lock); 1559 return (EINTR); 1560 } 1561 } 1562 INC_READERS(mi); 1563 mutex_exit(&mi->mi_lock); 1564 if (fi) { 1565 if (!VALID_FH(fi) && 1566 !(flags & RFSCALL_SOFT) && failover_safe(fi)) { 1567 int remaperr; 1568 1569 svp = mi->mi_curr_serv; 1570 remaperr = failover_remap(fi); 1571 if (remaperr != 0) { 1572 #ifdef DEBUG 1573 if (remaperr != EINTR) 1574 nfs_cmn_err(remaperr, CE_WARN, 1575 "aclcall couldn't failover: %m"); 1576 #endif 1577 mutex_enter(&mi->mi_lock); 1578 DEC_READERS(mi); 1579 mutex_exit(&mi->mi_lock); 1580 1581 /* 1582 * If failover_remap returns ETIMEDOUT 1583 * and the filesystem is hard mounted 1584 * we have to retry the call with a new 1585 * server. 
1586 */ 1587 if ((mi->mi_flags & MI_HARD) && 1588 IS_RECOVERABLE_ERROR(remaperr)) { 1589 if (svp == mi->mi_curr_serv) 1590 failover_newserver(mi); 1591 rpcerr.re_status = RPC_SUCCESS; 1592 goto failoverretry; 1593 } 1594 return (remaperr); 1595 } 1596 } 1597 if (fi->fhp && fi->copyproc) 1598 (*fi->copyproc)(fi->fhp, fi->vp); 1599 } 1600 } 1601 1602 /* For TSOL, use a new cred which has net_mac_aware flag */ 1603 if (!cred_cloned && is_system_labeled()) { 1604 cred_cloned = TRUE; 1605 cr = crdup(icr); 1606 (void) setpflags(NET_MAC_AWARE, 1, cr); 1607 } 1608 1609 /* 1610 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we 1611 * are guaranteed to reprocess the retry as a new request. 1612 */ 1613 svp = mi->mi_curr_serv; 1614 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); 1615 if (FAILOVER_MOUNT(mi)) { 1616 mutex_enter(&mi->mi_lock); 1617 DEC_READERS(mi); 1618 mutex_exit(&mi->mi_lock); 1619 1620 if ((rpcerr.re_errno == ETIMEDOUT || 1621 rpcerr.re_errno == ECONNRESET) && 1622 failover_safe(fi)) { 1623 if (svp == mi->mi_curr_serv) 1624 failover_newserver(mi); 1625 goto failoverretry; 1626 } 1627 } 1628 if (rpcerr.re_errno != 0) { 1629 if (cred_cloned) 1630 crfree(cr); 1631 return (rpcerr.re_errno); 1632 } 1633 1634 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || 1635 svp->sv_knconf->knc_semantics == NC_TPI_COTS) { 1636 timeo = (mi->mi_timeo * hz) / 10; 1637 } else { 1638 mutex_enter(&mi->mi_lock); 1639 timeo = CLNT_SETTIMERS(client, 1640 &(mi->mi_timers[mi->mi_acl_timer_type[which]]), 1641 &(mi->mi_timers[NFS_CALLTYPES]), 1642 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, 1643 (void (*)()) 0, (caddr_t)mi, 0); 1644 mutex_exit(&mi->mi_lock); 1645 } 1646 1647 /* 1648 * If hard mounted fs, retry call forever unless hard error occurs. 
1649 */ 1650 do { 1651 tryagain = FALSE; 1652 1653 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { 1654 status = RPC_FAILED; 1655 rpcerr.re_status = RPC_FAILED; 1656 rpcerr.re_errno = EIO; 1657 break; 1658 } 1659 1660 TICK_TO_TIMEVAL(timeo, &wait); 1661 1662 /* 1663 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT 1664 * and SIGTERM. (Preserving the existing masks). 1665 * Mask out SIGINT if mount option nointr is specified. 1666 */ 1667 sigintr(&smask, (int)mi->mi_flags & MI_INT); 1668 if (!(mi->mi_flags & MI_INT)) 1669 client->cl_nosignal = TRUE; 1670 1671 /* 1672 * If there is a current signal, then don't bother 1673 * even trying to send out the request because we 1674 * won't be able to block waiting for the response. 1675 * Simply assume RPC_INTR and get on with it. 1676 */ 1677 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) 1678 status = RPC_INTR; 1679 else { 1680 status = CLNT_CALL(client, which, xdrargs, argsp, 1681 xdrres, resp, wait); 1682 } 1683 1684 if (!(mi->mi_flags & MI_INT)) 1685 client->cl_nosignal = FALSE; 1686 /* 1687 * restore original signal mask 1688 */ 1689 sigunintr(&smask); 1690 1691 switch (status) { 1692 case RPC_SUCCESS: 1693 #if 0 /* notyet */ 1694 if ((mi->mi_flags & MI_DYNAMIC) && 1695 mi->mi_timer_type[which] != 0 && 1696 (mi->mi_curread != my_rsize || 1697 mi->mi_curwrite != my_wsize)) 1698 (void) nfs_feedback(FEEDBACK_OK, which, mi); 1699 #endif 1700 break; 1701 1702 /* 1703 * Unfortunately, there are servers in the world which 1704 * are not coded correctly. They are not prepared to 1705 * handle RPC requests to the NFS port which are not 1706 * NFS requests. Thus, they may try to process the 1707 * NFS_ACL request as if it were an NFS request. This 1708 * does not work. Generally, an error will be generated 1709 * on the client because it will not be able to decode 1710 * the response from the server. However, it seems 1711 * possible that the server may not be able to decode 1712 * the arguments. 
Thus, the criteria for deciding 1713 * whether the server supports NFS_ACL or not is whether 1714 * the following RPC errors are returned from CLNT_CALL. 1715 */ 1716 case RPC_CANTDECODERES: 1717 case RPC_PROGUNAVAIL: 1718 case RPC_CANTDECODEARGS: 1719 case RPC_PROGVERSMISMATCH: 1720 mutex_enter(&mi->mi_lock); 1721 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); 1722 mutex_exit(&mi->mi_lock); 1723 break; 1724 1725 /* 1726 * If the server supports NFS_ACL but not the new ops 1727 * for extended attributes, make sure we don't retry. 1728 */ 1729 case RPC_PROCUNAVAIL: 1730 mutex_enter(&mi->mi_lock); 1731 mi->mi_flags &= ~MI_EXTATTR; 1732 mutex_exit(&mi->mi_lock); 1733 break; 1734 1735 case RPC_INTR: 1736 /* 1737 * There is no way to recover from this error, 1738 * even if mount option nointr is specified. 1739 * SIGKILL, for example, cannot be blocked. 1740 */ 1741 rpcerr.re_status = RPC_INTR; 1742 rpcerr.re_errno = EINTR; 1743 break; 1744 1745 case RPC_UDERROR: 1746 /* 1747 * If the NFS server is local (vold) and 1748 * it goes away then we get RPC_UDERROR. 1749 * This is a retryable error, so we would 1750 * loop, so check to see if the specific 1751 * error was ECONNRESET, indicating that 1752 * target did not exist at all. If so, 1753 * return with RPC_PROGUNAVAIL and 1754 * ECONNRESET to indicate why. 
1755 */ 1756 CLNT_GETERR(client, &rpcerr); 1757 if (rpcerr.re_errno == ECONNRESET) { 1758 rpcerr.re_status = RPC_PROGUNAVAIL; 1759 rpcerr.re_errno = ECONNRESET; 1760 break; 1761 } 1762 /*FALLTHROUGH*/ 1763 1764 default: /* probably RPC_TIMEDOUT */ 1765 if (IS_UNRECOVERABLE_RPC(status)) 1766 break; 1767 1768 /* 1769 * increment server not responding count 1770 */ 1771 mutex_enter(&mi->mi_lock); 1772 mi->mi_noresponse++; 1773 mutex_exit(&mi->mi_lock); 1774 #ifdef DEBUG 1775 nfscl->nfscl_stat.noresponse.value.ui64++; 1776 #endif 1777 1778 if (!(mi->mi_flags & MI_HARD)) { 1779 if (!(mi->mi_flags & MI_SEMISOFT) || 1780 (mi->mi_acl_ss_call_type[which] == 0)) 1781 break; 1782 } 1783 1784 /* 1785 * The call is in progress (over COTS). 1786 * Try the CLNT_CALL again, but don't 1787 * print a noisy error message. 1788 */ 1789 if (status == RPC_INPROGRESS) { 1790 tryagain = TRUE; 1791 break; 1792 } 1793 1794 if (flags & RFSCALL_SOFT) 1795 break; 1796 1797 /* 1798 * On zone shutdown, just move on. 1799 */ 1800 if (zone_status_get(curproc->p_zone) >= 1801 ZONE_IS_SHUTTING_DOWN) { 1802 rpcerr.re_status = RPC_FAILED; 1803 rpcerr.re_errno = EIO; 1804 break; 1805 } 1806 1807 /* 1808 * NFS client failover support 1809 * 1810 * If the current server just failed us, we'll 1811 * start the process of finding a new server. 1812 * After that, we can just retry. 
1813 */ 1814 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { 1815 if (svp == mi->mi_curr_serv) 1816 failover_newserver(mi); 1817 clfree_impl(client, ch, nfscl); 1818 goto failoverretry; 1819 } 1820 1821 tryagain = TRUE; 1822 timeo = backoff(timeo); 1823 mutex_enter(&mi->mi_lock); 1824 if (!(mi->mi_flags & MI_PRINTED)) { 1825 mi->mi_flags |= MI_PRINTED; 1826 mutex_exit(&mi->mi_lock); 1827 #ifdef DEBUG 1828 zprintf(zoneid, 1829 "NFS_ACL%d server %s not responding still trying\n", 1830 mi->mi_vers, svp->sv_hostname); 1831 #else 1832 zprintf(zoneid, 1833 "NFS server %s not responding still trying\n", 1834 svp->sv_hostname); 1835 #endif 1836 } else 1837 mutex_exit(&mi->mi_lock); 1838 if (*douprintf && nfs_has_ctty()) { 1839 *douprintf = 0; 1840 if (!(mi->mi_flags & MI_NOPRINT)) 1841 #ifdef DEBUG 1842 uprintf( 1843 "NFS_ACL%d server %s not responding still trying\n", 1844 mi->mi_vers, svp->sv_hostname); 1845 #else 1846 uprintf( 1847 "NFS server %s not responding still trying\n", 1848 svp->sv_hostname); 1849 #endif 1850 } 1851 1852 #if 0 /* notyet */ 1853 /* 1854 * If doing dynamic adjustment of transfer 1855 * size and if it's a read or write call 1856 * and if the transfer size changed while 1857 * retransmitting or if the feedback routine 1858 * changed the transfer size, 1859 * then exit rfscall so that the transfer 1860 * size can be adjusted at the vnops level. 1861 */ 1862 if ((mi->mi_flags & MI_DYNAMIC) && 1863 mi->mi_acl_timer_type[which] != 0 && 1864 (mi->mi_curread != my_rsize || 1865 mi->mi_curwrite != my_wsize || 1866 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { 1867 /* 1868 * On read or write calls, return 1869 * back to the vnode ops level if 1870 * the transfer size changed. 1871 */ 1872 clfree_impl(client, ch, nfscl); 1873 if (cred_cloned) 1874 crfree(cr); 1875 return (ENFS_TRYAGAIN); 1876 } 1877 #endif 1878 } 1879 } while (tryagain); 1880 1881 if (status != RPC_SUCCESS) { 1882 /* 1883 * Let soft mounts use the timed out message. 
1884 */ 1885 if (status == RPC_INPROGRESS) 1886 status = RPC_TIMEDOUT; 1887 nfscl->nfscl_stat.badcalls.value.ui64++; 1888 if (status == RPC_CANTDECODERES || 1889 status == RPC_PROGUNAVAIL || 1890 status == RPC_PROCUNAVAIL || 1891 status == RPC_CANTDECODEARGS || 1892 status == RPC_PROGVERSMISMATCH) 1893 CLNT_GETERR(client, &rpcerr); 1894 else if (status != RPC_INTR) { 1895 mutex_enter(&mi->mi_lock); 1896 mi->mi_flags |= MI_DOWN; 1897 mutex_exit(&mi->mi_lock); 1898 CLNT_GETERR(client, &rpcerr); 1899 #ifdef DEBUG 1900 bufp = clnt_sperror(client, svp->sv_hostname); 1901 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", 1902 mi->mi_vers, mi->mi_aclnames[which], bufp); 1903 if (nfs_has_ctty()) { 1904 if (!(mi->mi_flags & MI_NOPRINT)) { 1905 uprintf("NFS_ACL%d %s failed for %s\n", 1906 mi->mi_vers, mi->mi_aclnames[which], 1907 bufp); 1908 } 1909 } 1910 kmem_free(bufp, MAXPATHLEN); 1911 #else 1912 zprintf(zoneid, 1913 "NFS %s failed for server %s: error %d (%s)\n", 1914 mi->mi_aclnames[which], svp->sv_hostname, 1915 status, clnt_sperrno(status)); 1916 if (nfs_has_ctty()) { 1917 if (!(mi->mi_flags & MI_NOPRINT)) 1918 uprintf( 1919 "NFS %s failed for server %s: error %d (%s)\n", 1920 mi->mi_aclnames[which], 1921 svp->sv_hostname, status, 1922 clnt_sperrno(status)); 1923 } 1924 #endif 1925 /* 1926 * when CLNT_CALL() fails with RPC_AUTHERROR, 1927 * re_errno is set appropriately depending on 1928 * the authentication error 1929 */ 1930 if (status == RPC_VERSMISMATCH || 1931 status == RPC_PROGVERSMISMATCH) 1932 rpcerr.re_errno = EIO; 1933 } 1934 } else { 1935 /* 1936 * Test the value of mi_down and mi_printed without 1937 * holding the mi_lock mutex. If they are both zero, 1938 * then it is okay to skip the down and printed 1939 * processing. This saves on a mutex_enter and 1940 * mutex_exit pair for a normal, successful RPC. 1941 * This was just complete overhead. 
1942 */ 1943 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { 1944 mutex_enter(&mi->mi_lock); 1945 mi->mi_flags &= ~MI_DOWN; 1946 if (mi->mi_flags & MI_PRINTED) { 1947 mi->mi_flags &= ~MI_PRINTED; 1948 mutex_exit(&mi->mi_lock); 1949 #ifdef DEBUG 1950 zprintf(zoneid, "NFS_ACL%d server %s ok\n", 1951 mi->mi_vers, svp->sv_hostname); 1952 #else 1953 zprintf(zoneid, "NFS server %s ok\n", 1954 svp->sv_hostname); 1955 #endif 1956 } else 1957 mutex_exit(&mi->mi_lock); 1958 } 1959 1960 if (*douprintf == 0) { 1961 if (!(mi->mi_flags & MI_NOPRINT)) 1962 #ifdef DEBUG 1963 uprintf("NFS_ACL%d server %s ok\n", 1964 mi->mi_vers, svp->sv_hostname); 1965 #else 1966 uprintf("NFS server %s ok\n", svp->sv_hostname); 1967 #endif 1968 *douprintf = 1; 1969 } 1970 } 1971 1972 clfree_impl(client, ch, nfscl); 1973 if (cred_cloned) 1974 crfree(cr); 1975 1976 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); 1977 1978 #if 0 /* notyet */ 1979 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", 1980 rpcerr.re_errno); 1981 #endif 1982 1983 return (rpcerr.re_errno); 1984 } 1985 1986 int 1987 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) 1988 { 1989 uint_t mask = vap->va_mask; 1990 1991 if (!(mask & AT_MODE)) 1992 sa->sa_mode = (uint32_t)-1; 1993 else 1994 sa->sa_mode = vap->va_mode; 1995 if (!(mask & AT_UID)) 1996 sa->sa_uid = (uint32_t)-1; 1997 else 1998 sa->sa_uid = (uint32_t)vap->va_uid; 1999 if (!(mask & AT_GID)) 2000 sa->sa_gid = (uint32_t)-1; 2001 else 2002 sa->sa_gid = (uint32_t)vap->va_gid; 2003 if (!(mask & AT_SIZE)) 2004 sa->sa_size = (uint32_t)-1; 2005 else 2006 sa->sa_size = (uint32_t)vap->va_size; 2007 if (!(mask & AT_ATIME)) 2008 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; 2009 else { 2010 /* check time validity */ 2011 if (! 
NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2012 return (EOVERFLOW); 2013 } 2014 sa->sa_atime.tv_sec = vap->va_atime.tv_sec; 2015 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2016 } 2017 if (!(mask & AT_MTIME)) 2018 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; 2019 else { 2020 /* check time validity */ 2021 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2022 return (EOVERFLOW); 2023 } 2024 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; 2025 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2026 } 2027 return (0); 2028 } 2029 2030 int 2031 vattr_to_sattr3(struct vattr *vap, sattr3 *sa) 2032 { 2033 uint_t mask = vap->va_mask; 2034 2035 if (!(mask & AT_MODE)) 2036 sa->mode.set_it = FALSE; 2037 else { 2038 sa->mode.set_it = TRUE; 2039 sa->mode.mode = (mode3)vap->va_mode; 2040 } 2041 if (!(mask & AT_UID)) 2042 sa->uid.set_it = FALSE; 2043 else { 2044 sa->uid.set_it = TRUE; 2045 sa->uid.uid = (uid3)vap->va_uid; 2046 } 2047 if (!(mask & AT_GID)) 2048 sa->gid.set_it = FALSE; 2049 else { 2050 sa->gid.set_it = TRUE; 2051 sa->gid.gid = (gid3)vap->va_gid; 2052 } 2053 if (!(mask & AT_SIZE)) 2054 sa->size.set_it = FALSE; 2055 else { 2056 sa->size.set_it = TRUE; 2057 sa->size.size = (size3)vap->va_size; 2058 } 2059 if (!(mask & AT_ATIME)) 2060 sa->atime.set_it = DONT_CHANGE; 2061 else { 2062 /* check time validity */ 2063 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { 2064 return (EOVERFLOW); 2065 } 2066 sa->atime.set_it = SET_TO_CLIENT_TIME; 2067 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; 2068 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; 2069 } 2070 if (!(mask & AT_MTIME)) 2071 sa->mtime.set_it = DONT_CHANGE; 2072 else { 2073 /* check time validity */ 2074 if (! 
NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { 2075 return (EOVERFLOW); 2076 } 2077 sa->mtime.set_it = SET_TO_CLIENT_TIME; 2078 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; 2079 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; 2080 } 2081 return (0); 2082 } 2083 2084 void 2085 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) 2086 { 2087 2088 da->da_fhandle = VTOFH(dvp); 2089 da->da_name = nm; 2090 da->da_flags = 0; 2091 } 2092 2093 void 2094 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) 2095 { 2096 2097 da->dirp = VTOFH3(dvp); 2098 da->name = nm; 2099 } 2100 2101 int 2102 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) 2103 { 2104 int error; 2105 rnode_t *rp; 2106 struct vattr va; 2107 2108 va.va_mask = AT_MODE | AT_GID; 2109 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2110 if (error) 2111 return (error); 2112 2113 /* 2114 * To determine the expected group-id of the created file: 2115 * 1) If the filesystem was not mounted with the Old-BSD-compatible 2116 * GRPID option, and the directory's set-gid bit is clear, 2117 * then use the process's gid. 2118 * 2) Otherwise, set the group-id to the gid of the parent directory. 2119 */ 2120 rp = VTOR(dvp); 2121 mutex_enter(&rp->r_statelock); 2122 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) 2123 *gidp = crgetgid(cr); 2124 else 2125 *gidp = va.va_gid; 2126 mutex_exit(&rp->r_statelock); 2127 return (0); 2128 } 2129 2130 int 2131 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) 2132 { 2133 int error; 2134 struct vattr va; 2135 2136 va.va_mask = AT_MODE; 2137 error = VOP_GETATTR(dvp, &va, 0, cr, NULL); 2138 if (error) 2139 return (error); 2140 2141 /* 2142 * Modify the expected mode (om) so that the set-gid bit matches 2143 * that of the parent directory (dvp). 
2144 */ 2145 if (va.va_mode & VSGID) 2146 *omp |= VSGID; 2147 else 2148 *omp &= ~VSGID; 2149 return (0); 2150 } 2151 2152 void 2153 nfs_setswaplike(vnode_t *vp, vattr_t *vap) 2154 { 2155 2156 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { 2157 if (!(vp->v_flag & VSWAPLIKE)) { 2158 mutex_enter(&vp->v_lock); 2159 vp->v_flag |= VSWAPLIKE; 2160 mutex_exit(&vp->v_lock); 2161 } 2162 } else { 2163 if (vp->v_flag & VSWAPLIKE) { 2164 mutex_enter(&vp->v_lock); 2165 vp->v_flag &= ~VSWAPLIKE; 2166 mutex_exit(&vp->v_lock); 2167 } 2168 } 2169 } 2170 2171 /* 2172 * Free the resources associated with an rnode. 2173 */ 2174 static void 2175 rinactive(rnode_t *rp, cred_t *cr) 2176 { 2177 vnode_t *vp; 2178 cred_t *cred; 2179 char *contents; 2180 int size; 2181 vsecattr_t *vsp; 2182 int error; 2183 nfs3_pathconf_info *info; 2184 2185 /* 2186 * Before freeing anything, wait until all asynchronous 2187 * activity is done on this rnode. This will allow all 2188 * asynchronous read ahead and write behind i/o's to 2189 * finish. 2190 */ 2191 mutex_enter(&rp->r_statelock); 2192 while (rp->r_count > 0) 2193 cv_wait(&rp->r_cv, &rp->r_statelock); 2194 mutex_exit(&rp->r_statelock); 2195 2196 /* 2197 * Flush and invalidate all pages associated with the vnode. 2198 */ 2199 vp = RTOV(rp); 2200 if (vn_has_cached_data(vp)) { 2201 ASSERT(vp->v_type != VCHR); 2202 if ((rp->r_flags & RDIRTY) && !rp->r_error) { 2203 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL); 2204 if (error && (error == ENOSPC || error == EDQUOT)) { 2205 mutex_enter(&rp->r_statelock); 2206 if (!rp->r_error) 2207 rp->r_error = error; 2208 mutex_exit(&rp->r_statelock); 2209 } 2210 } 2211 nfs_invalidate_pages(vp, (u_offset_t)0, cr); 2212 } 2213 2214 /* 2215 * Free any held credentials and caches which may be associated 2216 * with this rnode. 
2217 */ 2218 mutex_enter(&rp->r_statelock); 2219 cred = rp->r_cred; 2220 rp->r_cred = NULL; 2221 contents = rp->r_symlink.contents; 2222 size = rp->r_symlink.size; 2223 rp->r_symlink.contents = NULL; 2224 vsp = rp->r_secattr; 2225 rp->r_secattr = NULL; 2226 info = rp->r_pathconf; 2227 rp->r_pathconf = NULL; 2228 mutex_exit(&rp->r_statelock); 2229 2230 /* 2231 * Free the held credential. 2232 */ 2233 if (cred != NULL) 2234 crfree(cred); 2235 2236 /* 2237 * Free the access cache entries. 2238 */ 2239 (void) nfs_access_purge_rp(rp); 2240 2241 /* 2242 * Free the readdir cache entries. 2243 */ 2244 if (HAVE_RDDIR_CACHE(rp)) 2245 nfs_purge_rddir_cache(vp); 2246 2247 /* 2248 * Free the symbolic link cache. 2249 */ 2250 if (contents != NULL) { 2251 2252 kmem_free((void *)contents, size); 2253 } 2254 2255 /* 2256 * Free any cached ACL. 2257 */ 2258 if (vsp != NULL) 2259 nfs_acl_free(vsp); 2260 2261 /* 2262 * Free any cached pathconf information. 2263 */ 2264 if (info != NULL) 2265 kmem_free(info, sizeof (*info)); 2266 } 2267 2268 /* 2269 * Return a vnode for the given NFS Version 2 file handle. 2270 * If no rnode exists for this fhandle, create one and put it 2271 * into the hash queues. If the rnode for this fhandle 2272 * already exists, return it. 2273 * 2274 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
2275 */ 2276 vnode_t * 2277 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, 2278 hrtime_t t, cred_t *cr, char *dnm, char *nm) 2279 { 2280 int newnode; 2281 int index; 2282 vnode_t *vp; 2283 nfs_fhandle nfh; 2284 vattr_t va; 2285 2286 nfh.fh_len = NFS_FHSIZE; 2287 bcopy(fh, nfh.fh_buf, NFS_FHSIZE); 2288 2289 index = rtablehash(&nfh); 2290 rw_enter(&rtable[index].r_lock, RW_READER); 2291 2292 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, 2293 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); 2294 2295 if (attr != NULL) { 2296 if (!newnode) { 2297 rw_exit(&rtable[index].r_lock); 2298 (void) nfs_cache_fattr(vp, attr, &va, t, cr); 2299 } else { 2300 if (attr->na_type < NFNON || attr->na_type > NFSOC) 2301 vp->v_type = VBAD; 2302 else 2303 vp->v_type = n2v_type(attr); 2304 /* 2305 * A translation here seems to be necessary 2306 * because this function can be called 2307 * with `attr' that has come from the wire, 2308 * and been operated on by vattr_to_nattr(). 2309 * See nfsrootvp()->VOP_GETTATTR()->nfsgetattr() 2310 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() 2311 * ->makenfsnode(). 2312 */ 2313 if ((attr->na_rdev & 0xffff0000) == 0) 2314 vp->v_rdev = nfsv2_expdev(attr->na_rdev); 2315 else 2316 vp->v_rdev = expldev(n2v_rdev(attr)); 2317 nfs_attrcache(vp, attr, t); 2318 rw_exit(&rtable[index].r_lock); 2319 } 2320 } else { 2321 if (newnode) { 2322 PURGE_ATTRCACHE(vp); 2323 } 2324 rw_exit(&rtable[index].r_lock); 2325 } 2326 2327 return (vp); 2328 } 2329 2330 /* 2331 * Return a vnode for the given NFS Version 3 file handle. 2332 * If no rnode exists for this fhandle, create one and put it 2333 * into the hash queues. If the rnode for this fhandle 2334 * already exists, return it. 2335 * 2336 * Note: make_rnode() may upgrade the hash bucket lock to exclusive. 
2337 */ 2338 vnode_t * 2339 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, 2340 cred_t *cr, char *dnm, char *nm) 2341 { 2342 int newnode; 2343 int index; 2344 vnode_t *vp; 2345 2346 index = rtablehash((nfs_fhandle *)fh); 2347 rw_enter(&rtable[index].r_lock, RW_READER); 2348 2349 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2350 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2351 dnm, nm); 2352 2353 if (vap == NULL) { 2354 if (newnode) { 2355 PURGE_ATTRCACHE(vp); 2356 } 2357 rw_exit(&rtable[index].r_lock); 2358 return (vp); 2359 } 2360 2361 if (!newnode) { 2362 rw_exit(&rtable[index].r_lock); 2363 nfs_attr_cache(vp, vap, t, cr); 2364 } else { 2365 rnode_t *rp = VTOR(vp); 2366 2367 vp->v_type = vap->va_type; 2368 vp->v_rdev = vap->va_rdev; 2369 2370 mutex_enter(&rp->r_statelock); 2371 if (rp->r_mtime <= t) 2372 nfs_attrcache_va(vp, vap); 2373 mutex_exit(&rp->r_statelock); 2374 rw_exit(&rtable[index].r_lock); 2375 } 2376 2377 return (vp); 2378 } 2379 2380 vnode_t * 2381 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, 2382 cred_t *cr, char *dnm, char *nm) 2383 { 2384 int newnode; 2385 int index; 2386 vnode_t *vp; 2387 vattr_t va; 2388 2389 index = rtablehash((nfs_fhandle *)fh); 2390 rw_enter(&rtable[index].r_lock, RW_READER); 2391 2392 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, 2393 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, 2394 dnm, nm); 2395 2396 if (attr == NULL) { 2397 if (newnode) { 2398 PURGE_ATTRCACHE(vp); 2399 } 2400 rw_exit(&rtable[index].r_lock); 2401 return (vp); 2402 } 2403 2404 if (!newnode) { 2405 rw_exit(&rtable[index].r_lock); 2406 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); 2407 } else { 2408 if (attr->type < NF3REG || attr->type > NF3FIFO) 2409 vp->v_type = VBAD; 2410 else 2411 vp->v_type = nf3_to_vt[attr->type]; 2412 vp->v_rdev = makedevice(attr->rdev.specdata1, 2413 attr->rdev.specdata2); 2414 nfs3_attrcache(vp, attr, t); 2415 
rw_exit(&rtable[index].r_lock); 2416 } 2417 2418 return (vp); 2419 } 2420 2421 /* 2422 * Read this comment before making changes to rtablehash()! 2423 * This is a hash function in which seemingly obvious and harmless 2424 * changes can cause escalations costing million dollars! 2425 * Know what you are doing. 2426 * 2427 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The 2428 * algorithm is currently detailed here: 2429 * 2430 * http://burtleburtle.net/bob/hash/doobs.html 2431 * 2432 * Of course, the above link may not be valid by the time you are reading 2433 * this, but suffice it to say that the one-at-a-time algorithm works well in 2434 * almost all cases. If you are changing the algorithm be sure to verify that 2435 * the hash algorithm still provides even distribution in all cases and with 2436 * any server returning filehandles in whatever order (sequential or random). 2437 */ 2438 static int 2439 rtablehash(nfs_fhandle *fh) 2440 { 2441 ulong_t hash, len, i; 2442 char *key; 2443 2444 key = fh->fh_buf; 2445 len = (ulong_t)fh->fh_len; 2446 for (hash = 0, i = 0; i < len; i++) { 2447 hash += key[i]; 2448 hash += (hash << 10); 2449 hash ^= (hash >> 6); 2450 } 2451 hash += (hash << 3); 2452 hash ^= (hash >> 11); 2453 hash += (hash << 15); 2454 return (hash & rtablemask); 2455 } 2456 2457 static vnode_t * 2458 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, 2459 struct vnodeops *vops, 2460 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), 2461 int (*compar)(const void *, const void *), 2462 int *newnode, cred_t *cr, char *dnm, char *nm) 2463 { 2464 rnode_t *rp; 2465 rnode_t *trp; 2466 vnode_t *vp; 2467 mntinfo_t *mi; 2468 2469 ASSERT(RW_READ_HELD(&rhtp->r_lock)); 2470 2471 mi = VFTOMI(vfsp); 2472 start: 2473 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { 2474 vp = RTOV(rp); 2475 nfs_set_vroot(vp); 2476 *newnode = 0; 2477 return (vp); 2478 } 2479 rw_exit(&rhtp->r_lock); 2480 2481 mutex_enter(&rpfreelist_lock); 
2482 if (rpfreelist != NULL && rnew >= nrnode) { 2483 rp = rpfreelist; 2484 rp_rmfree(rp); 2485 mutex_exit(&rpfreelist_lock); 2486 2487 vp = RTOV(rp); 2488 2489 if (rp->r_flags & RHASHED) { 2490 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2491 mutex_enter(&vp->v_lock); 2492 if (vp->v_count > 1) { 2493 vp->v_count--; 2494 mutex_exit(&vp->v_lock); 2495 rw_exit(&rp->r_hashq->r_lock); 2496 rw_enter(&rhtp->r_lock, RW_READER); 2497 goto start; 2498 } 2499 mutex_exit(&vp->v_lock); 2500 rp_rmhash_locked(rp); 2501 rw_exit(&rp->r_hashq->r_lock); 2502 } 2503 2504 rinactive(rp, cr); 2505 2506 mutex_enter(&vp->v_lock); 2507 if (vp->v_count > 1) { 2508 vp->v_count--; 2509 mutex_exit(&vp->v_lock); 2510 rw_enter(&rhtp->r_lock, RW_READER); 2511 goto start; 2512 } 2513 mutex_exit(&vp->v_lock); 2514 vn_invalid(vp); 2515 /* 2516 * destroy old locks before bzero'ing and 2517 * recreating the locks below. 2518 */ 2519 nfs_rw_destroy(&rp->r_rwlock); 2520 nfs_rw_destroy(&rp->r_lkserlock); 2521 mutex_destroy(&rp->r_statelock); 2522 cv_destroy(&rp->r_cv); 2523 cv_destroy(&rp->r_commit.c_cv); 2524 nfs_free_r_path(rp); 2525 avl_destroy(&rp->r_dir); 2526 /* 2527 * Make sure that if rnode is recycled then 2528 * VFS count is decremented properly before 2529 * reuse. 
2530 */ 2531 VFS_RELE(vp->v_vfsp); 2532 vn_reinit(vp); 2533 } else { 2534 vnode_t *new_vp; 2535 2536 mutex_exit(&rpfreelist_lock); 2537 2538 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); 2539 new_vp = vn_alloc(KM_SLEEP); 2540 2541 atomic_inc_ulong((ulong_t *)&rnew); 2542 #ifdef DEBUG 2543 clstat_debug.nrnode.value.ui64++; 2544 #endif 2545 vp = new_vp; 2546 } 2547 2548 bzero(rp, sizeof (*rp)); 2549 rp->r_vnode = vp; 2550 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); 2551 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); 2552 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); 2553 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); 2554 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); 2555 rp->r_fh.fh_len = fh->fh_len; 2556 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); 2557 rp->r_server = mi->mi_curr_serv; 2558 if (FAILOVER_MOUNT(mi)) { 2559 /* 2560 * If replicated servers, stash pathnames 2561 */ 2562 if (dnm != NULL && nm != NULL) { 2563 char *s, *p; 2564 uint_t len; 2565 2566 len = (uint_t)(strlen(dnm) + strlen(nm) + 2); 2567 rp->r_path = kmem_alloc(len, KM_SLEEP); 2568 #ifdef DEBUG 2569 clstat_debug.rpath.value.ui64 += len; 2570 #endif 2571 s = rp->r_path; 2572 for (p = dnm; *p; p++) 2573 *s++ = *p; 2574 *s++ = '/'; 2575 for (p = nm; *p; p++) 2576 *s++ = *p; 2577 *s = '\0'; 2578 } else { 2579 /* special case for root */ 2580 rp->r_path = kmem_alloc(2, KM_SLEEP); 2581 #ifdef DEBUG 2582 clstat_debug.rpath.value.ui64 += 2; 2583 #endif 2584 *rp->r_path = '.'; 2585 *(rp->r_path + 1) = '\0'; 2586 } 2587 } 2588 VFS_HOLD(vfsp); 2589 rp->r_putapage = putapage; 2590 rp->r_hashq = rhtp; 2591 rp->r_flags = RREADDIRPLUS; 2592 avl_create(&rp->r_dir, compar, sizeof (rddir_cache), 2593 offsetof(rddir_cache, tree)); 2594 vn_setops(vp, vops); 2595 vp->v_data = (caddr_t)rp; 2596 vp->v_vfsp = vfsp; 2597 vp->v_type = VNON; 2598 vp->v_flag |= VMODSORT; 2599 nfs_set_vroot(vp); 2600 2601 /* 2602 * There is a race condition if someone else 2603 * alloc's the rnode while no 
locks are held, so we 2604 * check again and recover if found. 2605 */ 2606 rw_enter(&rhtp->r_lock, RW_WRITER); 2607 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { 2608 vp = RTOV(trp); 2609 nfs_set_vroot(vp); 2610 *newnode = 0; 2611 rw_exit(&rhtp->r_lock); 2612 rp_addfree(rp, cr); 2613 rw_enter(&rhtp->r_lock, RW_READER); 2614 return (vp); 2615 } 2616 rp_addhash(rp); 2617 *newnode = 1; 2618 return (vp); 2619 } 2620 2621 /* 2622 * Callback function to check if the page should be marked as 2623 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT. 2624 */ 2625 int 2626 nfs_setmod_check(page_t *pp) 2627 { 2628 if (pp->p_fsdata != C_NOCOMMIT) { 2629 pp->p_fsdata = C_NOCOMMIT; 2630 return (1); 2631 } 2632 return (0); 2633 } 2634 2635 static void 2636 nfs_set_vroot(vnode_t *vp) 2637 { 2638 rnode_t *rp; 2639 nfs_fhandle *rootfh; 2640 2641 rp = VTOR(vp); 2642 rootfh = &rp->r_server->sv_fhandle; 2643 if (rootfh->fh_len == rp->r_fh.fh_len && 2644 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { 2645 if (!(vp->v_flag & VROOT)) { 2646 mutex_enter(&vp->v_lock); 2647 vp->v_flag |= VROOT; 2648 mutex_exit(&vp->v_lock); 2649 } 2650 } 2651 } 2652 2653 static void 2654 nfs_free_r_path(rnode_t *rp) 2655 { 2656 char *path; 2657 size_t len; 2658 2659 path = rp->r_path; 2660 if (path) { 2661 rp->r_path = NULL; 2662 len = strlen(path) + 1; 2663 kmem_free(path, len); 2664 #ifdef DEBUG 2665 clstat_debug.rpath.value.ui64 -= len; 2666 #endif 2667 } 2668 } 2669 2670 /* 2671 * Put an rnode on the free list. 2672 * 2673 * Rnodes which were allocated above and beyond the normal limit 2674 * are immediately freed. 
2675 */ 2676 void 2677 rp_addfree(rnode_t *rp, cred_t *cr) 2678 { 2679 vnode_t *vp; 2680 struct vfs *vfsp; 2681 2682 vp = RTOV(rp); 2683 ASSERT(vp->v_count >= 1); 2684 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 2685 2686 /* 2687 * If we have too many rnodes allocated and there are no 2688 * references to this rnode, or if the rnode is no longer 2689 * accessible by it does not reside in the hash queues, 2690 * or if an i/o error occurred while writing to the file, 2691 * then just free it instead of putting it on the rnode 2692 * freelist. 2693 */ 2694 vfsp = vp->v_vfsp; 2695 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || 2696 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { 2697 if (rp->r_flags & RHASHED) { 2698 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2699 mutex_enter(&vp->v_lock); 2700 if (vp->v_count > 1) { 2701 vp->v_count--; 2702 mutex_exit(&vp->v_lock); 2703 rw_exit(&rp->r_hashq->r_lock); 2704 return; 2705 } 2706 mutex_exit(&vp->v_lock); 2707 rp_rmhash_locked(rp); 2708 rw_exit(&rp->r_hashq->r_lock); 2709 } 2710 2711 rinactive(rp, cr); 2712 2713 /* 2714 * Recheck the vnode reference count. We need to 2715 * make sure that another reference has not been 2716 * acquired while we were not holding v_lock. The 2717 * rnode is not in the rnode hash queues, so the 2718 * only way for a reference to have been acquired 2719 * is for a VOP_PUTPAGE because the rnode was marked 2720 * with RDIRTY or for a modified page. This 2721 * reference may have been acquired before our call 2722 * to rinactive. The i/o may have been completed, 2723 * thus allowing rinactive to complete, but the 2724 * reference to the vnode may not have been released 2725 * yet. In any case, the rnode can not be destroyed 2726 * until the other references to this vnode have been 2727 * released. The other references will take care of 2728 * either destroying the rnode or placing it on the 2729 * rnode freelist. 
If there are no other references, 2730 * then the rnode may be safely destroyed. 2731 */ 2732 mutex_enter(&vp->v_lock); 2733 if (vp->v_count > 1) { 2734 vp->v_count--; 2735 mutex_exit(&vp->v_lock); 2736 return; 2737 } 2738 mutex_exit(&vp->v_lock); 2739 2740 destroy_rnode(rp); 2741 return; 2742 } 2743 2744 /* 2745 * Lock the hash queue and then recheck the reference count 2746 * to ensure that no other threads have acquired a reference 2747 * to indicate that the rnode should not be placed on the 2748 * freelist. If another reference has been acquired, then 2749 * just release this one and let the other thread complete 2750 * the processing of adding this rnode to the freelist. 2751 */ 2752 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2753 2754 mutex_enter(&vp->v_lock); 2755 if (vp->v_count > 1) { 2756 vp->v_count--; 2757 mutex_exit(&vp->v_lock); 2758 rw_exit(&rp->r_hashq->r_lock); 2759 return; 2760 } 2761 mutex_exit(&vp->v_lock); 2762 2763 /* 2764 * If there is no cached data or metadata for this file, then 2765 * put the rnode on the front of the freelist so that it will 2766 * be reused before other rnodes which may have cached data or 2767 * metadata associated with them. 2768 */ 2769 mutex_enter(&rpfreelist_lock); 2770 if (rpfreelist == NULL) { 2771 rp->r_freef = rp; 2772 rp->r_freeb = rp; 2773 rpfreelist = rp; 2774 } else { 2775 rp->r_freef = rpfreelist; 2776 rp->r_freeb = rpfreelist->r_freeb; 2777 rpfreelist->r_freeb->r_freef = rp; 2778 rpfreelist->r_freeb = rp; 2779 if (!vn_has_cached_data(vp) && 2780 !HAVE_RDDIR_CACHE(rp) && 2781 rp->r_symlink.contents == NULL && 2782 rp->r_secattr == NULL && 2783 rp->r_pathconf == NULL) 2784 rpfreelist = rp; 2785 } 2786 mutex_exit(&rpfreelist_lock); 2787 2788 rw_exit(&rp->r_hashq->r_lock); 2789 } 2790 2791 /* 2792 * Remove an rnode from the free list. 2793 * 2794 * The caller must be holding rpfreelist_lock and the rnode 2795 * must be on the freelist. 
2796 */ 2797 static void 2798 rp_rmfree(rnode_t *rp) 2799 { 2800 2801 ASSERT(MUTEX_HELD(&rpfreelist_lock)); 2802 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); 2803 2804 if (rp == rpfreelist) { 2805 rpfreelist = rp->r_freef; 2806 if (rp == rpfreelist) 2807 rpfreelist = NULL; 2808 } 2809 2810 rp->r_freeb->r_freef = rp->r_freef; 2811 rp->r_freef->r_freeb = rp->r_freeb; 2812 2813 rp->r_freef = rp->r_freeb = NULL; 2814 } 2815 2816 /* 2817 * Put a rnode in the hash table. 2818 * 2819 * The caller must be holding the exclusive hash queue lock. 2820 */ 2821 static void 2822 rp_addhash(rnode_t *rp) 2823 { 2824 2825 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2826 ASSERT(!(rp->r_flags & RHASHED)); 2827 2828 rp->r_hashf = rp->r_hashq->r_hashf; 2829 rp->r_hashq->r_hashf = rp; 2830 rp->r_hashb = (rnode_t *)rp->r_hashq; 2831 rp->r_hashf->r_hashb = rp; 2832 2833 mutex_enter(&rp->r_statelock); 2834 rp->r_flags |= RHASHED; 2835 mutex_exit(&rp->r_statelock); 2836 } 2837 2838 /* 2839 * Remove a rnode from the hash table. 2840 * 2841 * The caller must be holding the hash queue lock. 2842 */ 2843 static void 2844 rp_rmhash_locked(rnode_t *rp) 2845 { 2846 2847 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); 2848 ASSERT(rp->r_flags & RHASHED); 2849 2850 rp->r_hashb->r_hashf = rp->r_hashf; 2851 rp->r_hashf->r_hashb = rp->r_hashb; 2852 2853 mutex_enter(&rp->r_statelock); 2854 rp->r_flags &= ~RHASHED; 2855 mutex_exit(&rp->r_statelock); 2856 } 2857 2858 /* 2859 * Remove a rnode from the hash table. 2860 * 2861 * The caller must not be holding the hash queue lock. 2862 */ 2863 void 2864 rp_rmhash(rnode_t *rp) 2865 { 2866 2867 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 2868 rp_rmhash_locked(rp); 2869 rw_exit(&rp->r_hashq->r_lock); 2870 } 2871 2872 /* 2873 * Lookup a rnode by fhandle. 2874 * 2875 * The caller must be holding the hash queue lock, either shared or exclusive. 
2876 */ 2877 static rnode_t * 2878 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp) 2879 { 2880 rnode_t *rp; 2881 vnode_t *vp; 2882 2883 ASSERT(RW_LOCK_HELD(&rhtp->r_lock)); 2884 2885 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) { 2886 vp = RTOV(rp); 2887 if (vp->v_vfsp == vfsp && 2888 rp->r_fh.fh_len == fh->fh_len && 2889 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) { 2890 /* 2891 * remove rnode from free list, if necessary. 2892 */ 2893 if (rp->r_freef != NULL) { 2894 mutex_enter(&rpfreelist_lock); 2895 /* 2896 * If the rnode is on the freelist, 2897 * then remove it and use that reference 2898 * as the new reference. Otherwise, 2899 * need to increment the reference count. 2900 */ 2901 if (rp->r_freef != NULL) { 2902 rp_rmfree(rp); 2903 mutex_exit(&rpfreelist_lock); 2904 } else { 2905 mutex_exit(&rpfreelist_lock); 2906 VN_HOLD(vp); 2907 } 2908 } else 2909 VN_HOLD(vp); 2910 return (rp); 2911 } 2912 } 2913 return (NULL); 2914 } 2915 2916 /* 2917 * Return 1 if there is a active vnode belonging to this vfs in the 2918 * rtable cache. 2919 * 2920 * Several of these checks are done without holding the usual 2921 * locks. This is safe because destroy_rtable(), rp_addfree(), 2922 * etc. will redo the necessary checks before actually destroying 2923 * any rnodes. 
2924 */ 2925 int 2926 check_rtable(struct vfs *vfsp) 2927 { 2928 int index; 2929 rnode_t *rp; 2930 vnode_t *vp; 2931 2932 for (index = 0; index < rtablesize; index++) { 2933 rw_enter(&rtable[index].r_lock, RW_READER); 2934 for (rp = rtable[index].r_hashf; 2935 rp != (rnode_t *)(&rtable[index]); 2936 rp = rp->r_hashf) { 2937 vp = RTOV(rp); 2938 if (vp->v_vfsp == vfsp) { 2939 if (rp->r_freef == NULL || 2940 (vn_has_cached_data(vp) && 2941 (rp->r_flags & RDIRTY)) || 2942 rp->r_count > 0) { 2943 rw_exit(&rtable[index].r_lock); 2944 return (1); 2945 } 2946 } 2947 } 2948 rw_exit(&rtable[index].r_lock); 2949 } 2950 return (0); 2951 } 2952 2953 /* 2954 * Destroy inactive vnodes from the hash queues which belong to this 2955 * vfs. It is essential that we destroy all inactive vnodes during a 2956 * forced unmount as well as during a normal unmount. 2957 */ 2958 void 2959 destroy_rtable(struct vfs *vfsp, cred_t *cr) 2960 { 2961 int index; 2962 rnode_t *rp; 2963 rnode_t *rlist; 2964 rnode_t *r_hashf; 2965 vnode_t *vp; 2966 2967 rlist = NULL; 2968 2969 for (index = 0; index < rtablesize; index++) { 2970 rw_enter(&rtable[index].r_lock, RW_WRITER); 2971 for (rp = rtable[index].r_hashf; 2972 rp != (rnode_t *)(&rtable[index]); 2973 rp = r_hashf) { 2974 /* save the hash pointer before destroying */ 2975 r_hashf = rp->r_hashf; 2976 vp = RTOV(rp); 2977 if (vp->v_vfsp == vfsp) { 2978 mutex_enter(&rpfreelist_lock); 2979 if (rp->r_freef != NULL) { 2980 rp_rmfree(rp); 2981 mutex_exit(&rpfreelist_lock); 2982 rp_rmhash_locked(rp); 2983 rp->r_hashf = rlist; 2984 rlist = rp; 2985 } else 2986 mutex_exit(&rpfreelist_lock); 2987 } 2988 } 2989 rw_exit(&rtable[index].r_lock); 2990 } 2991 2992 for (rp = rlist; rp != NULL; rp = rlist) { 2993 rlist = rp->r_hashf; 2994 /* 2995 * This call to rp_addfree will end up destroying the 2996 * rnode, but in a safe way with the appropriate set 2997 * of checks done. 
2998 */ 2999 rp_addfree(rp, cr); 3000 } 3001 3002 } 3003 3004 /* 3005 * This routine destroys all the resources associated with the rnode 3006 * and then the rnode itself. 3007 */ 3008 static void 3009 destroy_rnode(rnode_t *rp) 3010 { 3011 vnode_t *vp; 3012 vfs_t *vfsp; 3013 3014 vp = RTOV(rp); 3015 vfsp = vp->v_vfsp; 3016 3017 ASSERT(vp->v_count == 1); 3018 ASSERT(rp->r_count == 0); 3019 ASSERT(rp->r_lmpl == NULL); 3020 ASSERT(rp->r_mapcnt == 0); 3021 ASSERT(!(rp->r_flags & RHASHED)); 3022 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); 3023 atomic_dec_ulong((ulong_t *)&rnew); 3024 #ifdef DEBUG 3025 clstat_debug.nrnode.value.ui64--; 3026 #endif 3027 nfs_rw_destroy(&rp->r_rwlock); 3028 nfs_rw_destroy(&rp->r_lkserlock); 3029 mutex_destroy(&rp->r_statelock); 3030 cv_destroy(&rp->r_cv); 3031 cv_destroy(&rp->r_commit.c_cv); 3032 if (rp->r_flags & RDELMAPLIST) 3033 list_destroy(&rp->r_indelmap); 3034 nfs_free_r_path(rp); 3035 avl_destroy(&rp->r_dir); 3036 vn_invalid(vp); 3037 vn_free(vp); 3038 kmem_cache_free(rnode_cache, rp); 3039 VFS_RELE(vfsp); 3040 } 3041 3042 /* 3043 * Flush all vnodes in this (or every) vfs. 3044 * Used by nfs_sync and by nfs_unmount. 3045 */ 3046 void 3047 rflush(struct vfs *vfsp, cred_t *cr) 3048 { 3049 int index; 3050 rnode_t *rp; 3051 vnode_t *vp, **vplist; 3052 long num, cnt; 3053 3054 /* 3055 * Check to see whether there is anything to do. 3056 */ 3057 num = rnew; 3058 if (num == 0) 3059 return; 3060 3061 /* 3062 * Allocate a slot for all currently active rnodes on the 3063 * supposition that they all may need flushing. 3064 */ 3065 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP); 3066 cnt = 0; 3067 3068 /* 3069 * Walk the hash queues looking for rnodes with page 3070 * lists associated with them. Make a list of these 3071 * files. 
3072 */ 3073 for (index = 0; index < rtablesize; index++) { 3074 rw_enter(&rtable[index].r_lock, RW_READER); 3075 for (rp = rtable[index].r_hashf; 3076 rp != (rnode_t *)(&rtable[index]); 3077 rp = rp->r_hashf) { 3078 vp = RTOV(rp); 3079 /* 3080 * Don't bother sync'ing a vp if it 3081 * is part of virtual swap device or 3082 * if VFS is read-only 3083 */ 3084 if (IS_SWAPVP(vp) || vn_is_readonly(vp)) 3085 continue; 3086 /* 3087 * If flushing all mounted file systems or 3088 * the vnode belongs to this vfs, has pages 3089 * and is marked as either dirty or mmap'd, 3090 * hold and add this vnode to the list of 3091 * vnodes to flush. 3092 */ 3093 if ((vfsp == NULL || vp->v_vfsp == vfsp) && 3094 vn_has_cached_data(vp) && 3095 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) { 3096 VN_HOLD(vp); 3097 vplist[cnt++] = vp; 3098 if (cnt == num) { 3099 rw_exit(&rtable[index].r_lock); 3100 goto toomany; 3101 } 3102 } 3103 } 3104 rw_exit(&rtable[index].r_lock); 3105 } 3106 toomany: 3107 3108 /* 3109 * Flush and release all of the files on the list. 3110 */ 3111 while (cnt-- > 0) { 3112 vp = vplist[cnt]; 3113 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL); 3114 VN_RELE(vp); 3115 } 3116 3117 /* 3118 * Free the space allocated to hold the list. 3119 */ 3120 kmem_free(vplist, num * sizeof (*vplist)); 3121 } 3122 3123 /* 3124 * This probably needs to be larger than or equal to 3125 * log2(sizeof (struct rnode)) due to the way that rnodes are 3126 * allocated. 
3127 */ 3128 #define ACACHE_SHIFT_BITS 9 3129 3130 static int 3131 acachehash(rnode_t *rp, cred_t *cr) 3132 { 3133 3134 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) & 3135 acachemask); 3136 } 3137 3138 #ifdef DEBUG 3139 static long nfs_access_cache_hits = 0; 3140 static long nfs_access_cache_misses = 0; 3141 #endif 3142 3143 nfs_access_type_t 3144 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr) 3145 { 3146 vnode_t *vp; 3147 acache_t *ap; 3148 acache_hash_t *hp; 3149 nfs_access_type_t all; 3150 3151 vp = RTOV(rp); 3152 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp)) 3153 return (NFS_ACCESS_UNKNOWN); 3154 3155 if (rp->r_acache != NULL) { 3156 hp = &acache[acachehash(rp, cr)]; 3157 rw_enter(&hp->lock, RW_READER); 3158 ap = hp->next; 3159 while (ap != (acache_t *)hp) { 3160 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3161 if ((ap->known & acc) == acc) { 3162 #ifdef DEBUG 3163 nfs_access_cache_hits++; 3164 #endif 3165 if ((ap->allowed & acc) == acc) 3166 all = NFS_ACCESS_ALLOWED; 3167 else 3168 all = NFS_ACCESS_DENIED; 3169 } else { 3170 #ifdef DEBUG 3171 nfs_access_cache_misses++; 3172 #endif 3173 all = NFS_ACCESS_UNKNOWN; 3174 } 3175 rw_exit(&hp->lock); 3176 return (all); 3177 } 3178 ap = ap->next; 3179 } 3180 rw_exit(&hp->lock); 3181 } 3182 3183 #ifdef DEBUG 3184 nfs_access_cache_misses++; 3185 #endif 3186 return (NFS_ACCESS_UNKNOWN); 3187 } 3188 3189 void 3190 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr) 3191 { 3192 acache_t *ap; 3193 acache_t *nap; 3194 acache_hash_t *hp; 3195 3196 hp = &acache[acachehash(rp, cr)]; 3197 3198 /* 3199 * Allocate now assuming that mostly an allocation will be 3200 * required. This allows the allocation to happen without 3201 * holding the hash bucket locked. 
3202 */ 3203 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP); 3204 if (nap != NULL) { 3205 nap->known = acc; 3206 nap->allowed = resacc; 3207 nap->rnode = rp; 3208 crhold(cr); 3209 nap->cred = cr; 3210 nap->hashq = hp; 3211 } 3212 3213 rw_enter(&hp->lock, RW_WRITER); 3214 3215 if (rp->r_acache != NULL) { 3216 ap = hp->next; 3217 while (ap != (acache_t *)hp) { 3218 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) { 3219 ap->known |= acc; 3220 ap->allowed &= ~acc; 3221 ap->allowed |= resacc; 3222 rw_exit(&hp->lock); 3223 if (nap != NULL) { 3224 crfree(nap->cred); 3225 kmem_cache_free(acache_cache, nap); 3226 } 3227 return; 3228 } 3229 ap = ap->next; 3230 } 3231 } 3232 3233 if (nap != NULL) { 3234 #ifdef DEBUG 3235 clstat_debug.access.value.ui64++; 3236 #endif 3237 nap->next = hp->next; 3238 hp->next = nap; 3239 nap->next->prev = nap; 3240 nap->prev = (acache_t *)hp; 3241 3242 mutex_enter(&rp->r_statelock); 3243 nap->list = rp->r_acache; 3244 rp->r_acache = nap; 3245 mutex_exit(&rp->r_statelock); 3246 } 3247 3248 rw_exit(&hp->lock); 3249 } 3250 3251 int 3252 nfs_access_purge_rp(rnode_t *rp) 3253 { 3254 acache_t *ap; 3255 acache_t *tmpap; 3256 acache_t *rplist; 3257 3258 /* 3259 * If there aren't any cached entries, then there is nothing 3260 * to free. 3261 */ 3262 if (rp->r_acache == NULL) 3263 return (0); 3264 3265 mutex_enter(&rp->r_statelock); 3266 rplist = rp->r_acache; 3267 rp->r_acache = NULL; 3268 mutex_exit(&rp->r_statelock); 3269 3270 /* 3271 * Loop through each entry in the list pointed to in the 3272 * rnode. Remove each of these entries from the hash 3273 * queue that it is on and remove it from the list in 3274 * the rnode. 
3275 */ 3276 for (ap = rplist; ap != NULL; ap = tmpap) { 3277 rw_enter(&ap->hashq->lock, RW_WRITER); 3278 ap->prev->next = ap->next; 3279 ap->next->prev = ap->prev; 3280 rw_exit(&ap->hashq->lock); 3281 3282 tmpap = ap->list; 3283 crfree(ap->cred); 3284 kmem_cache_free(acache_cache, ap); 3285 #ifdef DEBUG 3286 clstat_debug.access.value.ui64--; 3287 #endif 3288 } 3289 3290 return (1); 3291 } 3292 3293 static const char prefix[] = ".nfs"; 3294 3295 static kmutex_t newnum_lock; 3296 3297 int 3298 newnum(void) 3299 { 3300 static uint_t newnum = 0; 3301 uint_t id; 3302 3303 mutex_enter(&newnum_lock); 3304 if (newnum == 0) 3305 newnum = gethrestime_sec() & 0xffff; 3306 id = newnum++; 3307 mutex_exit(&newnum_lock); 3308 return (id); 3309 } 3310 3311 char * 3312 newname(void) 3313 { 3314 char *news; 3315 char *s; 3316 const char *p; 3317 uint_t id; 3318 3319 id = newnum(); 3320 news = kmem_alloc(MAXNAMELEN, KM_SLEEP); 3321 s = news; 3322 p = prefix; 3323 while (*p != '\0') 3324 *s++ = *p++; 3325 while (id != 0) { 3326 *s++ = "0123456789ABCDEF"[id & 0x0f]; 3327 id >>= 4; 3328 } 3329 *s = '\0'; 3330 return (news); 3331 } 3332 3333 /* 3334 * Snapshot callback for nfs:0:nfs_client as registered with the kstat 3335 * framework. 3336 */ 3337 static int 3338 cl_snapshot(kstat_t *ksp, void *buf, int rw) 3339 { 3340 ksp->ks_snaptime = gethrtime(); 3341 if (rw == KSTAT_WRITE) { 3342 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl)); 3343 #ifdef DEBUG 3344 /* 3345 * Currently only the global zone can write to kstats, but we 3346 * add the check just for paranoia. 3347 */ 3348 if (INGLOBALZONE(curproc)) 3349 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug, 3350 sizeof (clstat_debug)); 3351 #endif 3352 } else { 3353 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl)); 3354 #ifdef DEBUG 3355 /* 3356 * If we're displaying the "global" debug kstat values, we 3357 * display them as-is to all zones since in fact they apply to 3358 * the system as a whole. 
3359 */ 3360 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl), 3361 sizeof (clstat_debug)); 3362 #endif 3363 } 3364 return (0); 3365 } 3366 3367 static void * 3368 clinit_zone(zoneid_t zoneid) 3369 { 3370 kstat_t *nfs_client_kstat; 3371 struct nfs_clnt *nfscl; 3372 uint_t ndata; 3373 3374 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP); 3375 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL); 3376 nfscl->nfscl_chtable = NULL; 3377 nfscl->nfscl_zoneid = zoneid; 3378 3379 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl)); 3380 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t); 3381 #ifdef DEBUG 3382 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t); 3383 #endif 3384 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client", 3385 "misc", KSTAT_TYPE_NAMED, ndata, 3386 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) { 3387 nfs_client_kstat->ks_private = &nfscl->nfscl_stat; 3388 nfs_client_kstat->ks_snapshot = cl_snapshot; 3389 kstat_install(nfs_client_kstat); 3390 } 3391 mutex_enter(&nfs_clnt_list_lock); 3392 list_insert_head(&nfs_clnt_list, nfscl); 3393 mutex_exit(&nfs_clnt_list_lock); 3394 return (nfscl); 3395 } 3396 3397 /*ARGSUSED*/ 3398 static void 3399 clfini_zone(zoneid_t zoneid, void *arg) 3400 { 3401 struct nfs_clnt *nfscl = arg; 3402 chhead_t *chp, *next; 3403 3404 if (nfscl == NULL) 3405 return; 3406 mutex_enter(&nfs_clnt_list_lock); 3407 list_remove(&nfs_clnt_list, nfscl); 3408 mutex_exit(&nfs_clnt_list_lock); 3409 clreclaim_zone(nfscl, 0); 3410 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) { 3411 ASSERT(chp->ch_list == NULL); 3412 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1); 3413 next = chp->ch_next; 3414 kmem_free(chp, sizeof (*chp)); 3415 } 3416 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid); 3417 mutex_destroy(&nfscl->nfscl_chtable_lock); 3418 kmem_free(nfscl, sizeof (*nfscl)); 3419 } 3420 3421 /* 3422 * Called by endpnt_destructor to make sure the client 
handles are 3423 * cleaned up before the RPC endpoints. This becomes a no-op if 3424 * clfini_zone (above) is called first. This function is needed 3425 * (rather than relying on clfini_zone to clean up) because the ZSD 3426 * callbacks have no ordering mechanism, so we have no way to ensure 3427 * that clfini_zone is called before endpnt_destructor. 3428 */ 3429 void 3430 clcleanup_zone(zoneid_t zoneid) 3431 { 3432 struct nfs_clnt *nfscl; 3433 3434 mutex_enter(&nfs_clnt_list_lock); 3435 nfscl = list_head(&nfs_clnt_list); 3436 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) { 3437 if (nfscl->nfscl_zoneid == zoneid) { 3438 clreclaim_zone(nfscl, 0); 3439 break; 3440 } 3441 } 3442 mutex_exit(&nfs_clnt_list_lock); 3443 } 3444 3445 int 3446 nfs_subrinit(void) 3447 { 3448 int i; 3449 ulong_t nrnode_max; 3450 3451 /* 3452 * Allocate and initialize the rnode hash queues 3453 */ 3454 if (nrnode <= 0) 3455 nrnode = ncsize; 3456 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode)); 3457 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) { 3458 zcmn_err(GLOBAL_ZONEID, CE_NOTE, 3459 "!setting nrnode to max value of %ld", nrnode_max); 3460 nrnode = nrnode_max; 3461 } 3462 3463 rtablesize = 1 << highbit(nrnode / hashlen); 3464 rtablemask = rtablesize - 1; 3465 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP); 3466 for (i = 0; i < rtablesize; i++) { 3467 rtable[i].r_hashf = (rnode_t *)(&rtable[i]); 3468 rtable[i].r_hashb = (rnode_t *)(&rtable[i]); 3469 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL); 3470 } 3471 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t), 3472 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0); 3473 3474 /* 3475 * Allocate and initialize the access cache 3476 */ 3477 3478 /* 3479 * Initial guess is one access cache entry per rnode unless 3480 * nacache is set to a non-zero value and then it is used to 3481 * indicate a guess at the number of access cache entries. 
3482 */ 3483 if (nacache > 0) 3484 acachesize = 1 << highbit(nacache / hashlen); 3485 else 3486 acachesize = rtablesize; 3487 acachemask = acachesize - 1; 3488 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP); 3489 for (i = 0; i < acachesize; i++) { 3490 acache[i].next = (acache_t *)&acache[i]; 3491 acache[i].prev = (acache_t *)&acache[i]; 3492 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL); 3493 } 3494 acache_cache = kmem_cache_create("nfs_access_cache", 3495 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 3496 /* 3497 * Allocate and initialize the client handle cache 3498 */ 3499 chtab_cache = kmem_cache_create("client_handle_cache", 3500 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0); 3501 /* 3502 * Initialize the list of per-zone client handles (and associated data). 3503 * This needs to be done before we call zone_key_create(). 3504 */ 3505 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt), 3506 offsetof(struct nfs_clnt, nfscl_node)); 3507 /* 3508 * Initialize the zone_key for per-zone client handle lists. 
3509 */ 3510 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone); 3511 /* 3512 * Initialize the various mutexes and reader/writer locks 3513 */ 3514 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL); 3515 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL); 3516 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); 3517 3518 /* 3519 * Assign unique major number for all nfs mounts 3520 */ 3521 if ((nfs_major = getudev()) == -1) { 3522 zcmn_err(GLOBAL_ZONEID, CE_WARN, 3523 "nfs: init: can't get unique device number"); 3524 nfs_major = 0; 3525 } 3526 nfs_minor = 0; 3527 3528 if (nfs3_jukebox_delay == 0) 3529 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY; 3530 3531 return (0); 3532 } 3533 3534 void 3535 nfs_subrfini(void) 3536 { 3537 int i; 3538 3539 /* 3540 * Deallocate the rnode hash queues 3541 */ 3542 kmem_cache_destroy(rnode_cache); 3543 3544 for (i = 0; i < rtablesize; i++) 3545 rw_destroy(&rtable[i].r_lock); 3546 kmem_free(rtable, rtablesize * sizeof (*rtable)); 3547 3548 /* 3549 * Deallocated the access cache 3550 */ 3551 kmem_cache_destroy(acache_cache); 3552 3553 for (i = 0; i < acachesize; i++) 3554 rw_destroy(&acache[i].lock); 3555 kmem_free(acache, acachesize * sizeof (*acache)); 3556 3557 /* 3558 * Deallocate the client handle cache 3559 */ 3560 kmem_cache_destroy(chtab_cache); 3561 3562 /* 3563 * Destroy the various mutexes and reader/writer locks 3564 */ 3565 mutex_destroy(&rpfreelist_lock); 3566 mutex_destroy(&newnum_lock); 3567 mutex_destroy(&nfs_minor_lock); 3568 (void) zone_key_delete(nfsclnt_zone_key); 3569 } 3570 3571 enum nfsstat 3572 puterrno(int error) 3573 { 3574 3575 switch (error) { 3576 case EOPNOTSUPP: 3577 return (NFSERR_OPNOTSUPP); 3578 case ENAMETOOLONG: 3579 return (NFSERR_NAMETOOLONG); 3580 case ENOTEMPTY: 3581 return (NFSERR_NOTEMPTY); 3582 case EDQUOT: 3583 return (NFSERR_DQUOT); 3584 case ESTALE: 3585 return (NFSERR_STALE); 3586 case EREMOTE: 3587 return (NFSERR_REMOTE); 3588 case ENOSYS: 3589 return 
(NFSERR_OPNOTSUPP);
	case EOVERFLOW:
		return (NFSERR_INVAL);
	default:
		return ((enum nfsstat)error);
	}
	/* NOTREACHED */
}

/*
 * Map an NFS Version 2 status to a local errno value.  Only the
 * statuses listed below are translated; any other status is returned
 * unchanged, cast to int.
 */
int
geterrno(enum nfsstat status)
{

	switch (status) {
	case NFSERR_OPNOTSUPP:
		return (EOPNOTSUPP);
	case NFSERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFSERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFSERR_DQUOT:
		return (EDQUOT);
	case NFSERR_STALE:
		return (ESTALE);
	case NFSERR_REMOTE:
		return (EREMOTE);
	case NFSERR_WFLUSH:
		return (EIO);
	default:
		return ((int)status);
	}
	/* NOTREACHED */
}

/*
 * Map a local errno value to an NFS Version 3 status.
 *
 * The DEBUG build lists every expected errno explicitly and logs a
 * warning for anything else; the non-DEBUG build translates only a
 * short list of values and passes every other value through by cast.
 */
enum nfsstat3
puterrno3(int error)
{

#ifdef DEBUG
	switch (error) {
	case 0:
		return (NFS3_OK);
	case EPERM:
		return (NFS3ERR_PERM);
	case ENOENT:
		return (NFS3ERR_NOENT);
	case EIO:
		return (NFS3ERR_IO);
	case ENXIO:
		return (NFS3ERR_NXIO);
	case EACCES:
		return (NFS3ERR_ACCES);
	case EEXIST:
		return (NFS3ERR_EXIST);
	case EXDEV:
		return (NFS3ERR_XDEV);
	case ENODEV:
		return (NFS3ERR_NODEV);
	case ENOTDIR:
		return (NFS3ERR_NOTDIR);
	case EISDIR:
		return (NFS3ERR_ISDIR);
	case EINVAL:
		return (NFS3ERR_INVAL);
	case EFBIG:
		return (NFS3ERR_FBIG);
	case ENOSPC:
		return (NFS3ERR_NOSPC);
	case EROFS:
		return (NFS3ERR_ROFS);
	case EMLINK:
		return (NFS3ERR_MLINK);
	case ENAMETOOLONG:
		return (NFS3ERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFS3ERR_NOTEMPTY);
	case EDQUOT:
		return (NFS3ERR_DQUOT);
	case ESTALE:
		return (NFS3ERR_STALE);
	case EREMOTE:
		return (NFS3ERR_REMOTE);
	case ENOSYS:
	case EOPNOTSUPP:
		return (NFS3ERR_NOTSUPP);
	case EOVERFLOW:
		return (NFS3ERR_INVAL);
	default:
		/* Unexpected errno: report it, then pass it through. */
		zcmn_err(getzoneid(), CE_WARN,
		    "puterrno3: got error %d", error);
		return ((enum nfsstat3)error);
	}
#else
	switch (error) {
	case ENAMETOOLONG:
		return (NFS3ERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFS3ERR_NOTEMPTY);
	case EDQUOT:
		return (NFS3ERR_DQUOT);
	case ESTALE:
		return (NFS3ERR_STALE);
	case ENOSYS:
	case EOPNOTSUPP:
		return (NFS3ERR_NOTSUPP);
	case EREMOTE:
		return (NFS3ERR_REMOTE);
	case EOVERFLOW:
		return (NFS3ERR_INVAL);
	default:
		return ((enum nfsstat3)error);
	}
#endif
}

/*
 * Map an NFS Version 3 status to a local errno value.
 *
 * The DEBUG build lists every expected status explicitly and logs a
 * warning for anything else; the non-DEBUG build translates only a
 * short list of statuses and passes every other status through by cast.
 */
int
geterrno3(enum nfsstat3 status)
{

#ifdef DEBUG
	switch (status) {
	case NFS3_OK:
		return (0);
	case NFS3ERR_PERM:
		return (EPERM);
	case NFS3ERR_NOENT:
		return (ENOENT);
	case NFS3ERR_IO:
		return (EIO);
	case NFS3ERR_NXIO:
		return (ENXIO);
	case NFS3ERR_ACCES:
		return (EACCES);
	case NFS3ERR_EXIST:
		return (EEXIST);
	case NFS3ERR_XDEV:
		return (EXDEV);
	case NFS3ERR_NODEV:
		return (ENODEV);
	case NFS3ERR_NOTDIR:
		return (ENOTDIR);
	case NFS3ERR_ISDIR:
		return (EISDIR);
	case NFS3ERR_INVAL:
		return (EINVAL);
	case NFS3ERR_FBIG:
		return (EFBIG);
	case NFS3ERR_NOSPC:
		return (ENOSPC);
	case NFS3ERR_ROFS:
		return (EROFS);
	case NFS3ERR_MLINK:
		return (EMLINK);
	case NFS3ERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFS3ERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFS3ERR_DQUOT:
		return (EDQUOT);
	case NFS3ERR_STALE:
		return (ESTALE);
	case NFS3ERR_REMOTE:
		return (EREMOTE);
	case NFS3ERR_BADHANDLE:
		return (ESTALE);
	case NFS3ERR_NOT_SYNC:
		return (EINVAL);
	case NFS3ERR_BAD_COOKIE:
		return (ENOENT);
	case NFS3ERR_NOTSUPP:
		return (EOPNOTSUPP);
	case NFS3ERR_TOOSMALL:
		return (EINVAL);
	case NFS3ERR_SERVERFAULT:
		return (EIO);
	case NFS3ERR_BADTYPE:
		return (EINVAL);
	case NFS3ERR_JUKEBOX:
		return (ENXIO);
	default:
		/* Unexpected status: report it, then pass it through. */
		zcmn_err(getzoneid(), CE_WARN,
		    "geterrno3: got status %d", status);
		return ((int)status);
	}
#else
	switch (status) {
	case NFS3ERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFS3ERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFS3ERR_DQUOT:
		return (EDQUOT);
	case NFS3ERR_STALE:
	case NFS3ERR_BADHANDLE:
		return (ESTALE);
	case NFS3ERR_NOTSUPP:
		return (EOPNOTSUPP);
	case NFS3ERR_REMOTE:
		return (EREMOTE);
	case NFS3ERR_NOT_SYNC:
	case NFS3ERR_TOOSMALL:
	case NFS3ERR_BADTYPE:
		return (EINVAL);
	case NFS3ERR_BAD_COOKIE:
		return (ENOENT);
	case NFS3ERR_SERVERFAULT:
		return (EIO);
	case NFS3ERR_JUKEBOX:
		return (ENXIO);
	default:
		return ((int)status);
	}
#endif
}

/*
 * Allocate and initialize a readdir cache entry.  "flags" is passed
 * straight to kmem_alloc().  On success the entry has no buffer
 * (entries == NULL), has flags set to RDDIR and carries one reference
 * (count == 1).  Returns NULL if the allocation fails.
 */
rddir_cache *
rddir_cache_alloc(int flags)
{
	rddir_cache *rc;

	rc = kmem_alloc(sizeof (*rc), flags);
	if (rc != NULL) {
		rc->entries = NULL;
		rc->flags = RDDIR;
		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
		rc->count = 1;
#ifdef DEBUG
		/* DEBUG kernels keep a count of live entries. */
		atomic_inc_64(&clstat_debug.dirent.value.ui64);
#endif
	}
	return (rc);
}

/*
 * Destroy a readdir cache entry: free its entries buffer (if any),
 * tear down its condition variable and mutex, and free the entry.
 */
static void
rddir_cache_free(rddir_cache *rc)
{

#ifdef DEBUG
	atomic_dec_64(&clstat_debug.dirent.value.ui64);
#endif
	if (rc->entries != NULL) {
#ifdef DEBUG
		/* The DEBUG free routine also updates the byte counter. */
		rddir_cache_buf_free(rc->entries, rc->buflen);
#else
		kmem_free(rc->entries, rc->buflen);
#endif
	}
	cv_destroy(&rc->cv);
	mutex_destroy(&rc->lock);
	kmem_free(rc, sizeof (*rc));
}

/*
 * Take an additional reference on a readdir cache entry.
 */
void
rddir_cache_hold(rddir_cache *rc)
{

	mutex_enter(&rc->lock);
	rc->count++;
	mutex_exit(&rc->lock);
}

/*
 * Drop a reference on a readdir cache entry; the entry is freed when
 * the last reference goes away.
 */
void
rddir_cache_rele(rddir_cache *rc)
{

	mutex_enter(&rc->lock);
	ASSERT(rc->count > 0);
	if (--rc->count == 0) {
		mutex_exit(&rc->lock);
3859 rddir_cache_free(rc); 3860 } else 3861 mutex_exit(&rc->lock); 3862 } 3863 3864 #ifdef DEBUG 3865 char * 3866 rddir_cache_buf_alloc(size_t size, int flags) 3867 { 3868 char *rc; 3869 3870 rc = kmem_alloc(size, flags); 3871 if (rc != NULL) 3872 atomic_add_64(&clstat_debug.dirents.value.ui64, size); 3873 return (rc); 3874 } 3875 3876 void 3877 rddir_cache_buf_free(void *addr, size_t size) 3878 { 3879 3880 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size); 3881 kmem_free(addr, size); 3882 } 3883 #endif 3884 3885 static int 3886 nfs_free_data_reclaim(rnode_t *rp) 3887 { 3888 char *contents; 3889 int size; 3890 vsecattr_t *vsp; 3891 nfs3_pathconf_info *info; 3892 int freed; 3893 cred_t *cred; 3894 3895 /* 3896 * Free any held credentials and caches which 3897 * may be associated with this rnode. 3898 */ 3899 mutex_enter(&rp->r_statelock); 3900 cred = rp->r_cred; 3901 rp->r_cred = NULL; 3902 contents = rp->r_symlink.contents; 3903 size = rp->r_symlink.size; 3904 rp->r_symlink.contents = NULL; 3905 vsp = rp->r_secattr; 3906 rp->r_secattr = NULL; 3907 info = rp->r_pathconf; 3908 rp->r_pathconf = NULL; 3909 mutex_exit(&rp->r_statelock); 3910 3911 if (cred != NULL) 3912 crfree(cred); 3913 3914 /* 3915 * Free the access cache entries. 3916 */ 3917 freed = nfs_access_purge_rp(rp); 3918 3919 if (!HAVE_RDDIR_CACHE(rp) && 3920 contents == NULL && 3921 vsp == NULL && 3922 info == NULL) 3923 return (freed); 3924 3925 /* 3926 * Free the readdir cache entries 3927 */ 3928 if (HAVE_RDDIR_CACHE(rp)) 3929 nfs_purge_rddir_cache(RTOV(rp)); 3930 3931 /* 3932 * Free the symbolic link cache. 3933 */ 3934 if (contents != NULL) { 3935 3936 kmem_free((void *)contents, size); 3937 } 3938 3939 /* 3940 * Free any cached ACL. 3941 */ 3942 if (vsp != NULL) 3943 nfs_acl_free(vsp); 3944 3945 /* 3946 * Free any cached pathconf information. 
3947 */ 3948 if (info != NULL) 3949 kmem_free(info, sizeof (*info)); 3950 3951 return (1); 3952 } 3953 3954 static int 3955 nfs_active_data_reclaim(rnode_t *rp) 3956 { 3957 char *contents; 3958 int size; 3959 vsecattr_t *vsp; 3960 nfs3_pathconf_info *info; 3961 int freed; 3962 3963 /* 3964 * Free any held credentials and caches which 3965 * may be associated with this rnode. 3966 */ 3967 if (!mutex_tryenter(&rp->r_statelock)) 3968 return (0); 3969 contents = rp->r_symlink.contents; 3970 size = rp->r_symlink.size; 3971 rp->r_symlink.contents = NULL; 3972 vsp = rp->r_secattr; 3973 rp->r_secattr = NULL; 3974 info = rp->r_pathconf; 3975 rp->r_pathconf = NULL; 3976 mutex_exit(&rp->r_statelock); 3977 3978 /* 3979 * Free the access cache entries. 3980 */ 3981 freed = nfs_access_purge_rp(rp); 3982 3983 if (!HAVE_RDDIR_CACHE(rp) && 3984 contents == NULL && 3985 vsp == NULL && 3986 info == NULL) 3987 return (freed); 3988 3989 /* 3990 * Free the readdir cache entries 3991 */ 3992 if (HAVE_RDDIR_CACHE(rp)) 3993 nfs_purge_rddir_cache(RTOV(rp)); 3994 3995 /* 3996 * Free the symbolic link cache. 3997 */ 3998 if (contents != NULL) { 3999 4000 kmem_free((void *)contents, size); 4001 } 4002 4003 /* 4004 * Free any cached ACL. 4005 */ 4006 if (vsp != NULL) 4007 nfs_acl_free(vsp); 4008 4009 /* 4010 * Free any cached pathconf information. 
4011 */ 4012 if (info != NULL) 4013 kmem_free(info, sizeof (*info)); 4014 4015 return (1); 4016 } 4017 4018 static int 4019 nfs_free_reclaim(void) 4020 { 4021 int freed; 4022 rnode_t *rp; 4023 4024 #ifdef DEBUG 4025 clstat_debug.f_reclaim.value.ui64++; 4026 #endif 4027 freed = 0; 4028 mutex_enter(&rpfreelist_lock); 4029 rp = rpfreelist; 4030 if (rp != NULL) { 4031 do { 4032 if (nfs_free_data_reclaim(rp)) 4033 freed = 1; 4034 } while ((rp = rp->r_freef) != rpfreelist); 4035 } 4036 mutex_exit(&rpfreelist_lock); 4037 return (freed); 4038 } 4039 4040 static int 4041 nfs_active_reclaim(void) 4042 { 4043 int freed; 4044 int index; 4045 rnode_t *rp; 4046 4047 #ifdef DEBUG 4048 clstat_debug.a_reclaim.value.ui64++; 4049 #endif 4050 freed = 0; 4051 for (index = 0; index < rtablesize; index++) { 4052 rw_enter(&rtable[index].r_lock, RW_READER); 4053 for (rp = rtable[index].r_hashf; 4054 rp != (rnode_t *)(&rtable[index]); 4055 rp = rp->r_hashf) { 4056 if (nfs_active_data_reclaim(rp)) 4057 freed = 1; 4058 } 4059 rw_exit(&rtable[index].r_lock); 4060 } 4061 return (freed); 4062 } 4063 4064 static int 4065 nfs_rnode_reclaim(void) 4066 { 4067 int freed; 4068 rnode_t *rp; 4069 vnode_t *vp; 4070 4071 #ifdef DEBUG 4072 clstat_debug.r_reclaim.value.ui64++; 4073 #endif 4074 freed = 0; 4075 mutex_enter(&rpfreelist_lock); 4076 while ((rp = rpfreelist) != NULL) { 4077 rp_rmfree(rp); 4078 mutex_exit(&rpfreelist_lock); 4079 if (rp->r_flags & RHASHED) { 4080 vp = RTOV(rp); 4081 rw_enter(&rp->r_hashq->r_lock, RW_WRITER); 4082 mutex_enter(&vp->v_lock); 4083 if (vp->v_count > 1) { 4084 vp->v_count--; 4085 mutex_exit(&vp->v_lock); 4086 rw_exit(&rp->r_hashq->r_lock); 4087 mutex_enter(&rpfreelist_lock); 4088 continue; 4089 } 4090 mutex_exit(&vp->v_lock); 4091 rp_rmhash_locked(rp); 4092 rw_exit(&rp->r_hashq->r_lock); 4093 } 4094 /* 4095 * This call to rp_addfree will end up destroying the 4096 * rnode, but in a safe way with the appropriate set 4097 * of checks done. 
4098 */ 4099 rp_addfree(rp, CRED()); 4100 mutex_enter(&rpfreelist_lock); 4101 } 4102 mutex_exit(&rpfreelist_lock); 4103 return (freed); 4104 } 4105 4106 /*ARGSUSED*/ 4107 static void 4108 nfs_reclaim(void *cdrarg) 4109 { 4110 4111 #ifdef DEBUG 4112 clstat_debug.reclaim.value.ui64++; 4113 #endif 4114 if (nfs_free_reclaim()) 4115 return; 4116 4117 if (nfs_active_reclaim()) 4118 return; 4119 4120 (void) nfs_rnode_reclaim(); 4121 } 4122 4123 /* 4124 * NFS client failover support 4125 * 4126 * Routines to copy filehandles 4127 */ 4128 void 4129 nfscopyfh(caddr_t fhp, vnode_t *vp) 4130 { 4131 fhandle_t *dest = (fhandle_t *)fhp; 4132 4133 if (dest != NULL) 4134 *dest = *VTOFH(vp); 4135 } 4136 4137 void 4138 nfs3copyfh(caddr_t fhp, vnode_t *vp) 4139 { 4140 nfs_fh3 *dest = (nfs_fh3 *)fhp; 4141 4142 if (dest != NULL) 4143 *dest = *VTOFH3(vp); 4144 } 4145 4146 /* 4147 * NFS client failover support 4148 * 4149 * failover_safe() will test various conditions to ensure that 4150 * failover is permitted for this vnode. It will be denied 4151 * if: 4152 * 1) the operation in progress does not support failover (NULL fi) 4153 * 2) there are no available replicas (NULL mi_servers->sv_next) 4154 * 3) any locks are outstanding on this file 4155 */ 4156 static int 4157 failover_safe(failinfo_t *fi) 4158 { 4159 4160 /* 4161 * Does this op permit failover? 4162 */ 4163 if (fi == NULL || fi->vp == NULL) 4164 return (0); 4165 4166 /* 4167 * Are there any alternates to failover to? 
*/
	if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
		return (0);

	/*
	 * Disable check; we've forced local locking
	 *
	 * if (flk_has_remote_locks(fi->vp))
	 *	return (0);
	 */

	/*
	 * If we have no partial path, we can't do anything
	 */
	if (VTOR(fi->vp)->r_path == NULL)
		return (0);

	return (1);
}

#include <sys/thread.h>

/*
 * NFS client failover support
 *
 * failover_newserver() will start a search for a new server,
 * preferably by starting an async thread to do the work.  If
 * someone is already doing this (recognizable by MI_BINDINPROG
 * being set), it will simply return and the calling thread
 * will queue on the mi_failover_cv condition variable.
 */
static void
failover_newserver(mntinfo_t *mi)
{
	/*
	 * Check if someone else is doing this already
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI_BINDINPROG) {
		mutex_exit(&mi->mi_lock);
		return;
	}
	/* Claim the search while mi_lock is still held. */
	mi->mi_flags |= MI_BINDINPROG;

	/*
	 * Need to hold the vfs struct so that it can't be released
	 * while the failover thread is selecting a new server.
	 */
	VFS_HOLD(mi->mi_vfsp);

	/*
	 * Start a thread to do the real searching.
	 */
	(void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);

	mutex_exit(&mi->mi_lock);
}

/*
 * NFS client failover support
 *
 * failover_thread() will find a new server to replace the one
 * currently in use, wake up other threads waiting on this mount
 * point, and die.  It will start at the head of the server list
 * and poll servers until it finds one with an NFS server which is
 * registered and responds to a NULL procedure ping.
 *
 * XXX failover_thread is unsafe within the scope of the
 * present model defined for cpr to suspend the system.
 * Specifically, over-the-wire calls made by the thread
 * are unsafe. The thread needs to be reevaluated in case of
 * future updates to the cpr suspend model.
 */
static void
failover_thread(mntinfo_t *mi)
{
	servinfo_t *svp = NULL;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	callb_cpr_t cprinfo;
	rnode_t *rp;
	int index;
	char *srvnames;
	size_t srvnames_len;
	struct nfs_clnt *nfscl = NULL;
	zoneid_t zoneid = getzoneid();

#ifdef DEBUG
	/*
	 * This is currently only needed to access counters which exist on
	 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
	 * on non-DEBUG kernels.
	 */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif

	/*
	 * Its safe to piggyback on the mi_lock since failover_newserver()
	 * code guarantees that there will be only one failover thread
	 * per mountinfo at any instance.
	 */
	CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
	    "failover_thread");

	/* Wait until mi_readers has dropped to zero before searching. */
	mutex_enter(&mi->mi_lock);
	while (mi->mi_readers) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);

	/* Timeout handed to each NULL procedure ping below. */
	tv.tv_sec = 2;
	tv.tv_usec = 0;

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
			/*
			 * The current server is skipped only on the first
			 * pass; later passes try it again as well.
			 */
			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			/*
			 * If the file system was forcibly umounted
			 * while trying to do a failover, then just
			 * give up on the failover.  It won't matter
			 * what the server is.
			 */
			if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
				svp = NULL;
				goto done;
			}

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				if (svp == mi->mi_curr_serv) {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
			"NFS%d: failing over: selecting original server %s",
					    mi->mi_vers, svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
			"NFS: failing over: selecting original server %s",
					    svp->sv_hostname);
#endif
				} else {
#ifdef DEBUG
					zcmn_err(zoneid, CE_NOTE,
					    "NFS%d: failing over from %s to %s",
					    mi->mi_vers,
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#else
					zcmn_err(zoneid, CE_NOTE,
					    "NFS: failing over from %s to %s",
					    mi->mi_curr_serv->sv_hostname,
					    svp->sv_hostname);
#endif
				}
				break;
			}
		}

		if (svp == NULL) {
			/*
			 * Nobody answered.  Complain once (srvnames is
			 * freed at "done"), then sleep and try again.
			 */
			if (!oncethru) {
				srvnames = nfs_getsrvnames(mi, &srvnames_len);
#ifdef DEBUG
				zprintf(zoneid,
				    "NFS%d servers %s not responding "
				    "still trying\n", mi->mi_vers, srvnames);
#else
				zprintf(zoneid, "NFS servers %s not responding "
				    "still trying\n", srvnames);
#endif
				oncethru = 1;
			}
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			mutex_exit(&mi->mi_lock);
			delay(hz);
			mutex_enter(&mi->mi_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
			mutex_exit(&mi->mi_lock);
		}
	}

	if (oncethru) {
#ifdef DEBUG
		zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
#else
		zprintf(zoneid, "NFS servers %s ok\n", srvnames);
#endif
	}

	if (svp != mi->mi_curr_serv) {
		/*
		 * A different server was chosen: if an rnode exists for
		 * the old server's sv_fhandle, rebind it to the new
		 * server's sv_fhandle and rehash it.
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
		rw_enter(&rtable[index].r_lock, RW_WRITER);
		rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
		    mi->mi_vfsp);
		if (rp != NULL) {
			if (rp->r_flags & RHASHED)
				rp_rmhash_locked(rp);
			rw_exit(&rtable[index].r_lock);
			rp->r_server = svp;
			rp->r_fh = svp->sv_fhandle;
			(void) nfs_free_data_reclaim(rp);
			index = rtablehash(&rp->r_fh);
			rp->r_hashq = &rtable[index];
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			vn_exists(RTOV(rp));
			rp_addhash(rp);
			rw_exit(&rp->r_hashq->r_lock);
			VN_RELE(RTOV(rp));
		} else
			rw_exit(&rtable[index].r_lock);
	}

done:
	if (oncethru)
		kmem_free(srvnames, srvnames_len);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags &= ~MI_BINDINPROG;
	/* svp is NULL here only when the search was abandoned. */
	if (svp != NULL) {
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
#ifdef DEBUG
		nfscl->nfscl_stat.failover.value.ui64++;
#endif
	}
	/* Wake the threads sleeping in failover_wait(). */
	cv_broadcast(&mi->mi_failover_cv);
	CALLB_CPR_EXIT(&cprinfo);
	/* Drop the hold taken by failover_newserver(). */
	VFS_RELE(mi->mi_vfsp);
	zthread_exit();
	/* NOTREACHED */
}

/*
 * NFS client failover support
 *
 * failover_wait() will put the thread to sleep until MI_BINDINPROG
 * is cleared, meaning that failover is complete.  Called with
 * mi_lock mutex held.
 *
 * Returns 0 once MI_BINDINPROG is clear, or EINTR if the wait was
 * interrupted by a signal.
 */
static int
failover_wait(mntinfo_t *mi)
{
	k_sigset_t smask;

	/*
	 * If someone else is hunting for a living server,
	 * sleep until it's done.  After our sleep, we may
	 * be bound to the right server and get off cheaply.
	 */
	while (mi->mi_flags & MI_BINDINPROG) {
		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI_INT);
		if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
			/*
			 * restore original signal mask
			 */
			sigunintr(&smask);
			return (EINTR);
		}
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);
	}
	return (0);
}

/*
 * NFS client failover support
 *
 * failover_remap() will do a partial pathname lookup and find the
 * desired vnode on the current server.  The interim vnode will be
 * discarded after we pilfer the new filehandle.
 *
 * Side effects:
 * - This routine will also update the filehandle in the args structure
 *    pointed to by the fi->fhp pointer if it is non-NULL.
 *
 * Returns 0 on success, EINVAL if fi is unusable or the file found on
 * the new server does not match the old one in size or type, or the
 * error from VFS_ROOT()/failover_lookup().
 */

static int
failover_remap(failinfo_t *fi)
{
	vnode_t *vp, *nvp, *rootvp;
	rnode_t *rp, *nrp;
	mntinfo_t *mi;
	int error;
#ifdef DEBUG
	struct nfs_clnt *nfscl;

	/* Only needed for the DEBUG-only remap counter bumped below. */
	nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);
#endif
	/*
	 * Sanity check
	 */
	if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
		return (EINVAL);
	vp = fi->vp;
	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (!(vp->v_flag & VROOT)) {
		/*
		 * Given the root fh, use the path stored in
		 * the rnode to find the fh for the new server.
		 */
		error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (error)
			return (error);

		error = failover_lookup(rp->r_path, rootvp,
		    fi->lookupproc, fi->xattrdirproc, &nvp);

		VN_RELE(rootvp);

		if (error)
			return (error);

		/*
		 * If we found the same rnode, we're done now
		 */
		if (nvp == vp) {
			/*
			 * Failed and the new server may physically be same
			 * OR may share a same disk subsystem. In this case
			 * file handle for a particular file path is not going
			 * to change, given the same filehandle lookup will
			 * always locate the same rnode as the existing one.
			 * All we might need to do is to update the r_server
			 * with the current servinfo.
			 */
			if (!VALID_FH(fi)) {
				rp->r_server = mi->mi_curr_serv;
			}
			VN_RELE(nvp);
			return (0);
		}

		/*
		 * Try to make it so that no one else will find this
		 * vnode because it is just a temporary to hold the
		 * new file handle until that file handle can be
		 * copied to the original vnode/rnode.
		 */
		nrp = VTOR(nvp);
		mutex_enter(&mi->mi_remap_lock);
		/*
		 * Some other thread could have raced in here and could
		 * have done the remap for this particular rnode before
		 * this thread here.  Check for rp->r_server and
		 * mi->mi_curr_serv and return if they are same.
		 */
		if (VALID_FH(fi)) {
			mutex_exit(&mi->mi_remap_lock);
			VN_RELE(nvp);
			return (0);
		}

		if (nrp->r_flags & RHASHED)
			rp_rmhash(nrp);

		/*
		 * As a heuristic check on the validity of the new
		 * file, check that the size and type match against
		 * that we remember from the old version.
		 */
		if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
			mutex_exit(&mi->mi_remap_lock);
			zcmn_err(mi->mi_zone->zone_id, CE_WARN,
			    "NFS replicas %s and %s: file %s not same.",
			    rp->r_server->sv_hostname,
			    nrp->r_server->sv_hostname, rp->r_path);
			VN_RELE(nvp);
			return (EINVAL);
		}

		/*
		 * snarf the filehandle from the new rnode
		 * then release it, again while updating the
		 * hash queues for the rnode.
		 */
		if (rp->r_flags & RHASHED)
			rp_rmhash(rp);
		rp->r_server = mi->mi_curr_serv;
		rp->r_fh = nrp->r_fh;
		rp->r_hashq = nrp->r_hashq;
		/*
		 * Copy the attributes from the new rnode to the old
		 * rnode.  This will help to reduce unnecessary page
		 * cache flushes.
		 */
		rp->r_attr = nrp->r_attr;
		rp->r_attrtime = nrp->r_attrtime;
		rp->r_mtime = nrp->r_mtime;
		/* Drop the data cached for the old server's file. */
		(void) nfs_free_data_reclaim(rp);
		nfs_setswaplike(vp, &rp->r_attr);
		/* Put the original rnode back on the hash queue. */
		rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
		rp_addhash(rp);
		rw_exit(&rp->r_hashq->r_lock);
		mutex_exit(&mi->mi_remap_lock);
		VN_RELE(nvp);
	}

	/*
	 * Update successful failover remap count
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_remap++;
	mutex_exit(&mi->mi_lock);
#ifdef DEBUG
	nfscl->nfscl_stat.remap.value.ui64++;
#endif

	/*
	 * If we have a copied filehandle to update, do it now.
	 */
	if (fi->fhp != NULL && fi->copyproc != NULL)
		(*fi->copyproc)(fi->fhp, vp);

	return (0);
}

/*
 * NFS client failover support
 *
 * We want a simple pathname lookup routine to parse the pieces
 * of path in rp->r_path.  We know that the path was a created
 * as rnodes were made, so we know we have only to deal with
 * paths that look like:
 *	dir1/dir2/dir3/file
 * Any evidence of anything like .., symlinks, and ENOTDIR
 * are hard errors, because they mean something in this filesystem
 * is different from the one we came from, or has changed under
 * us in some way.  If this is true, we want the failure.
 *
 * Extended attributes: if the filesystem is mounted with extended
 * attributes enabled (-o xattr), the attribute directory will be
 * represented in the r_path as the magic name XATTR_RPATH.  So if
 * we see that name in the pathname, it must be because this node
 * is an extended attribute.  Therefore, look it up that way.
4637 */ 4638 static int 4639 failover_lookup(char *path, vnode_t *root, 4640 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int, 4641 vnode_t *, cred_t *, int), 4642 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int), 4643 vnode_t **new) 4644 { 4645 vnode_t *dvp, *nvp; 4646 int error = EINVAL; 4647 char *s, *p, *tmppath; 4648 size_t len; 4649 mntinfo_t *mi; 4650 bool_t xattr; 4651 4652 /* Make local copy of path */ 4653 len = strlen(path) + 1; 4654 tmppath = kmem_alloc(len, KM_SLEEP); 4655 (void) strcpy(tmppath, path); 4656 s = tmppath; 4657 4658 dvp = root; 4659 VN_HOLD(dvp); 4660 mi = VTOMI(root); 4661 xattr = mi->mi_flags & MI_EXTATTR; 4662 4663 do { 4664 p = strchr(s, '/'); 4665 if (p != NULL) 4666 *p = '\0'; 4667 if (xattr && strcmp(s, XATTR_RPATH) == 0) { 4668 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(), 4669 RFSCALL_SOFT); 4670 } else { 4671 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL, 4672 CRED(), RFSCALL_SOFT); 4673 } 4674 if (p != NULL) 4675 *p++ = '/'; 4676 if (error) { 4677 VN_RELE(dvp); 4678 kmem_free(tmppath, len); 4679 return (error); 4680 } 4681 s = p; 4682 VN_RELE(dvp); 4683 dvp = nvp; 4684 } while (p != NULL); 4685 4686 if (nvp != NULL && new != NULL) 4687 *new = nvp; 4688 kmem_free(tmppath, len); 4689 return (0); 4690 } 4691 4692 /* 4693 * NFS client failover support 4694 * 4695 * sv_free() frees the malloc'd portion of a "servinfo_t". 
4696 */ 4697 void 4698 sv_free(servinfo_t *svp) 4699 { 4700 servinfo_t *next; 4701 struct knetconfig *knconf; 4702 4703 while (svp != NULL) { 4704 next = svp->sv_next; 4705 if (svp->sv_secdata) 4706 sec_clnt_freeinfo(svp->sv_secdata); 4707 if (svp->sv_hostname && svp->sv_hostnamelen > 0) 4708 kmem_free(svp->sv_hostname, svp->sv_hostnamelen); 4709 knconf = svp->sv_knconf; 4710 if (knconf != NULL) { 4711 if (knconf->knc_protofmly != NULL) 4712 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4713 if (knconf->knc_proto != NULL) 4714 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4715 kmem_free(knconf, sizeof (*knconf)); 4716 } 4717 knconf = svp->sv_origknconf; 4718 if (knconf != NULL) { 4719 if (knconf->knc_protofmly != NULL) 4720 kmem_free(knconf->knc_protofmly, KNC_STRSIZE); 4721 if (knconf->knc_proto != NULL) 4722 kmem_free(knconf->knc_proto, KNC_STRSIZE); 4723 kmem_free(knconf, sizeof (*knconf)); 4724 } 4725 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0) 4726 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen); 4727 mutex_destroy(&svp->sv_lock); 4728 kmem_free(svp, sizeof (*svp)); 4729 svp = next; 4730 } 4731 } 4732 4733 /* 4734 * Only can return non-zero if intr != 0. 4735 */ 4736 int 4737 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr) 4738 { 4739 4740 mutex_enter(&l->lock); 4741 4742 /* 4743 * If this is a nested enter, then allow it. There 4744 * must be as many exits as enters through. 4745 */ 4746 if (l->owner == curthread) { 4747 /* lock is held for writing by current thread */ 4748 ASSERT(rw == RW_READER || rw == RW_WRITER); 4749 l->count--; 4750 } else if (rw == RW_READER) { 4751 /* 4752 * While there is a writer active or writers waiting, 4753 * then wait for them to finish up and move on. Then, 4754 * increment the count to indicate that a reader is 4755 * active. 
4756 */ 4757 while (l->count < 0 || l->waiters > 0) { 4758 if (intr) { 4759 klwp_t *lwp = ttolwp(curthread); 4760 4761 if (lwp != NULL) 4762 lwp->lwp_nostop++; 4763 if (cv_wait_sig(&l->cv, &l->lock) == 0) { 4764 if (lwp != NULL) 4765 lwp->lwp_nostop--; 4766 mutex_exit(&l->lock); 4767 return (EINTR); 4768 } 4769 if (lwp != NULL) 4770 lwp->lwp_nostop--; 4771 } else 4772 cv_wait(&l->cv, &l->lock); 4773 4774 /* 4775 * If there are no readers active nor a writer active 4776 * we need to wake up the next waiter. If there is a 4777 * writer waiting we will wait again so we need to wake 4778 * up the next waiter (possible writer). If there is 4779 * no writer waiting we need to wake up the next 4780 * waiting reader (if any) so it is invited to the 4781 * party. 4782 */ 4783 if (l->count == 0) 4784 cv_signal(&l->cv); 4785 4786 /* 4787 * If there are readers active and no writers waiting 4788 * then wake up the next waiting reader (if any). 4789 */ 4790 if (l->count > 0 && l->waiters == 0) 4791 cv_signal(&l->cv); 4792 } 4793 ASSERT(l->count < INT_MAX); 4794 #ifdef DEBUG 4795 if ((l->count % 10000) == 9999) 4796 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on" 4797 "rwlock @ %p\n", l->count, (void *)&l); 4798 #endif 4799 l->count++; 4800 } else { 4801 ASSERT(rw == RW_WRITER); 4802 /* 4803 * While there are readers active or a writer 4804 * active, then wait for all of the readers 4805 * to finish or for the writer to finish. 4806 * Then, set the owner field to curthread and 4807 * decrement count to indicate that a writer 4808 * is active. 4809 */ 4810 while (l->count != 0) { 4811 l->waiters++; 4812 if (intr) { 4813 klwp_t *lwp = ttolwp(curthread); 4814 4815 if (lwp != NULL) 4816 lwp->lwp_nostop++; 4817 if (cv_wait_sig(&l->cv, &l->lock) == 0) { 4818 if (lwp != NULL) 4819 lwp->lwp_nostop--; 4820 l->waiters--; 4821 /* 4822 * If there are readers active and no 4823 * writers waiting then wake up the 4824 * next waiting reader (if any). 
4825 */ 4826 if (l->count > 0 && l->waiters == 0) 4827 cv_signal(&l->cv); 4828 mutex_exit(&l->lock); 4829 return (EINTR); 4830 } 4831 if (lwp != NULL) 4832 lwp->lwp_nostop--; 4833 } else 4834 cv_wait(&l->cv, &l->lock); 4835 l->waiters--; 4836 } 4837 ASSERT(l->owner == NULL); 4838 l->owner = curthread; 4839 l->count--; 4840 } 4841 4842 mutex_exit(&l->lock); 4843 4844 return (0); 4845 } 4846 4847 /* 4848 * If the lock is available, obtain it and return non-zero. If there is 4849 * already a conflicting lock, return 0 immediately. 4850 */ 4851 4852 int 4853 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw) 4854 { 4855 mutex_enter(&l->lock); 4856 4857 /* 4858 * If this is a nested enter, then allow it. There 4859 * must be as many exits as enters through. 4860 */ 4861 if (l->owner == curthread) { 4862 /* lock is held for writing by current thread */ 4863 ASSERT(rw == RW_READER || rw == RW_WRITER); 4864 l->count--; 4865 } else if (rw == RW_READER) { 4866 /* 4867 * If there is a writer active or writers waiting, deny the 4868 * lock. Otherwise, bump the count of readers. 4869 */ 4870 if (l->count < 0 || l->waiters > 0) { 4871 mutex_exit(&l->lock); 4872 return (0); 4873 } 4874 l->count++; 4875 } else { 4876 ASSERT(rw == RW_WRITER); 4877 /* 4878 * If there are readers active or a writer active, deny the 4879 * lock. Otherwise, set the owner field to curthread and 4880 * decrement count to indicate that a writer is active. 4881 */ 4882 if (l->count != 0) { 4883 mutex_exit(&l->lock); 4884 return (0); 4885 } 4886 ASSERT(l->owner == NULL); 4887 l->owner = curthread; 4888 l->count--; 4889 } 4890 4891 mutex_exit(&l->lock); 4892 4893 return (1); 4894 } 4895 4896 void 4897 nfs_rw_exit(nfs_rwlock_t *l) 4898 { 4899 4900 mutex_enter(&l->lock); 4901 /* 4902 * If this is releasing a writer lock, then increment count to 4903 * indicate that there is one less writer active. 
If this was 4904 * the last of possibly nested writer locks, then clear the owner 4905 * field as well to indicate that there is no writer active 4906 * and wakeup the first waiting writer or reader. 4907 * 4908 * If releasing a reader lock, then just decrement count to 4909 * indicate that there is one less reader active. If this was 4910 * the last active reader and there are writer(s) waiting, 4911 * then wake up the first. 4912 */ 4913 if (l->owner != NULL) { 4914 ASSERT(l->owner == curthread); 4915 l->count++; 4916 if (l->count == 0) { 4917 l->owner = NULL; 4918 cv_signal(&l->cv); 4919 } 4920 } else { 4921 ASSERT(l->count > 0); 4922 l->count--; 4923 if (l->count == 0 && l->waiters > 0) 4924 cv_signal(&l->cv); 4925 } 4926 mutex_exit(&l->lock); 4927 } 4928 4929 int 4930 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw) 4931 { 4932 4933 if (rw == RW_READER) 4934 return (l->count > 0); 4935 ASSERT(rw == RW_WRITER); 4936 return (l->count < 0); 4937 } 4938 4939 /* ARGSUSED */ 4940 void 4941 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg) 4942 { 4943 4944 l->count = 0; 4945 l->waiters = 0; 4946 l->owner = NULL; 4947 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL); 4948 cv_init(&l->cv, NULL, CV_DEFAULT, NULL); 4949 } 4950 4951 void 4952 nfs_rw_destroy(nfs_rwlock_t *l) 4953 { 4954 4955 mutex_destroy(&l->lock); 4956 cv_destroy(&l->cv); 4957 } 4958 4959 int 4960 nfs3_rddir_compar(const void *x, const void *y) 4961 { 4962 rddir_cache *a = (rddir_cache *)x; 4963 rddir_cache *b = (rddir_cache *)y; 4964 4965 if (a->nfs3_cookie == b->nfs3_cookie) { 4966 if (a->buflen == b->buflen) 4967 return (0); 4968 if (a->buflen < b->buflen) 4969 return (-1); 4970 return (1); 4971 } 4972 4973 if (a->nfs3_cookie < b->nfs3_cookie) 4974 return (-1); 4975 4976 return (1); 4977 } 4978 4979 int 4980 nfs_rddir_compar(const void *x, const void *y) 4981 { 4982 rddir_cache *a = (rddir_cache *)x; 4983 rddir_cache *b = (rddir_cache *)y; 4984 4985 if (a->nfs_cookie == b->nfs_cookie) { 
4986 if (a->buflen == b->buflen) 4987 return (0); 4988 if (a->buflen < b->buflen) 4989 return (-1); 4990 return (1); 4991 } 4992 4993 if (a->nfs_cookie < b->nfs_cookie) 4994 return (-1); 4995 4996 return (1); 4997 } 4998 4999 static char * 5000 nfs_getsrvnames(mntinfo_t *mi, size_t *len) 5001 { 5002 servinfo_t *s; 5003 char *srvnames; 5004 char *namep; 5005 size_t length; 5006 5007 /* 5008 * Calculate the length of the string required to hold all 5009 * of the server names plus either a comma or a null 5010 * character following each individual one. 5011 */ 5012 length = 0; 5013 for (s = mi->mi_servers; s != NULL; s = s->sv_next) 5014 length += s->sv_hostnamelen; 5015 5016 srvnames = kmem_alloc(length, KM_SLEEP); 5017 5018 namep = srvnames; 5019 for (s = mi->mi_servers; s != NULL; s = s->sv_next) { 5020 (void) strcpy(namep, s->sv_hostname); 5021 namep += s->sv_hostnamelen - 1; 5022 *namep++ = ','; 5023 } 5024 *--namep = '\0'; 5025 5026 *len = length; 5027 5028 return (srvnames); 5029 } 5030 5031 /* 5032 * These two functions are temporary and designed for the upgrade-workaround 5033 * only. They cannot be used for general zone-crossing NFS client support, and 5034 * will be removed shortly. 5035 * 5036 * When the workaround is enabled, all NFS traffic is forced into the global 5037 * zone. These functions are called when the code needs to refer to the state 5038 * of the underlying network connection. They're not called when the function 5039 * needs to refer to the state of the process that invoked the system call. 5040 * (E.g., when checking whether the zone is shutting down during the mount() 5041 * call.) 5042 */ 5043 5044 struct zone * 5045 nfs_zone(void) 5046 { 5047 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone); 5048 } 5049 5050 zoneid_t 5051 nfs_zoneid(void) 5052 { 5053 return (nfs_global_client_only != 0 ? 
GLOBAL_ZONEID : getzoneid()); 5054 } 5055 5056 /* 5057 * nfs_mount_label_policy: 5058 * Determine whether the mount is allowed according to MAC check, 5059 * by comparing (where appropriate) label of the remote server 5060 * against the label of the zone being mounted into. 5061 * 5062 * Returns: 5063 * 0 : access allowed 5064 * -1 : read-only access allowed (i.e., read-down) 5065 * >0 : error code, such as EACCES 5066 */ 5067 int 5068 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, 5069 struct knetconfig *knconf, cred_t *cr) 5070 { 5071 int addr_type; 5072 void *ipaddr; 5073 bslabel_t *server_sl, *mntlabel; 5074 zone_t *mntzone = NULL; 5075 ts_label_t *zlabel; 5076 tsol_tpc_t *tp; 5077 ts_label_t *tsl = NULL; 5078 int retv; 5079 5080 /* 5081 * Get the zone's label. Each zone on a labeled system has a label. 5082 */ 5083 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 5084 zlabel = mntzone->zone_slabel; 5085 ASSERT(zlabel != NULL); 5086 label_hold(zlabel); 5087 5088 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) { 5089 addr_type = IPV4_VERSION; 5090 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr; 5091 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) { 5092 addr_type = IPV6_VERSION; 5093 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr; 5094 } else { 5095 retv = 0; 5096 goto out; 5097 } 5098 5099 retv = EACCES; /* assume the worst */ 5100 5101 /* 5102 * Next, get the assigned label of the remote server. 
5103 */ 5104 tp = find_tpc(ipaddr, addr_type, B_FALSE); 5105 if (tp == NULL) 5106 goto out; /* error getting host entry */ 5107 5108 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi) 5109 goto rel_tpc; /* invalid domain */ 5110 if ((tp->tpc_tp.host_type != SUN_CIPSO) && 5111 (tp->tpc_tp.host_type != UNLABELED)) 5112 goto rel_tpc; /* invalid hosttype */ 5113 5114 if (tp->tpc_tp.host_type == SUN_CIPSO) { 5115 tsl = getflabel_cipso(vfsp); 5116 if (tsl == NULL) 5117 goto rel_tpc; /* error getting server lbl */ 5118 5119 server_sl = label2bslabel(tsl); 5120 } else { /* UNLABELED */ 5121 server_sl = &tp->tpc_tp.tp_def_label; 5122 } 5123 5124 mntlabel = label2bslabel(zlabel); 5125 5126 /* 5127 * Now compare labels to complete the MAC check. If the labels 5128 * are equal or if the requestor is in the global zone and has 5129 * NET_MAC_AWARE, then allow read-write access. (Except for 5130 * mounts into the global zone itself; restrict these to 5131 * read-only.) 5132 * 5133 * If the requestor is in some other zone, but their label 5134 * dominates the server, then allow read-down. 5135 * 5136 * Otherwise, access is denied. 
5137 */ 5138 if (blequal(mntlabel, server_sl) || 5139 (crgetzoneid(cr) == GLOBAL_ZONEID && 5140 getpflags(NET_MAC_AWARE, cr) != 0)) { 5141 if ((mntzone == global_zone) || 5142 !blequal(mntlabel, server_sl)) 5143 retv = -1; /* read-only */ 5144 else 5145 retv = 0; /* access OK */ 5146 } else if (bldominates(mntlabel, server_sl)) { 5147 retv = -1; /* read-only */ 5148 } else { 5149 retv = EACCES; 5150 } 5151 5152 if (tsl != NULL) 5153 label_rele(tsl); 5154 5155 rel_tpc: 5156 TPC_RELE(tp); 5157 out: 5158 if (mntzone) 5159 zone_rele(mntzone); 5160 label_rele(zlabel); 5161 return (retv); 5162 } 5163 5164 boolean_t 5165 nfs_has_ctty(void) 5166 { 5167 boolean_t rv; 5168 mutex_enter(&curproc->p_splock); 5169 rv = (curproc->p_sessp->s_vp != NULL); 5170 mutex_exit(&curproc->p_splock); 5171 return (rv); 5172 } 5173 5174 /* 5175 * See if xattr directory to see if it has any generic user attributes 5176 */ 5177 int 5178 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr) 5179 { 5180 struct uio uio; 5181 struct iovec iov; 5182 char *dbuf; 5183 struct dirent64 *dp; 5184 size_t dlen = 8 * 1024; 5185 size_t dbuflen; 5186 int eof = 0; 5187 int error; 5188 5189 *valp = 0; 5190 dbuf = kmem_alloc(dlen, KM_SLEEP); 5191 uio.uio_iov = &iov; 5192 uio.uio_iovcnt = 1; 5193 uio.uio_segflg = UIO_SYSSPACE; 5194 uio.uio_fmode = 0; 5195 uio.uio_extflg = UIO_COPY_CACHED; 5196 uio.uio_loffset = 0; 5197 uio.uio_resid = dlen; 5198 iov.iov_base = dbuf; 5199 iov.iov_len = dlen; 5200 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); 5201 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0); 5202 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); 5203 5204 dbuflen = dlen - uio.uio_resid; 5205 5206 if (error || dbuflen == 0) { 5207 kmem_free(dbuf, dlen); 5208 return (error); 5209 } 5210 5211 dp = (dirent64_t *)dbuf; 5212 5213 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) { 5214 if (strcmp(dp->d_name, ".") == 0 || 5215 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name, 5216 VIEW_READWRITE) == 0 || 
strcmp(dp->d_name, 5217 VIEW_READONLY) == 0) { 5218 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen); 5219 continue; 5220 } 5221 5222 *valp = 1; 5223 break; 5224 } 5225 kmem_free(dbuf, dlen); 5226 return (0); 5227 } 5228